In [1]:
import pandas as pd
df = pd.read_csv("spam.csv", encoding="latin-1")  # Try this if you face encoding issues
print(df.head())


     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [3]:
import pandas as pd

# Load data
df = pd.read_csv("spam.csv", encoding="latin-1")

# Check the first few rows
df.head()


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
# Keep only relevant columns (assuming 'v1' is the label and 'v2' is the message)
df = df[['v1', 'v2']]
df.columns = ['label', 'message']  # Rename columns

# Convert labels to binary values: 'ham' -> 0, 'spam' -> 1
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Check dataset info
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   int64 
 1   message  5572 non-null   object
dtypes: int64(1), object(1)
memory usage: 87.2+ KB


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Convert text into numerical features using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

X = vectorizer.fit_transform(df['message'])
y = df['label']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Train Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Train XGBoost
xgb = XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric="logloss", random_state=42)
xgb.fit(X_train, y_train)

# Get predictions
rf_preds = rf.predict_proba(X_test)[:, 1]
xgb_preds = xgb.predict_proba(X_test)[:, 1]

# Blend predictions (averaging method)
final_preds = (rf_preds + xgb_preds) / 2
final_preds = [1 if pred > 0.5 else 0 for pred in final_preds]

# Evaluate model
accuracy = accuracy_score(y_test, final_preds)
print(f"Blended Model Accuracy: {accuracy:.4f}")


Parameters: { "use_label_encoder" } are not used.



Blended Model Accuracy: 0.9785


In [11]:
import pickle

# Save models
pickle.dump(rf, open("random_forest.pkl", "wb"))
pickle.dump(xgb, open("xgboost.pkl", "wb"))
pickle.dump(vectorizer, open("vectorizer.pkl", "wb"))


In [13]:
# Load models
rf = pickle.load(open("random_forest.pkl", "rb"))
xgb = pickle.load(open("xgboost.pkl", "rb"))
vectorizer = pickle.load(open("vectorizer.pkl", "rb"))

# Predict on a new message
new_message = ["Congratulations! You've won a free iPhone!"]
new_message_tfidf = vectorizer.transform(new_message)

rf_pred = rf.predict_proba(new_message_tfidf)[:, 1]
xgb_pred = xgb.predict_proba(new_message_tfidf)[:, 1]

final_pred = (rf_pred + xgb_pred) / 2
final_label = "Spam" if final_pred > 0.5 else "Ham"

print("Prediction:", final_label)


Prediction: Ham
