In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the labelled message dataset from the CSV file into a DataFrame.
df = pd.read_csv('/content/sample_data/spam.csv')

# Last expression of the cell, so the notebook renders the frame.
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# --- Data preparation ---------------------------------------------------
# Features are the raw message texts; labels are the 'Category' values.
X, y = df['Message'], df['Category']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Feature extraction -------------------------------------------------
# TF-IDF turns each message into a numeric vector that weights a word by how
# important it is to that message relative to the whole corpus.
# The vectorizer is fitted on the training split only and then reused,
# unchanged, to transform the test split.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# --- Model training -----------------------------------------------------
# fit() returns the estimator itself, so construction and training chain.
classifier = MultinomialNB().fit(X_train_tfidf, y_train)

# --- Model evaluation ---------------------------------------------------
y_pred = classifier.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# --- Label every message in the dataset ----------------------------------
# NOTE: this covers the rows the model was trained on as well, so the
# resulting column is not a held-out estimate of model quality.
all_messages = df['Message']
all_messages_tfidf = vectorizer.transform(all_messages)
predictions = classifier.predict(all_messages_tfidf)

# Store the predicted label next to the true one (adds a column to df).
df['Prediction'] = predictions

# Last expression of the cell: display the updated frame.
df

Accuracy: 0.9650224215246637


Unnamed: 0,Category,Message,Prediction
0,ham,"Go until jurong point, crazy.. Available only ...",ham
1,ham,Ok lar... Joking wif u oni...,ham
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,ham,U dun say so early hor... U c already then say...,ham
4,ham,"Nah I don't think he goes to usf, he lives aro...",ham
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,spam
5568,ham,Will ü b going to esplanade fr home?,ham
5569,ham,"Pity, * was in mood for that. So...any other s...",ham
5570,ham,The guy did some bitching but I acted like i'd...,ham


In [10]:
from sklearn.linear_model import LogisticRegression
# Train a logistic-regression classifier on the TF-IDF training features and
# score it on the held-out test split.
# The model is built, fitted and used for prediction exactly once; an earlier
# version of this cell repeated those three steps back to back, doubling the
# training time without changing the result.
# NOTE: the variable name `lr_classifer` (sic) is kept as-is because other
# cells may refer to it.
lr_classifer = LogisticRegression()
lr_classifer.fit(X_train_tfidf, y_train)
lr_predictions = lr_classifer.predict(X_test_tfidf)

lr_accuracy = accuracy_score(y_test, lr_predictions)
print("Logistic Regression Accuracy:", lr_accuracy)

# Side-by-side view of predicted vs. actual labels; the frame takes its index
# from y_test, so each row keeps the original row label of the message.
pred = pd.DataFrame({'predictions': lr_predictions,
                     'actual': y_test})
pred

Logistic Regression Accuracy: 0.9748878923766816


Unnamed: 0,predictions,actual
3245,ham,ham
944,ham,ham
1044,ham,ham
2484,ham,ham
812,ham,ham
...,...,...
4264,ham,ham
2439,ham,ham
5556,ham,ham
4205,ham,ham


In [12]:
from sklearn.neighbors import KNeighborsClassifier
# Train a k-nearest-neighbours classifier (default settings) on the TF-IDF
# training features and score it on the held-out test split.
knn_classifier = KNeighborsClassifier()
knn_classifier.fit(X_train_tfidf, y_train)
knn_predictions = knn_classifier.predict(X_test_tfidf)

knn_accuracy = accuracy_score(y_test, knn_predictions)
print("KNN Accuracy:", knn_accuracy)

# Side-by-side view of predicted vs. actual labels for THIS model.
# The comparison must be built from knn_predictions; using the logistic
# regression predictions here would display the wrong model's output.
pred = pd.DataFrame({'predictions': knn_predictions,
                     'actual': y_test})
pred


KNN Accuracy: 0.9192825112107623


Unnamed: 0,predictions,actual
3245,ham,ham
944,ham,ham
1044,ham,ham
2484,ham,ham
812,ham,ham
...,...,...
4264,ham,ham
2439,ham,ham
5556,ham,ham
4205,ham,ham


In [14]:
from sklearn.ensemble import RandomForestClassifier
# Train a random-forest classifier on the TF-IDF training features and score
# it on the held-out test split.
# A random forest is stochastic, so the seed is pinned (same value as the
# train/test split) to make the reported accuracy reproducible across runs.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_tfidf, y_train)
rf_predictions = rf_classifier.predict(X_test_tfidf)

rf_accuracy = accuracy_score(y_test, rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)

# Side-by-side view of predicted vs. actual labels for THIS model.
# The comparison must be built from rf_predictions; using the logistic
# regression predictions here would display the wrong model's output.
pred = pd.DataFrame({'predictions': rf_predictions,
                     'actual': y_test})
pred

Random Forest Accuracy: 0.979372197309417


Unnamed: 0,predictions,actual
3245,ham,ham
944,ham,ham
1044,ham,ham
2484,ham,ham
812,ham,ham
...,...,...
4264,ham,ham
2439,ham,ham
5556,ham,ham
4205,ham,ham


In [15]:
# AdaBoostClassifier is not imported by any other visible cell, so without
# this import the cell fails with NameError after a kernel restart.
from sklearn.ensemble import AdaBoostClassifier

# Train an AdaBoost classifier (default settings) on the TF-IDF training
# features and score it on the held-out test split.
ab_classifier = AdaBoostClassifier()
ab_classifier.fit(X_train_tfidf, y_train)
ab_predictions = ab_classifier.predict(X_test_tfidf)

ab_accuracy = accuracy_score(y_test, ab_predictions)
print("AdaBoost Accuracy:", ab_accuracy)

AdaBoost Accuracy: 0.9488789237668162


In [16]:

# Test-set accuracy of every compared model, keyed by its display name.
# NOTE(review): the MultinomialNB score (`accuracy`) from the first modelling
# cell is not part of this comparison, matching the original selection logic;
# add it here if Naive Bayes should compete as well.
model_accuracies = {
    "Logistic Regression": lr_accuracy,
    "KNN": knn_accuracy,
    "Random Forest": rf_accuracy,
    "AdaBoost": ab_accuracy,
}

# Pick the model with the highest accuracy. max() returns the first key that
# reaches the maximum, so a tie between top models resolves to the one listed
# first above. A chain of strict `>` comparisons would instead fall through to
# its final `else` branch on a tie and report AdaBoost even when AdaBoost is
# not among the best models.
best_algorithm = max(model_accuracies, key=model_accuracies.get)
best_accuracy = model_accuracies[best_algorithm]

print(f"The best algorithm is: {best_algorithm}")
print(f"with an accuracy of: {best_accuracy}")

The best algorithm is: Random Forest
with an accuracy of: 0.979372197309417


# New Section