# Machine Learning Algorithms

In [9]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


from lightgbm import LGBMClassifier
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier


In [2]:
df1 = pd.read_csv("encoded_emails.csv")
df1.head(5)

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
X = df1["Message"]
y = df1["Category"]

In [4]:
# Text vectorization using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Gradient Boosting Techniques

In [10]:
#Ada Boost
weak_learner = DecisionTreeClassifier(max_depth=1)
ada_classifier = AdaBoostClassifier(base_estimator=weak_learner, n_estimators=50, random_state=42)
ada_classifier.fit(X_train, y_train)

#Light Gradient Boosting (lighbm)
lgb = LGBMClassifier(n_estimators=100, random_state=42, verbosity=-1)  # Set verbosity to -1 to suppress warning
lgb.fit(X_train, y_train)


# XGBoost classifier
xgb_classifier = xgb.XGBClassifier(n_estimators=100, random_state=42)
xgb_classifier.fit(X_train, y_train)

# CatBoost classifier
catboost_classifier = CatBoostClassifier(iterations=100, random_state=42)
catboost_classifier.fit(X_train, y_train)





Learning rate set to 0.152166
0:	learn: 0.5336982	total: 46.7ms	remaining: 4.62s
1:	learn: 0.4271861	total: 84.7ms	remaining: 4.15s
2:	learn: 0.3560125	total: 125ms	remaining: 4.03s
3:	learn: 0.3029378	total: 167ms	remaining: 4.02s
4:	learn: 0.2603002	total: 211ms	remaining: 4.02s
5:	learn: 0.2364007	total: 252ms	remaining: 3.95s
6:	learn: 0.2134595	total: 293ms	remaining: 3.89s
7:	learn: 0.1957810	total: 336ms	remaining: 3.86s
8:	learn: 0.1822947	total: 378ms	remaining: 3.82s
9:	learn: 0.1683614	total: 417ms	remaining: 3.75s
10:	learn: 0.1610274	total: 459ms	remaining: 3.72s
11:	learn: 0.1529592	total: 497ms	remaining: 3.65s
12:	learn: 0.1457076	total: 536ms	remaining: 3.58s
13:	learn: 0.1393862	total: 576ms	remaining: 3.54s
14:	learn: 0.1333184	total: 617ms	remaining: 3.5s
15:	learn: 0.1291421	total: 660ms	remaining: 3.46s
16:	learn: 0.1252931	total: 699ms	remaining: 3.41s
17:	learn: 0.1192121	total: 740ms	remaining: 3.37s
18:	learn: 0.1165899	total: 782ms	remaining: 3.33s
19:	learn:

<catboost.core.CatBoostClassifier at 0x1f4f90f7d10>

In [11]:
# Convert probabilities to binary predictions
y_pred = lgb.predict(X_test)
y_pred_binary = [1 if pred >= 0.5 else 0 for pred in y_pred]
# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred_binary)
print("Accuracy of Lightbm                            :", accuracy)

# Make predictions on the test set
y_pred = ada_classifier.predict(X_test.toarray())
# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of Ada Boost                          :", accuracy)


# Make predictions on the test set
y_pred = xgb_classifier.predict(X_test)
# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of XGboost                            :", accuracy)


# Make predictions on the test set
y_pred = catboost_classifier.predict(X_test)
# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of Catboost                           :", accuracy)

Accuracy of Lightbm                            : 0.9832535885167464
Accuracy of Ada Boost                          : 0.979066985645933
Accuracy of XGboost                            : 0.9832535885167464
Accuracy of Catboost                           : 0.9772727272727273
