# **Installation**

In [None]:
#Installation
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

In [None]:
#spark initialize
import findspark
findspark.init()

In [None]:
#Creating Session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("SpamEmailClassification").getOrCreate()

**Import Package**

# **Load Dataset**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from pyspark.sql import SparkSession

# Read the mail dataset with the `spark` session created earlier: the first row is
# the header and column types are inferred from the data.
csv_reader = spark.read.option("header", True).option("inferSchema", True)
data = csv_reader.csv("/content/drive/MyDrive/Research/mail_data.csv")
data.show()

+--------+--------------------+
|Category|             Message|
+--------+--------------------+
|     ham|Go until jurong p...|
|     ham|Ok lar... Joking ...|
|    spam|Free entry in 2 a...|
|     ham|U dun say so earl...|
|     ham|Nah I don't think...|
|    spam|FreeMsg Hey there...|
|     ham|Even my brother i...|
|     ham|As per your reque...|
|    spam|WINNER!! As a val...|
|    spam|Had your mobile 1...|
|     ham|I'm gonna be home...|
|    spam|SIX chances to wi...|
|    spam|URGENT! You have ...|
|     ham|I've been searchi...|
|     ham|I HAVE A DATE ON ...|
|    spam|XXXMobileMovieClu...|
|     ham|Oh k...i'm watchi...|
|     ham|Eh u remember how...|
|     ham|Fine if thats th...|
|    spam|England v Macedon...|
+--------+--------------------+
only showing top 20 rows



In [None]:
from pyspark.sql.functions import length

In [None]:
data=data.withColumn("length",length(data["Message"]))

In [None]:
data.show()

+--------+--------------------+------+
|Category|             Message|length|
+--------+--------------------+------+
|     ham|Go until jurong p...|   111|
|     ham|Ok lar... Joking ...|    29|
|    spam|Free entry in 2 a...|   155|
|     ham|U dun say so earl...|    49|
|     ham|Nah I don't think...|    61|
|    spam|FreeMsg Hey there...|   147|
|     ham|Even my brother i...|    77|
|     ham|As per your reque...|   160|
|    spam|WINNER!! As a val...|   157|
|    spam|Had your mobile 1...|   154|
|     ham|I'm gonna be home...|   109|
|    spam|SIX chances to wi...|   136|
|    spam|URGENT! You have ...|   155|
|     ham|I've been searchi...|   196|
|     ham|I HAVE A DATE ON ...|    35|
|    spam|XXXMobileMovieClu...|   149|
|     ham|Oh k...i'm watchi...|    26|
|     ham|Eh u remember how...|    81|
|     ham|Fine if thats th...|    56|
|    spam|England v Macedon...|   155|
+--------+--------------------+------+
only showing top 20 rows



**Column Indexing**

In [None]:
data.columns

['Category', 'Message', 'length']

**Checking DataTypes of the Columns**

In [None]:
data.printSchema()#value type


root
 |-- Category: string (nullable = true)
 |-- Message: string (nullable = true)
 |-- length: integer (nullable = true)



In [None]:
data.dtypes

[('Category', 'string'), ('Message', 'string'), ('length', 'int')]

In [None]:
# Count the null values in every column.
# The loop variable is deliberately not called `col`: a later cell imports
# `col` from pyspark.sql.functions, and re-running this loop afterwards would
# rebind that name to a plain string and break every later `col("...")` call.
for column_name in data.columns:
    print(column_name.ljust(10), data.filter(data[column_name].isNull()).count())

Category   0
Message    0
length     0


In [None]:
#Converting dataset into pandas to observe the data in a more formatted way
data.limit(5580).toPandas()

Unnamed: 0,Category,Message,length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61
...,...,...,...
5569,spam,This is the 2nd time we have tried 2 contact u...,160
5570,ham,Will ü b going to esplanade fr home?,36
5571,ham,"Pity, * was in mood for that. So...any other s...",57
5572,ham,The guy did some bitching but I acted like i'd...,125


In [None]:
#To know the shape of the dataframe
print('Number of rows: \t', data.count())
print('Number of columns: \t', len(data.columns))

Number of rows: 	 5574
Number of columns: 	 3


In [None]:
import pyspark.sql.functions as F

# Encode the label as an integer: "spam" becomes 0, every other value becomes 1.
spam_flag = data["Category"] == "spam"
encoded_label = F.when(spam_flag, 0).otherwise(1)
data2 = data.withColumn("Category", encoded_label)

In [None]:
data2.show()

+--------+--------------------+------+
|Category|             Message|length|
+--------+--------------------+------+
|       1|Go until jurong p...|   111|
|       1|Ok lar... Joking ...|    29|
|       0|Free entry in 2 a...|   155|
|       1|U dun say so earl...|    49|
|       1|Nah I don't think...|    61|
|       0|FreeMsg Hey there...|   147|
|       1|Even my brother i...|    77|
|       1|As per your reque...|   160|
|       0|WINNER!! As a val...|   157|
|       0|Had your mobile 1...|   154|
|       1|I'm gonna be home...|   109|
|       0|SIX chances to wi...|   136|
|       0|URGENT! You have ...|   155|
|       1|I've been searchi...|   196|
|       1|I HAVE A DATE ON ...|    35|
|       0|XXXMobileMovieClu...|   149|
|       1|Oh k...i'm watchi...|    26|
|       1|Eh u remember how...|    81|
|       1|Fine if thats th...|    56|
|       0|England v Macedon...|   155|
+--------+--------------------+------+
only showing top 20 rows



In [None]:
data3 = data2.limit(10).toPandas()
data3

Unnamed: 0,Category,Message,length
0,1,"Go until jurong point, crazy.. Available only ...",111
1,1,Ok lar... Joking wif u oni...,29
2,0,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,1,U dun say so early hor... U c already then say...,49
4,1,"Nah I don't think he goes to usf, he lives aro...",61
5,0,FreeMsg Hey there darling it's been 3 week's n...,147
6,1,Even my brother is not like to speak with me. ...,77
7,1,As per your request 'Melle Melle (Oru Minnamin...,160
8,0,WINNER!! As a valued network customer you have...,157
9,0,Had your mobile 11 months or more? U R entitle...,154


In [None]:
# Recompute the message length from data2's own "Message" column rather than
# reaching back into the parent DataFrame `data` for the column reference.
data2.withColumn("length", length(data2["Message"])).show()

+--------+--------------------+------+
|Category|             Message|length|
+--------+--------------------+------+
|       1|Go until jurong p...|   111|
|       1|Ok lar... Joking ...|    29|
|       0|Free entry in 2 a...|   155|
|       1|U dun say so earl...|    49|
|       1|Nah I don't think...|    61|
|       0|FreeMsg Hey there...|   147|
|       1|Even my brother i...|    77|
|       1|As per your reque...|   160|
|       0|WINNER!! As a val...|   157|
|       0|Had your mobile 1...|   154|
|       1|I'm gonna be home...|   109|
|       0|SIX chances to wi...|   136|
|       0|URGENT! You have ...|   155|
|       1|I've been searchi...|   196|
|       1|I HAVE A DATE ON ...|    35|
|       0|XXXMobileMovieClu...|   149|
|       1|Oh k...i'm watchi...|    26|
|       1|Eh u remember how...|    81|
|       1|Fine if thats th...|    56|
|       0|England v Macedon...|   155|
+--------+--------------------+------+
only showing top 20 rows



In [None]:
# Average message length per class. Only "length" is averaged: "Category" is the
# grouping key, so averaging it merely echoes the key (avg(Category) == Category).
data2.groupBy('Category').mean('length').show()


+--------+-------------+-----------------+
|Category|avg(Category)|      avg(length)|
+--------+-------------+-----------------+
|       1|          1.0|71.18810855603894|
|       0|          0.0|137.7550200803213|
+--------+-------------+-----------------+



**Tokenization**

In [None]:
from pyspark.ml.feature import Tokenizer

tokenizer = Tokenizer(inputCol="Message", outputCol="Tokenizered_Message")
tokenized_data = tokenizer.transform(data2)
tokenized_data.show()

+--------+--------------------+------+--------------------+
|Category|             Message|length| Tokenizered_Message|
+--------+--------------------+------+--------------------+
|       1|Go until jurong p...|   111|[go, until, juron...|
|       1|Ok lar... Joking ...|    29|[ok, lar..., joki...|
|       0|Free entry in 2 a...|   155|[free, entry, in,...|
|       1|U dun say so earl...|    49|[u, dun, say, so,...|
|       1|Nah I don't think...|    61|[nah, i, don't, t...|
|       0|FreeMsg Hey there...|   147|[freemsg, hey, th...|
|       1|Even my brother i...|    77|[even, my, brothe...|
|       1|As per your reque...|   160|[as, per, your, r...|
|       0|WINNER!! As a val...|   157|[winner!!, as, a,...|
|       0|Had your mobile 1...|   154|[had, your, mobil...|
|       1|I'm gonna be home...|   109|[i'm, gonna, be, ...|
|       0|SIX chances to wi...|   136|[six, chances, to...|
|       0|URGENT! You have ...|   155|[urgent!, you, ha...|
|       1|I've been searchi...|   196|[i

**Stop word removal**

In [None]:
from pyspark.ml.feature import StopWordsRemover

remover = StopWordsRemover(inputCol="Tokenizered_Message", outputCol="StopWordsRemover_Message")
filtered_data = remover.transform(tokenized_data)
filtered_data.show()

+--------+--------------------+------+--------------------+------------------------+
|Category|             Message|length| Tokenizered_Message|StopWordsRemover_Message|
+--------+--------------------+------+--------------------+------------------------+
|       1|Go until jurong p...|   111|[go, until, juron...|    [go, jurong, poin...|
|       1|Ok lar... Joking ...|    29|[ok, lar..., joki...|    [ok, lar..., joki...|
|       0|Free entry in 2 a...|   155|[free, entry, in,...|    [free, entry, 2, ...|
|       1|U dun say so earl...|    49|[u, dun, say, so,...|    [u, dun, say, ear...|
|       1|Nah I don't think...|    61|[nah, i, don't, t...|    [nah, think, goes...|
|       0|FreeMsg Hey there...|   147|[freemsg, hey, th...|    [freemsg, hey, da...|
|       1|Even my brother i...|    77|[even, my, brothe...|    [even, brother, l...|
|       1|As per your reque...|   160|[as, per, your, r...|    [per, request, 'm...|
|       0|WINNER!! As a val...|   157|[winner!!, as, a,...|    [w

**Text Cleaning**

In [None]:
from pyspark.sql.functions import regexp_replace, lower, col

# Build "cleaned_Message" from the raw "Message" text (not from the token columns):
# remove every character that is not an ASCII letter or whitespace, then lower-case.
letters_only = regexp_replace(col("Message"), "[^a-zA-Z\\s]", "")
cleaned_data = filtered_data.withColumn("cleaned_Message", lower(letters_only))

cleaned_data.show()

+--------+--------------------+------+--------------------+------------------------+--------------------+
|Category|             Message|length| Tokenizered_Message|StopWordsRemover_Message|     cleaned_Message|
+--------+--------------------+------+--------------------+------------------------+--------------------+
|       1|Go until jurong p...|   111|[go, until, juron...|    [go, jurong, poin...|go until jurong p...|
|       1|Ok lar... Joking ...|    29|[ok, lar..., joki...|    [ok, lar..., joki...|ok lar joking wif...|
|       0|Free entry in 2 a...|   155|[free, entry, in,...|    [free, entry, 2, ...|free entry in  a ...|
|       1|U dun say so earl...|    49|[u, dun, say, so,...|    [u, dun, say, ear...|u dun say so earl...|
|       1|Nah I don't think...|    61|[nah, i, don't, t...|    [nah, think, goes...|nah i dont think ...|
|       0|FreeMsg Hey there...|   147|[freemsg, hey, th...|    [freemsg, hey, da...|freemsg hey there...|
|       1|Even my brother i...|    77|[even, m

**Text Analysis**

In [None]:
from pyspark.sql.functions import length

# Average character length of the raw messages.
data_with_length = cleaned_data.withColumn("text_length", length(col("Message")))
# A global aggregation yields a single row; its only field is the average.
avg_row = data_with_length.agg({"text_length": "avg"}).first()
avg_length = avg_row[0]
avg_length

80.10907786149983

In [None]:
from pyspark.sql.functions import length

# Average character length of the messages after cleaning.
data_with_length = cleaned_data.withColumn("text_length", length(col("cleaned_Message")))
# A global aggregation yields a single row; its only field is the average.
avg_row = data_with_length.agg({"text_length": "avg"}).first()
avg_length = avg_row[0]
avg_length

73.44384642985288

# **Prepare Data**

In [None]:
from sklearn.model_selection import train_test_split

# Bring only the cleaned text and the numeric label over to pandas.
pandas_df = cleaned_data.select('cleaned_Message', 'Category').toPandas()

# Features (X) are the cleaned messages, targets (Y) the encoded categories.
X, Y = pandas_df['cleaned_Message'], pandas_df['Category']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
lrc = LogisticRegression(solver='liblinear', penalty='l1')
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)

In [None]:
clfs = {
    'SVC' : svc,
    'KN' : knc,
    'NB': mnb,
    'LR': lrc,
    'ETC': etc,
}

In [None]:
#SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score

def train_classifier(clf, X_train, Y_train, X_test, Y_test, pos_label=1):
    """Fit ``clf`` on TF-IDF features of the training text and score it on the test text.

    A fresh ``TfidfVectorizer`` is fitted on ``X_train`` only and then applied to
    ``X_test``, so the test documents do not influence the vocabulary or IDF weights.

    Parameters
    ----------
    clf : estimator with ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, X_test : collections of text documents.
    Y_train, Y_test : labels aligned with ``X_train`` / ``X_test``.
    pos_label : label treated as the positive class for the precision score.
        Defaults to 1, which matches calling ``precision_score`` without the
        argument. With this notebook's encoding (spam -> 0, otherwise 1) the
        default therefore reports precision for the non-spam class; pass
        ``pos_label=0`` to measure precision on spam.

    Returns
    -------
    tuple
        ``(accuracy, precision)`` computed on the test split.
    """
    tfidf_vectorizer = TfidfVectorizer()
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)

    clf.fit(X_train_tfidf, Y_train)
    y_pred = clf.predict(X_test_tfidf)

    accuracy = accuracy_score(Y_test, y_pred)
    precision = precision_score(Y_test, y_pred, pos_label=pos_label)

    return accuracy, precision

# Assuming 'svc' is your Support Vector Classifier instance
svc = SVC()

accuracy, precision = train_classifier(svc, X_train, Y_train, X_test, Y_test)
print("Accuracy:", accuracy)
print("Precision:", precision)


Accuracy: 0.97847533632287
Precision: 0.9758308157099698


In [None]:
accuracy_scores, precision_scores = [], []

# Train and score every candidate on the same split, keeping the metrics in the
# iteration order of `clfs`.
for name, clf in clfs.items():
    scores = train_classifier(clf, X_train, Y_train, X_test, Y_test)
    accuracy_scores.append(scores[0])
    precision_scores.append(scores[1])

    print("For ",name)
    print("Accuracy - ",scores[0])
    print("Precision - ",scores[1])

For  SVC
Accuracy -  0.9820627802690582
Precision -  0.9817258883248731
For  KN
Accuracy -  0.9165919282511211
Precision -  0.9132075471698113
For  NB
Accuracy -  0.9551569506726457
Precision -  0.9509322865554465
For  LR
Accuracy -  0.957847533632287
Precision -  0.961
For  ETC
Accuracy -  0.9766816143497757
Precision -  0.9738693467336683


In [None]:
# pandas is needed from here on but is not imported by any earlier cell, so a
# fresh "Restart & Run All" would fail with NameError on `pd`.
import pandas as pd

# One row per classifier, best precision first.
performance_df = pd.DataFrame({
    'Algorithm': list(clfs.keys()),
    'Accuracy': accuracy_scores,
    'Precision': precision_scores,
}).sort_values('Precision', ascending=False)

In [None]:
performance_df

Unnamed: 0,Algorithm,Accuracy,Precision
0,SVC,0.982063,0.981726
4,ETC,0.976682,0.973869
3,LR,0.957848,0.961
2,NB,0.955157,0.950932
1,KN,0.916592,0.913208


In [None]:
# Reshape to long format: one (Algorithm, variable, value) row per metric.
performance_df1 = performance_df.melt(id_vars=["Algorithm"])

In [None]:
performance_df1

Unnamed: 0,Algorithm,variable,value
0,SVC,Accuracy,0.982063
1,ETC,Accuracy,0.976682
2,LR,Accuracy,0.957848
3,NB,Accuracy,0.955157
4,KN,Accuracy,0.916592
5,SVC,Precision,0.981726
6,ETC,Precision,0.973869
7,LR,Precision,0.961
8,NB,Precision,0.950932
9,KN,Precision,0.913208


In [None]:
temp_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_max_ft_3000':accuracy_scores,'Precision_max_ft_3000':precision_scores}).sort_values('Precision_max_ft_3000',ascending=False)

In [None]:
temp_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_scaling':accuracy_scores,'Precision_scaling':precision_scores}).sort_values('Precision_scaling',ascending=False)

In [None]:
# At this point `temp_df` already holds the "*_scaling" columns: the
# "*_max_ft_3000" frame built two cells earlier was overwritten before it was ever
# merged, and the next cell merges `temp_df` (scaling) again. Merging `temp_df`
# here as well therefore produced duplicated *_scaling_x / *_scaling_y columns and
# no max_ft_3000 columns. Merge the max_ft_3000 columns here instead.
# NOTE(review): every *_max_ft_3000 / *_scaling / *_num_chars frame reuses the same
# accuracy_scores / precision_scores lists; no experiment is re-run in between, so
# the columns are identical by construction. Confirm whether reruns were intended.
max_ft_df = pd.DataFrame({
    'Algorithm': clfs.keys(),
    'Accuracy_max_ft_3000': accuracy_scores,
    'Precision_max_ft_3000': precision_scores,
})
new_df = performance_df.merge(max_ft_df, on='Algorithm')

In [None]:
new_df_scaled = new_df.merge(temp_df,on='Algorithm')

In [None]:
temp_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_num_chars':accuracy_scores,'Precision_num_chars':precision_scores}).sort_values('Precision_num_chars',ascending=False)

In [None]:
new_df_scaled.merge(temp_df,on='Algorithm')

Unnamed: 0,Algorithm,Accuracy,Precision,Accuracy_scaling_x,Precision_scaling_x,Accuracy_scaling_y,Precision_scaling_y,Accuracy_num_chars,Precision_num_chars
0,SVC,0.982063,0.981726,0.982063,0.981726,0.982063,0.981726,0.982063,0.981726
1,ETC,0.976682,0.973869,0.976682,0.973869,0.976682,0.973869,0.976682,0.973869
2,LR,0.957848,0.961,0.957848,0.961,0.957848,0.961,0.957848,0.961
3,NB,0.955157,0.950932,0.955157,0.950932,0.955157,0.950932,0.955157,0.950932
4,KN,0.916592,0.913208,0.916592,0.913208,0.916592,0.913208,0.916592,0.913208


In [None]:


import re


def clean_message(text):
    """Apply the training-time text cleaning to one raw message.

    The classifier is trained on the "cleaned_Message" column, which is the raw
    message with every character other than ASCII letters and whitespace removed
    and the remainder lower-cased. New text must go through the same steps,
    otherwise tokens such as "don't" are split differently at prediction time
    than they were during training.
    """
    # re.ASCII keeps \s limited to ASCII whitespace, like the Spark-side regex.
    return re.sub(r"[^a-zA-Z\s]", "", text, flags=re.ASCII).lower()


# Fit the vectorizer on the training text only.
feature_extraction = TfidfVectorizer()
X_train_transformed = feature_extraction.fit_transform(X_train)

# Train the model on the transformed features.
model = SVC()
model.fit(X_train_transformed, Y_train)

# Reuse the fitted vectorizer to transform new, identically cleaned text.
input_text = ["REMINDER FROM O2: To get 2.50 pounds free call credit and details of great offers pls reply 2 this text with your valid name, house no and postcode"]
input_features = feature_extraction.transform([clean_message(text) for text in input_text])

# Make predictions (labels: 1 = ham, 0 = spam).
prediction = model.predict(input_features)
print(prediction)

if prediction[0] == 1:
    print("ham")
else:
    print("spam")

[0]
spam


In [None]:
import pickle

# Persist the vectorizer/model pair that was fitted together for the prediction
# demo. The previous names were wrong: `tfidf` is never defined in this notebook,
# and `clf` is only the leftover loop variable of the comparison loop (the last
# entry of `clfs`), not the model used for the prediction.
# `with` closes each file handle once the dump has finished.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(feature_extraction, vectorizer_file)
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# **Save and Load the Model**

In [None]:
import joblib

# Save `model`, the SVC fitted on the TF-IDF features for the prediction demo.
# `clf` must not be used here: it is just the loop variable left over from the
# classifier comparison (the last entry of `clfs`), not a StackingClassifier.
# The fitted `feature_extraction` vectorizer has to be stored and loaded alongside
# this file, since the model only accepts text transformed by it.
joblib.dump(model, 'Spam_Email_Classification_model.pkl')

['Spam_Email_Classification_model.pkl']