# Spam Detection Analysis Using NLP

In [None]:
# Standard library
import re

# Third-party
import nltk
import pandas as pd

# Fetch the NLTK stop-word corpus, then import its reader.
# (The import is kept last so the cell does not echo the download's return value.)
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Read the raw SMS dataset; the encoding is given explicitly as latin-1
# instead of relying on pandas' default decoder.
df = pd.read_csv("spam.csv", encoding="latin-1")

In [None]:
# Inspect the freshly loaded frame (the notebook display abbreviates the middle rows).
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


$$ \textbf{Apart from the first two columns, every column contains only null values, so we keep just the first two columns.} $$

In [None]:
# Keep only the message text (v2) and its label (v1). The explicit .copy()
# makes the result an independent frame rather than a slice of the raw data,
# so later renames / column assignments do not raise SettingWithCopyWarning.
df = df[['v2', 'v1']].copy()

$$ \textbf{Renaming the columns, as the original names are a bit vague} $$

In [None]:
# Rebind instead of renaming in place: rename() returns a new frame, which
# avoids mutating a slice of the raw data (the cause of SettingWithCopyWarning).
df = df.rename(columns={'v2': 'messages', 'v1': 'label'})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
# Confirm the new column names on the first few rows.
df.head()

Unnamed: 0,messages,label
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


$$ \textbf{ DATA PREPROCESSING } $$

In [None]:
# Count the missing values in each column.
df.isna().sum()

messages    0
label       0
dtype: int64

No null values in the data

In [None]:
# English stop words, held in a set for fast membership tests during cleaning.
Stopwords = set(stopwords.words("english"))
def text_clean(text, stop_words=None):
    """Normalise a raw message into lower-case, space-separated tokens.

    The text is lower-cased, every character that is not an ASCII letter or
    digit is treated as a separator, and stop words are dropped.

    Parameters
    ----------
    text : str
        Raw message.
    stop_words : collection of str, optional
        Words to discard. When omitted, the notebook-level ``Stopwords``
        set is used.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ('' if none remain).
    """
    if stop_words is None:
        stop_words = Stopwords  # looked up at call time
    # Non-alphanumerics become spaces; split() then absorbs any run of
    # whitespace, so a separate whitespace-collapsing pass is unnecessary.
    tokens = re.sub(r'[^0-9a-zA-Z]', ' ', text.lower()).split()
    return " ".join(token for token in tokens if token not in stop_words)

In [None]:
# Add the cleaned text as a new column. assign() returns a new frame, so this
# never writes into a slice of the raw data (no SettingWithCopyWarning).
df = df.assign(clean_text=df['messages'].apply(text_clean))
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,messages,label,clean_text
0,"Go until jurong point, crazy.. Available only ...",ham,go jurong point crazy available bugis n great ...
1,Ok lar... Joking wif u oni...,ham,ok lar joking wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,U dun say so early hor... U c already then say...,ham,u dun say early hor u c already say
4,"Nah I don't think he goes to usf, he lives aro...",ham,nah think goes usf lives around though


$$ \textbf{INPUT SPLIT} $$

In [None]:
# Model input is the cleaned text; the target is the ham/spam label.
x, y = df["clean_text"], df["label"]

$$ \textbf{MODEL TRAINING} $$

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, classification_report


def clasification(model, x, y, test_size=0.3, random_state=42):
    """Fit ``model`` on TF-IDF text features and print hold-out metrics.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn classifier; it becomes the final pipeline step.
    x : sequence of str
        Text documents.
    y : sequence
        Class labels aligned with ``x``; also used to stratify the split.
    test_size : float, default 0.3
        Fraction of the data held out for evaluation.
    random_state : int, default 42
        Seed for the shuffled train/test split.

    Prints the accuracy (in percent) and the classification report for the
    held-out set. Returns nothing.
    """
    # Stratified, shuffled hold-out split.
    X_train, X_test, Y_train, Y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state, shuffle=True, stratify=y
    )

    # Token counts -> TF-IDF weights -> classifier.
    pipeline_model = Pipeline([
        ('vector', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('CLF', model),
    ])
    pipeline_model.fit(X_train, Y_train)

    # Predict once and reuse the predictions for both metrics, instead of
    # running inference a second time through pipeline.score().
    Y_pred = pipeline_model.predict(X_test)
    print("accuracy", accuracy_score(Y_test, Y_pred) * 100)
    print(classification_report(Y_test, Y_pred))


$$ \textbf{ USING LOGISTIC REGRESSION} $$

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression built with its default constructor arguments.
model = LogisticRegression()
clasification(model=model, x=x, y=y)

accuracy 96.5909090909091
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1448
        spam       0.99      0.75      0.86       224

    accuracy                           0.97      1672
   macro avg       0.98      0.88      0.92      1672
weighted avg       0.97      0.97      0.96      1672



$$ \textbf{ USING MULTINOMIAL-NB} $$

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes built with its default constructor arguments.
model = MultinomialNB()
clasification(model=model, x=x, y=y)

accuracy 96.65071770334929
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1448
        spam       1.00      0.75      0.86       224

    accuracy                           0.97      1672
   macro avg       0.98      0.88      0.92      1672
weighted avg       0.97      0.97      0.96      1672



$$ \textbf{ USING SUPPORT VECTOR MACHINE } $$

In [None]:
from sklearn.svm import SVC

# Support vector classifier; only C is overridden (C=3).
model = SVC(C=3)
clasification(model=model, x=x, y=y)

accuracy 98.38516746411483
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1448
        spam       1.00      0.88      0.94       224

    accuracy                           0.98      1672
   macro avg       0.99      0.94      0.96      1672
weighted avg       0.98      0.98      0.98      1672



$$ \textbf{ USING RANDOM FOREST} $$

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the reported metrics are reproducible across runs;
# without random_state each run builds different trees.
model = RandomForestClassifier(random_state=42)
clasification(model, x, y)

accuracy 97.60765550239235
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99      1448
        spam       1.00      0.82      0.90       224

    accuracy                           0.98      1672
   macro avg       0.99      0.91      0.94      1672
weighted avg       0.98      0.98      0.98      1672

