In [3]:
import pandas as pd
import re
import nltk
import swifter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
import joblib




In [4]:
!pip install swifter



In [5]:
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [6]:
df=pd.read_csv('/content/combined_data.csv')
df.head()

Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...


In [7]:
df_copy=df.copy()

In [8]:
df_copy.isnull().sum()

Unnamed: 0,0
label,0
text,0


In [9]:
df_copy.shape

(83448, 2)

In [10]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83448 entries, 0 to 83447
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   83448 non-null  int64 
 1   text    83448 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.3+ MB


In [11]:
# so we are checking here how many emails are spam or not spam?
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,43910
0,39538


In [12]:
df_copy['text'] = df_copy['text'].str.lower()

In [13]:
df_copy['text'] = df_copy['text'].str.replace('[^a-zA-Z0-9]',' ', regex=True)

#Text Preprocessing

In [14]:


stop_words=set(stopwords.words('english'))
def tokenize_and_remove_stopwords(text):
    token=word_tokenize(text)
    filtered_tokens=[word for word in token if word not in stop_words]
    return filtered_tokens



In [15]:
df_copy['text'].swifter.apply(tokenize_and_remove_stopwords)

Pandas Apply:   0%|          | 0/83448 [00:00<?, ?it/s]

Unnamed: 0,text
0,"[ounce, feather, bowl, hummingbird, opec, mome..."
1,"[wulvob, get, medircations, online, qnb, ikud,..."
2,"[computer, connection, cnn, com, wednesday, es..."
3,"[university, degree, obtain, prosperous, futur..."
4,"[thanks, answers, guys, know, checked, rsync, ..."
...,...
83443,"[hi, given, date, get, last, date, month, data..."
83444,"[order, software, cd, download, site, immediat..."
83445,"[dear, valued, member, canadianpharmacy, provi..."
83446,"[subscribe, change, profile, contact, us, long..."


In [16]:
lemmatizer=WordNetLemmatizer()

def lemmatize_tokens(tokens):
    lemmatized_list = [lemmatizer.lemmatize(word) for word in tokens]

    return lemmatized_list



In [17]:
df_copy['text'].swifter.apply(lemmatize_tokens)

Pandas Apply:   0%|          | 0/83448 [00:00<?, ?it/s]

Unnamed: 0,text
0,"[o, u, n, c, e, , f, e, a, t, h, e, r, , b, ..."
1,"[w, u, l, v, o, b, , g, e, t, , y, o, u, r, ..."
2,"[ , c, o, m, p, u, t, e, r, , c, o, n, n, e, ..."
3,"[u, n, i, v, e, r, s, i, t, y, , d, e, g, r, ..."
4,"[t, h, a, n, k, s, , f, o, r, , a, l, l, , ..."
...,...
83443,"[h, i, , g, i, v, e, n, , a, , d, a, t, e, ..."
83444,"[n, o, w, , y, o, u, , c, a, n, , o, r, d, ..."
83445,"[d, e, a, r, , v, a, l, u, e, d, , m, e, m, ..."
83446,"[s, u, b, s, c, r, i, b, e, , c, h, a, n, g, ..."


In [18]:
vectorizer=TfidfVectorizer()
X=vectorizer.fit_transform(df_copy['text'])
y=df_copy['label']

In [19]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
model=LogisticRegression()
model.fit(X_train,y_train)

In [20]:
y_pred=model.predict(X_test)

In [21]:
print(accuracy_score(y_test,y_pred))
print(precision_score(y_test,y_pred))
print(recall_score(y_test,y_pred))

0.983343319352906
0.9769248086447546
0.9916590493601463


In [22]:
joblib.dump(model,'spam_detection_model.pkl')
joblib.dump(vectorizer,'vectorizer.pkl')

['vectorizer.pkl']