In [1]:

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import re
from nltk.stem import WordNetLemmatizer
import joblib
import nltk
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [2]:
df = pd.read_csv('spam_ham_dataset.csv')
nltk.download('wordnet')
nltk.download('punkt')
print(df)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


      Unnamed: 0 label                                               text  \
0            605   ham  Subject: enron methanol ; meter # : 988291\r\n...   
1           2349   ham  Subject: hpl nom for january 9 , 2001\r\n( see...   
2           3624   ham  Subject: neon retreat\r\nho ho ho , we ' re ar...   
3           4685  spam  Subject: photoshop , windows , office . cheap ...   
4           2030   ham  Subject: re : indian springs\r\nthis deal is t...   
...          ...   ...                                                ...   
5166        1518   ham  Subject: put the 10 on the ft\r\nthe transport...   
5167         404   ham  Subject: 3 / 4 / 2000 and following noms\r\nhp...   
5168        2933   ham  Subject: calpine daily gas nomination\r\n>\r\n...   
5169        1409   ham  Subject: industrial worksheets for august 2000...   
5170        4807  spam  Subject: important online banking alert\r\ndea...   

      label_num  
0             0  
1             0  
2             0  
3  

In [3]:
df.loc[df['label']=='spam','label'] = 0
df.loc[df['label']=='ham','label'] = 1
df.head()
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = re.sub(r'\d+', '', text) #usuwamy liczby#
    text = re.sub(r'\W+', ' ', text) #usuwamy interpunkcję#
    text = text.lower() #usuwamy duże litery#
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    return text

df['text'] = df['text'].apply(preprocess_text)
print(df)

      Unnamed: 0 label                                               text  \
0            605     1  subject enron methanol meter this is a follow ...   
1           2349     1  subject hpl nom for january see attached file ...   
2           3624     1  subject neon retreat ho ho ho we re around to ...   
3           4685     0  subject photoshop window office cheap main tre...   
4           2030     1  subject re indian spring this deal is to book ...   
...          ...   ...                                                ...   
5166        1518     1  subject put the on the ft the transport volume...   
5167         404     1  subject and following noms hpl can t take the ...   
5168        2933     1  subject calpine daily gas nomination julie a i...   
5169        1409     1  subject industrial worksheet for august activi...   
5170        4807     0  subject important online banking alert dear va...   

      label_num  
0             0  
1             0  
2             0  
3  

In [4]:
X = df['text']
y = df['label']

In [5]:


X_train , X_test, y_train, y_test = train_test_split(X,y, test_size=0.5, random_state=42, stratify=y)
y_train , y_test = y_train.astype('int'), y_test.astype('int')


In [6]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('clf', LogisticRegression(max_iter=1000))
])
params = {
    'tfidf__max_features': [500,1000, 2000, None],
    'clf__C': [0.1, 1.0, 10.0, 100],
    'clf__solver': ['liblinear', 'saga']
}
gs = GridSearchCV(estimator=pipeline, param_grid=params, cv=5, scoring='accuracy', verbose=1)
gs.fit(X_train,y_train)



Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [7]:
model = gs.best_estimator_
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')
print(f'Confusion Matrix:\n{conf_matrix}')

Accuracy: 0.9911059551430781
Precision: 0.9967123287671232
Recall: 0.9907407407407407
F1-Score: 0.9937175635072385
Confusion Matrix:
[[ 744    6]
 [  17 1819]]


Prediction: Spam
