In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
from nltk.stem import WordNetLemmatizer
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# WordNet corpus: data backing the WordNetLemmatizer that DataLoader instantiates.
nltk.download('wordnet')
# Punkt tokenizer models. NOTE(review): nothing in the visible cells calls an NLTK
# tokenizer (text is split with str.split) - confirm this download is still needed.
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
class DataLoader:
    """Load, clean and split a labelled text dataset for spam/ham classification.

    The CSV file must provide a ``text`` column (raw message) and a ``label``
    column holding the strings ``'spam'`` / ``'ham'``.

    Typical use: ``load_data()`` -> ``preprocess_data()`` -> ``split_data()``.
    """

    # Integer encoding applied to the string labels by preprocess_data().
    _LABEL_CODES = {'spam': 0, 'ham': 1}

    def __init__(self, file_name):
        """Store the CSV path and create the lemmatizer; the file is not read here."""
        self.file_name = file_name
        self.df = None  # populated by load_data()
        self.lemmatizer = WordNetLemmatizer()

    def load_data(self):
        """Read the CSV at ``self.file_name`` into ``self.df`` and return the frame."""
        self.df = pd.read_csv(self.file_name)
        return self.df

    def preprocess_text(self, text):
        """Normalize one message for vectorization.

        Steps: strip digits, replace every run of non-word characters with a
        single space, lower-case, then lemmatize each whitespace-separated token.

        Args:
            text: The raw message. Missing values (``None``/NaN/``pd.NA``) are
                treated as an empty message; other non-string scalars are
                converted with ``str()``.

        Returns:
            The cleaned message as a single space-separated string.
        """
        if not isinstance(text, str):
            # Empty CSV cells are read as NaN (a float); feeding that to re.sub
            # raises TypeError and would abort the whole preprocessing pass.
            is_missing = pd.api.types.is_scalar(text) and pd.isna(text)
            text = '' if is_missing else str(text)
        text = re.sub(r'\d+', '', text)  # remove numbers
        text = re.sub(r'\W+', ' ', text)  # remove punctuation
        text = text.lower()  # remove upper-case letters
        return ' '.join(self.lemmatizer.lemmatize(word) for word in text.split())

    def preprocess_data(self):
        """Clean the ``text`` column and encode ``label`` as spam=0 / ham=1.

        Modifies ``self.df`` in place and returns it. Label values that are not
        ``'spam'`` or ``'ham'`` (including already-encoded 0/1) are left as they
        are, so calling this method twice does not corrupt the labels.
        """
        self.df['text'] = self.df['text'].apply(self.preprocess_text)
        codes = self._LABEL_CODES
        # Build the encoded column in one pass instead of writing ints into the
        # string column through .loc masks: this yields a proper integer column
        # when every label is known and does not depend on the column's dtype
        # accepting mixed str/int values.
        self.df['label'] = self.df['label'].map(lambda label: codes.get(label, label))
        return self.df

    def split_data(self, test_size=0.2, random_state=42):
        """Split the frame into train and test parts, stratified by label.

        Args:
            test_size: Passed to ``train_test_split`` as ``test_size``.
            random_state: Seed passed to ``train_test_split``.

        Returns:
            ``(X_train, X_test, y_train, y_test)`` where X is the ``text``
            column and y is the ``label`` column cast to int.
        """
        X = self.df['text']
        y = self.df['label'].astype('int')
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=y
        )
        return X_train, X_test, y_train, y_test



# One seed shared by the train/test split and the classifier so that a fresh
# "Restart & Run All" reproduces the same search result and metrics.
RANDOM_STATE = 42

data_loader = DataLoader('spam_ham_dataset.csv')

data_loader.load_data()
data_loader.preprocess_data()

X_train, X_test, y_train, y_test = data_loader.split_data(random_state=RANDOM_STATE)

# The vectorizer lives inside the pipeline, so during cross-validation it is
# re-fitted on each training fold only.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 2))),
    # Both solvers in the grid below shuffle the data; without random_state the
    # fitted coefficients (and possibly the selected candidate) vary per run.
    ('clf', LogisticRegression(max_iter=1000, random_state=RANDOM_STATE))
])

# 4 x 4 x 2 = 32 candidates, each evaluated with 5-fold cross-validation.
params = {
    'tfidf__max_features': [500, 1000, 2000, None],
    'clf__C': [0.1, 1.0, 10.0, 100],
    'clf__solver': ['liblinear', 'saga']
}

gs = GridSearchCV(estimator=pipeline, param_grid=params, cv=5, scoring='accuracy', verbose=1)
gs.fit(X_train, y_train)


# Best pipeline found by the search, evaluated on the held-out test split.
model = gs.best_estimator_
y_pred = model.predict(X_test)

# NOTE: precision/recall/F1 use scikit-learn's default pos_label=1, and
# DataLoader.preprocess_data encodes 'ham' as 1 - so these three scores
# describe ham detection, not spam detection.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# Rows are true labels, columns are predicted labels, ordered 0 (spam), 1 (ham).
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')
print(f'Confusion Matrix:\n{conf_matrix}')


Fitting 5 folds for each of 32 candidates, totalling 160 fits
Accuracy: 0.9903381642512077
Precision: 0.9945429740791268
Recall: 0.9918367346938776
F1-Score: 0.9931880108991826
Confusion Matrix:
[[296   4]
 [  6 729]]
