In [None]:
import functools
import pickle
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score,recall_score,f1_score

In [2]:
@functools.lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, building it only once."""
    return frozenset(stopwords.words('english'))


def clean_text(text):
    """Lowercase a message, strip most punctuation and drop English stopwords.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # The stopword set is identical for every call, so it is fetched from a
    # cached helper instead of being rebuilt for each row passed through .apply.
    words_to_remove = _english_stopwords()
    text = text.lower()
    # Remove punctuation but keep !, $, @ and ?: per the EDA notebook they
    # occur more often in spam than in ham.
    text = re.sub(r"[^\w\s!$@?]", "", text)
    tokens = nltk.word_tokenize(text)
    return ' '.join(word for word in tokens if word not in words_to_remove)

In [3]:
def lemmatize_text(text):
    """Tokenize ``text`` and return its WordNet-lemmatized tokens joined by spaces.

    Each token is passed to ``WordNetLemmatizer.lemmatize`` with that method's
    default arguments.
    """
    wordnet = WordNetLemmatizer()
    lemmas = map(wordnet.lemmatize, nltk.word_tokenize(text))
    return ' '.join(lemmas)

In [4]:
def show_score(yhat,y_acutal):
    """Print recall, precision, F1 and the confusion matrix for a set of predictions.

    Parameters
    ----------
    yhat
        Predicted labels.
    y_acutal
        True labels (the parameter name is kept as spelled for existing callers).

    Note that the predictions come first here, whereas the scikit-learn metric
    functions are called with the true labels first.
    """
    recall = recall_score(y_acutal, yhat)
    print(f'recall score: {recall}')

    precision = precision_score(y_acutal, yhat)
    print(f'precision score: {precision}')

    f1 = f1_score(y_acutal, yhat)
    print(f'f1 score: {f1}')

    matrix = confusion_matrix(y_acutal, yhat)
    print(f'confusion_matrix: {matrix}')

In [5]:
## Load the model
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# artifacts that come from a trusted source.
# The `with` blocks make sure each file handle is closed once it has been read.
with open('../Models/model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

## load vectorizer
with open('../Models/vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

In [6]:
## Read the held-out test set from disk
test_data = pd.read_csv(filepath_or_buffer='../Data/test.csv')

In [7]:
## Clean the data
# Two passes over the 'Message' column: clean_text first, then lemmatize_text
# on the cleaned result.
test_data['Message'] = (
    test_data['Message']
    .apply(clean_text)
    .apply(lemmatize_text)
)

In [8]:
# Hand-crafted numeric features computed from the processed message text:
#   total_number - how many runs of digits the message contains
#   len          - number of characters in the message
test_data['total_number'] = test_data['Message'].apply(
    lambda message: sum(1 for _ in re.finditer(r'\d+', message))
)
test_data['len'] = test_data['Message'].apply(len)

# Vectorizer output as a dense frame, one column per vectorizer feature name
X_test_countvectorized = pd.DataFrame(
    data=vectorizer.transform(test_data['Message']).toarray(),
    columns=vectorizer.get_feature_names_out(),
)

# Numeric features first, then the vectorizer columns; the index is reset so
# both frames line up row by row on a 0..n-1 index.
X_test = pd.concat(
    objs=[test_data[['total_number', 'len']].reset_index(drop=True), X_test_countvectorized],
    axis='columns',
)

In [11]:
# Encode the labels as integers: ham -> 0, spam -> 1.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selection: the chained in-place form
# triggers pandas' chained-assignment warning and does not update `test_data`
# under Copy-on-Write. The cast keeps the labels as a plain integer column and
# fails loudly if an unexpected label is present.
test_data['Category'] = test_data['Category'].replace({'ham': 0, 'spam': 1}).astype(int)

  test_data['Category'].replace({'ham': 0, 'spam': 1}, inplace=True)


In [12]:
## predict the test data
yhat = model.predict(X_test)

# show_score takes the predictions first and the true labels second
show_score(yhat, y_acutal=test_data['Category'])

recall score: 0.8666666666666667
precision score: 0.9848484848484849
f1 score: 0.9219858156028369
confusion_matrix: [[482   1]
 [ 10  65]]


In [18]:
# Inputs used in this cell:
# - test_data: the test set, whose 'Category' column holds the true labels
# - yhat: the labels the model predicted for those same rows

# Rows whose predicted label equals the true label
correctly_classified = test_data.loc[test_data['Category'] == yhat]

# Of those, only the rows labelled class 1. On a correctly classified row the
# true and predicted labels agree, so filtering on 'Category' == 1 selects
# exactly the rows that were correctly predicted as class 1.
correctly_classified_class_1 = correctly_classified.loc[correctly_classified['Category'] == 1]

In [19]:
# Preview the first five correctly classified rows
correctly_classified.head(n=5)

Unnamed: 0,Category,Message,total_number,len
0,0,better still catch let ask sell ltgt,0,36
1,1,loan purpose 500 75000 homeowner tenant welcom...,4,117
2,0,every day use sleep ltgt,0,24
3,0,unless situation go gurl would appropriate,0,42
4,1,1 nokia tone 4 ur mob every week ! txt nok 870...,7,116


In [21]:
# Preview the first ten rows that were correctly classified as class 1
correctly_classified_class_1.head(n=10)

Unnamed: 0,Category,Message,total_number,len
1,1,loan purpose 500 75000 homeowner tenant welcom...,4,117
4,1,1 nokia tone 4 ur mob every week ! txt nok 870...,7,116
21,1,double min double txt 12 price linerental late...,5,129
22,1,get official england poly ringtone colour flag...,4,133
55,1,1st wk free ! gr8 tone str8 2 u wk txt nokia 8...,9,105
70,1,urgent ! mobile number awarded 2000 prize guar...,5,109
77,1,youve tkts euro2004 cup final 800 cash collect...,5,89
79,1,december ! mobile 11mths ? entitled update lat...,2,116
80,1,ur cashbalance currently 500 pound maximize ur...,7,119
86,1,moby pub quizwin 100 high street prize u know ...,4,119
