In [121]:
import numpy as np
import pandas as pd

import string
import nltk

import itertools
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore")

In [71]:
spam_dataset = pd.read_csv('spam.csv', encoding = "ISO-8859-1", usecols=[0, 1], names=['Spam', 'Text'],skiprows=1)
spam_dataset['Spam'] = spam_dataset['Spam'].replace(['ham', 'spam'], [0, 1])
spam_dataset

Unnamed: 0,Spam,Text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [72]:
print(spam_dataset['Spam'].value_counts(normalize=True))

0    0.865937
1    0.134063
Name: Spam, dtype: float64


In [73]:
def remove_puncation(text):
    cleaned = ''.join([word for word in text if word not in string.punctuation])
    return cleaned
spam_dataset['Cleaned_Text'] = spam_dataset['Text'].apply(lambda x: remove_puncation(x))

In [74]:
def tokenize(text):
    # Usunięcie wielkich liter
    clean_text = text.lower()
    # Tokenizacja
    tokenized_text = nltk.word_tokenize(clean_text)
    return tokenized_text

spam_dataset['Tokenized_Text'] = spam_dataset['Cleaned_Text'].apply(lambda x: tokenize(x))

In [75]:
stopwords = nltk.corpus.stopwords.words("english")
def remove_stopwords(text):
    without_stopwords = [word for word in text if word not in stopwords]
    return without_stopwords
spam_dataset['WithoutStop_Text'] = spam_dataset['Tokenized_Text'].apply(lambda x: remove_stopwords(x))

In [76]:
stemmer = nltk.PorterStemmer()
def stemming(text):
    stemmed_words = [stemmer.stem(word) for word in text]
    return stemmed_words
spam_dataset['Stemmed_Text'] = spam_dataset['WithoutStop_Text'].apply(lambda x: stemming(x))

In [77]:
lemmater = nltk.WordNetLemmatizer()
def lemmatizing(text):
    lemmatized_words = [lemmater.lemmatize(word) for word in text]
    return lemmatized_words
spam_dataset['Lemmatized_Text'] = spam_dataset['WithoutStop_Text'].apply(lambda x: lemmatizing(x))

In [98]:
X = spam_dataset['Lemmatized_Text']
y = spam_dataset['Spam']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

184                            [going, nothing, greatbye]
2171                               [wont, wats, wit, guy]
5422                   [ok, ksry, knw, 2, sivatats, askd]
4113    [stand, away, doesnt, heart, ache, without, do...
4588                     [finished, work, yet, something]
                              ...                        
1932                       [jus, finished, avatar, nigro]
5316                       [jus, finish, watching, tv, u]
2308    [moby, pub, quizwin, å£100, high, street, priz...
1903    [free, entry, 2, weekly, comp, chance, win, ip...
763     [nothing, jus, tot, u, would, ask, co, u, ba, ...
Name: Lemmatized_Text, Length: 4457, dtype: object

In [88]:
tfidf = TfidfVectorizer() 
tfidf_train = tfidf.fit_transform(X_train.apply(lambda x: ' '.join(x)))
tfidf_test = tfidf.transform(X_test.apply(lambda x: ' '.join(x)))

In [122]:
clf = RandomForestClassifier()
clf.fit(tfidf_train,y_train)
clf.score(tfidf_test,y_test)

0.9757847533632287

In [104]:
importances = clf.feature_importances_
how_many_importance=((importances>0.001).sum()/importances.shape)
print(how_many_importance)

[0.02097723]


In [109]:
tfidf_2 = TfidfVectorizer(min_df=0.021) 
tfidf_2_train = tfidf_2.fit_transform(X_train.apply(lambda x: ' '.join(x)))
tfidf_2_test = tfidf_2.transform(X_test.apply(lambda x: ' '.join(x)))
clf_2 = RandomForestClassifier()
clf_2.fit(tfidf_2_train,y_train)
clf_2.score(tfidf_2_test,y_test)

0.9506726457399103

In [113]:
tfidf = TfidfVectorizer()
params_tfidf = {'ngram_range': [(1,1),(1,2),(2,2)],
                'min_df':[0.01,0.02,0.03,0.04,0.05],
                'max_df': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]}
tfidf_gridsearch = GridSearchCV(tfidf,
                             params_tfidf,
                             scoring='f1_macro',
                             cv=5,
                             verbose=10, n_jobs=-1)
tfidf_gridsearch.fit(X_train.apply(lambda x: ' '.join(x)),y_train)
print('\nBest hyperparameter:', tfidf_gridsearch.best_params_)
tfidf_3 = tfidf_gridsearch.best_estimator_

Fitting 5 folds for each of 90 candidates, totalling 450 fits

Best hyperparameter: {'max_df': 0.1, 'min_df': 0.01, 'ngram_range': (1, 1)}


In [114]:
tfidf_3_train = tfidf_3.fit_transform(X_train.apply(lambda x: ' '.join(x)))
tfidf_3_test = tfidf_3.transform(X_test.apply(lambda x: ' '.join(x)))

In [116]:
rf = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
params_rf = {'max_depth': [20,50,100,200,500],
             'min_samples_leaf': [1,2,3,5,10,15,20]}
rf_gridsearch = GridSearchCV(rf,
                             params_rf,
                             scoring='f1_macro',
                             cv=5,
                             verbose=10, n_jobs=-1)
rf_gridsearch.fit(tfidf_3_train,y_train)
print('\nBest hyperparameter:', rf_gridsearch.best_params_)
clf_3 = rf_gridsearch.best_estimator_

Fitting 5 folds for each of 35 candidates, totalling 175 fits

Best hyperparameter: {'max_depth': 50, 'min_samples_leaf': 1}


In [117]:
clf_3.fit(tfidf_3_train,y_train)
clf_3.score(tfidf_3_test,y_test)

0.9641255605381166

In [119]:
print('Random Forest:                             ', clf.score(tfidf_test,y_test))
print('Random Forest - feature importance > 0.001:', clf_2.score(tfidf_2_test,y_test))
print('Random Forest - GridSearch:                ', clf_3.score(tfidf_3_test,y_test))

Random Forest:                              0.9766816143497757
Random Forest - feature importance > 0.001: 0.9506726457399103
Random Forest - GridSearch:                 0.9641255605381166
