In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import matplotlib.pyplot as plt

In [9]:
fullCorpus = pd.read_csv('SMSSpamCollection',sep='\t', header=None, names=['labels','text'])

In [10]:
fullCorpus['txt_len'] = fullCorpus['text'].apply(lambda x : len(x) - x.count(" ") )
fullCorpus['punct_percent'] = fullCorpus['text'].apply(lambda x :
                                                       round(sum([1 for char in x if char in string.punctuation])*100/
                                                             (len(x) - x.count(" ") ),3))

In [17]:
lm = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()
stopword= nltk.corpus.stopwords.words('english')

In [4]:
def clean_text_stemm(text):
    txt_clean = "".join([char for char in text if char not in string.punctuation])
    txt_tokenized = re.split('\W+',txt_clean)
    txt_stop = [ps.stem(word) for word in txt_tokenized if word not in stopword]
    return txt_stop

In [18]:
def clean_text_lemmatizer(text):
    txt_clean = "".join([char for char in text if char not in string.punctuation])
    txt_tokenized = re.split('\W+',txt_clean)
    txt_stop = [lm.lemmatize(word) for word in txt_tokenized if word not in stopword]
    return txt_stop

In [22]:
X_train, X_test, y_train, y_test = train_test_split(fullCorpus[['text','txt_len','punct_percent']], fullCorpus[['labels']], test_size=0.2)

In [23]:
fullCorpus

Unnamed: 0,text,txt_len,punct_percent
2861,I want to be there so i can kiss you and feel ...,45,0.000
780,That means get the door,19,0.000
2565,"Under the sea, there lays a rock. In the rock,...",112,9.821
1745,Someone has conacted our dating service and en...,132,1.515
3619,Hey check it da. I have listed da.,27,7.407
...,...,...,...
4180,Can ü send me a copy of da report?,26,3.846
3369,"Hey elaine, is today's meeting still on?",34,8.824
4601,Hi did u decide wot 2 get 4 his bday if not il...,74,0.000
1441,Cool breeze... Bright sun... Fresh flower... T...,112,16.964


In [24]:
count_vect = CountVectorizer(analyzer=clean_text_lemmatizer)
count_vectorized = count_vect.fit(X_train['text'])

count_train = count_vectorized.transform(X_train['text'])
count_test = count_vectorized.transform(X_test['text'])

X_train_count = pd.concat([X_train[['txt_len','punct_percent']].reset_index(drop=True),
                               pd.DataFrame(count_train.toarray())] , axis=1 )

X_test_count = pd.concat([X_test[['txt_len','punct_percent']].reset_index(drop=True),
                               pd.DataFrame(count_test.toarray())] , axis=1 )


print(X_train_count.shape)
print(X_test_count.shape)
#count_vect.get_feature_names()
X_train_count.head()

(4457, 9667)
(1115, 9667)


Unnamed: 0,txt_len,punct_percent,0,1,2,3,4,5,6,7,...,9655,9656,9657,9658,9659,9660,9661,9662,9663,9664
0,22,9.091,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,34,8.824,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,21,4.762,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,25,4.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,30,26.667,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text_lemmatizer)
tfidf_vectorized = tfidf_vect.fit(X_train['text'])

tfidf_train = tfidf_vectorized.transform(X_train['text'])
tfidf_test = tfidf_vectorized.transform(X_test['text'])

X_train_tfidf = pd.concat([X_train[['txt_len','punct_percent']].reset_index(drop=True),
                               pd.DataFrame(tfidf_train.toarray())] , axis=1 )

X_test_tfidf = pd.concat([X_test[['txt_len','punct_percent']].reset_index(drop=True),
                               pd.DataFrame(tfidf_test.toarray())] , axis=1 )



print(X_train_tfidf.shape)
print(X_test_tfidf.shape)
#count_tfidf.get_feature_names()
X_train_tfidf.head()

(4457, 9667)
(1115, 9667)


Unnamed: 0,txt_len,punct_percent,0,1,2,3,4,5,6,7,...,9655,9656,9657,9658,9659,9660,9661,9662,9663,9664
0,22,9.091,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,34,8.824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,21,4.762,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,25,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,30,26.667,0.247123,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Using Count Vectorizer data

In [32]:
rf = RandomForestClassifier()
params = {
    'n_estimators':[10,50,100,200],
    'max_depth':[20,50,100,None]
}

gs = GridSearchCV(rf,params,n_jobs=-1)
gs_model = gs.fit(X_train_count,y_train)
pd.DataFrame(gs_model.cv_results_).sort_values('mean_test_score',ascending=False)[0:5]

  self.best_estimator_.fit(X, y, **fit_params)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,9.09336,0.208013,0.323902,0.051906,100.0,50,"{'max_depth': 100, 'n_estimators': 50}",0.974215,0.974215,0.977553,0.973064,0.965208,0.972851,0.004106,1
10,16.906093,0.125138,0.379362,0.047132,100.0,100,"{'max_depth': 100, 'n_estimators': 100}",0.974215,0.975336,0.978676,0.974186,0.961841,0.972851,0.005744,2
11,33.514197,0.852337,0.466053,0.039762,100.0,200,"{'max_depth': 100, 'n_estimators': 200}",0.976457,0.975336,0.975309,0.973064,0.961841,0.972401,0.005394,3
14,18.143377,0.286067,0.326448,0.028349,,100,"{'max_depth': None, 'n_estimators': 100}",0.975336,0.971973,0.978676,0.971942,0.962963,0.972178,0.005238,4
15,26.898147,1.785905,0.268833,0.037232,,200,"{'max_depth': None, 'n_estimators': 200}",0.973094,0.971973,0.977553,0.974186,0.961841,0.97173,0.005285,5


In [None]:
gb = GradientBoostingClassifier()
params = {
    'n_estimators':[10,50,100,200],
    'max_depth':[20,50,100,None]
}

gs = GridSearchCV(gb,params,n_jobs=-1)
gs_model = gs.fit(X_train_count,y_train)
pd.DataFrame(gs_model.cv_results_).sort_values('mean_test_score',ascending=False)[0:5]

### Using TFIDF Vectorizer data

In [None]:
rf = RandomForestClassifier()
params = {
    'n_estimators':[10,50,100,200],
    'max_depth':[20,50,100,None]
}

gs = GridSearchCV(rf,params,n_jobs=-1)
gs_model = gs.fit(X_train_tfidf,y_train)
pd.DataFrame(gs_model.cv_results_).sort_values('mean_test_score',ascending=False)[0:5]

In [None]:
gb = GradientBoostingClassifier()
params = {
    'n_estimators':[10,50,100,200],
    'max_depth':[20,50,100,None]
}

gs = GridSearchCV(gb,params,n_jobs=-1)
gs_model = gs.fit(X_train_tfidf,y_train)
pd.DataFrame(gs_model.cv_results_).sort_values('mean_test_score',ascending=False)[0:5]