In [119]:
import pandas as pd

# Чтение данных
sms = pd.read_csv('data/SMS.tsv', delimiter='\t')
sms

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [120]:
# there was a problem with nltk
#
# import nltk
# import ssl
#
# try:
#     _create_unverified_https_context = ssl._create_unverified_context
# except AttributeError:
#     pass
# else:
#     ssl._create_default_https_context = _create_unverified_https_context
#
# nltk.download()

In [121]:
import re
from nltk.corpus import stopwords

# import nltk
# nltk.download('stopwords')

# Предобработка данных
good_chars = set([' '] + list(map(chr, range(ord('a'), ord('z') + 1))))
stopwords = stopwords.words("english")
print("stop words len = ", len(stopwords))
print("first 10 stop words: ", stopwords[:10], end=', ')

sms.text = sms.text.apply(lambda row: row.lower())
sms.text = sms.text.apply(lambda row: re.sub("\s+", " ", row).strip())
sms.text = sms.text.apply(lambda row: ''.join(ch for ch in row if ch in good_chars))
sms.text = sms.text.apply(lambda row: ' '.join(filter(lambda word: word not in stopwords, row.split(" "))))
sms

stop words len =  179
first 10 stop words:  ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"], 

Unnamed: 0,class,text
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry wkly comp win fa cup final tkts st...
3,ham,u dun say early hor u c already say
4,ham,nah dont think goes usf lives around though
...,...,...
5567,spam,nd time tried contact u u pound prize claim...
5568,ham,b going esplanade fr home
5569,ham,pity mood soany suggestions
5570,ham,guy bitching acted like id interested buying s...


In [122]:
texts = sms['text']
classes = sms['class'].apply(lambda c: 1 if c == 'spam' else 0)

In [123]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=1200)
X = tfidf.fit_transform(texts)
X = pd.DataFrame(data=X.toarray(), columns=tfidf.get_feature_names_out())
X.head()

Unnamed: 0,abiola,able,abt,accept,access,account,across,actually,add,address,...,yet,yo,yoga,youd,youll,youre,youve,yr,yrs,yup
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [163]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, classes, test_size=0.2)
feat_names = X_train.columns

In [125]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(X_train, Y_train)
feat_importances = rfc.feature_importances_
feat_importances_sorted = sorted(zip(feat_names, feat_importances), key=lambda x: x[1], reverse=True)
top_feats_rfc = [x[0] for x in feat_importances_sorted[:30]]

In [134]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
rfe = RFE(logreg, n_features_to_select=30)
rfe.fit(X_train, Y_train)
top_feats_rfe = pd.Series(dict(zip(feat_names, rfe.support_)))
top_feats_rfe = list(top_feats_rfe[top_feats_rfe == True].index)

In [127]:
correlations = X_train.corrwith(Y_train)
top_features = correlations.abs().sort_values(ascending=False)[:30].index
top_feats_corr = list(X_train[top_features].columns)

In [155]:
pd.DataFrame(data={
    "Random Forest Classifier": top_feats_rfc,
    "SVM-RFE": top_feats_rfe,
    "Pearson correlation": top_feats_corr,
})

Unnamed: 0,Random Forest Classifier,SVM-RFE,Pearson correlation
0,call,apply,txt
1,txt,awarded,claim
2,free,box,free
3,claim,call,mobile
4,mobile,camera,prize
5,stop,cash,call
6,prize,chat,urgent
7,reply,claim,guaranteed
8,win,code,stop
9,text,contact,reply


In [158]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif

selector_chi2 = SelectKBest(score_func=chi2, k=30)
selector_chi2.fit(X_train, Y_train)
feat_cols_chi2 = selector_chi2.get_support(indices=True)
top_feats_chi2 = [feat_names[i] for i in feat_cols_chi2[:30]]

selector_mic = SelectKBest(score_func=mutual_info_classif, k=30)
selector_mic.fit(X_train, Y_train)
feat_cols_mic = selector_mic.get_support(indices=True)
top_feats_mic = [feat_names[i] for i in feat_cols_mic[:30]]

lr = LogisticRegression(solver='liblinear')
lr.fit(X_train, Y_train)
top_feats_lr = list(pd.Series(lr.coef_[0], feat_names).sort_values(ascending=False)[:30].index)

In [159]:
pd.DataFrame(data={
    "my: Random Forest Classifier": top_feats_rfc,
    "my: SVM-RFE": top_feats_rfe,
    "my: Pearson correlation": top_feats_corr,
    "lib: SelectKBest with chi2": top_feats_chi2,
    "lib: SelectKBest with mutual_info_classif": top_feats_mic,
    "lib: Logistic Regression": top_feats_lr,
})

Unnamed: 0,my: Random Forest Classifier,my: SVM-RFE,my: Pearson correlation,lib: SelectKBest with chi2,lib: SelectKBest with mutual_info_classif,lib: Logistic Regression
0,call,apply,txt,awarded,box,txt
1,txt,awarded,claim,box,call,text
2,free,box,free,call,cash,claim
3,claim,call,mobile,camera,claim,free
4,mobile,camera,prize,cash,contact,mobile
5,stop,cash,call,claim,draw,call
6,prize,chat,urgent,code,free,stop
7,reply,claim,guaranteed,collection,land,reply
8,win,code,stop,contact,landline,prize
9,text,contact,reply,draw,mobile,win


In [166]:
import numpy as np

X_train_my1 = pd.DataFrame(X_train, columns=np.array(top_feats_rfc))
X_test_my1 = pd.DataFrame(X_test, columns=np.array(top_feats_rfc))

X_train_my2 = pd.DataFrame(X_train, columns=np.array(top_feats_rfe))
X_test_my2 = pd.DataFrame(X_test, columns=np.array(top_feats_rfe))

X_train_my3 = pd.DataFrame(X_train, columns=np.array(top_feats_corr))
X_test_my3 = pd.DataFrame(X_test, columns=np.array(top_feats_corr))

X_train_lib1 = pd.DataFrame(X_train, columns=np.array(top_feats_chi2))
X_test_lib1 = pd.DataFrame(X_test, columns=np.array(top_feats_chi2))

X_train_lib2 = pd.DataFrame(X_train, columns=np.array(top_feats_mic))
X_test_lib2 = pd.DataFrame(X_test, columns=np.array(top_feats_mic))

X_train_lib3 = pd.DataFrame(X_train, columns=np.array(top_feats_lr))
X_test_lib3 = pd.DataFrame(X_test, columns=np.array(top_feats_lr))

In [183]:
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


def compare(model, model_name):
    print(model_name, "accuracies:")

    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)
    print("all features:")
    print(accuracy_score(Y_test, y_pred))

    model.fit(X_train_my1, Y_train)
    y_pred = model.predict(X_test_my1)
    print("my: Random Forest Classifier")
    print(accuracy_score(Y_test, y_pred))

    model.fit(X_train_my2, Y_train)
    y_pred = model.predict(X_test_my2)
    print("my: SVM-RFE")
    print(accuracy_score(Y_test, y_pred))

    model.fit(X_train_my3, Y_train)
    y_pred = model.predict(X_test_my3)
    print("my: Pearson correlation")
    print(accuracy_score(Y_test, y_pred))

    model.fit(X_train_lib1, Y_train)
    y_pred = model.predict(X_test_lib1)
    print("lib: SelectKBest with chi2")
    print(accuracy_score(Y_test, y_pred))

    model.fit(X_train_lib2, Y_train)
    y_pred = model.predict(X_test_lib2)
    print("lib: SelectKBest with mutual_info_classif")
    print(accuracy_score(Y_test, y_pred))

    model.fit(X_train_lib3, Y_train)
    y_pred = model.predict(X_test_lib3)
    print("lib: Logistic Regression")
    print(accuracy_score(Y_test, y_pred))

In [184]:
compare(DecisionTreeClassifier(), "Decision Tree")

Decision Tree accuracies:
all features:
0.9659192825112107
my: Random Forest Classifier
0.9497757847533632
my: SVM-RFE
0.9497757847533632
my: Pearson correlation
0.9452914798206278
lib: SelectKBest with chi2
0.9452914798206278
lib: SelectKBest with mutual_info_classif
0.9452914798206278
lib: Logistic Regression
0.9515695067264573


In [185]:
compare(RandomForestClassifier(), "Random Forest")

Random Forest accuracies:
all features:
0.9748878923766816
my: Random Forest Classifier
0.9632286995515695
my: SVM-RFE
0.9596412556053812
my: Pearson correlation
0.9506726457399103
lib: SelectKBest with chi2
0.9542600896860987
lib: SelectKBest with mutual_info_classif
0.947085201793722
lib: Logistic Regression
0.9632286995515695


In [186]:
compare(SVC(), "SVC")

SVC accuracies:
all features:
0.9802690582959641
my: Random Forest Classifier
0.95695067264574
my: SVM-RFE
0.9641255605381166
my: Pearson correlation
0.9524663677130045
lib: SelectKBest with chi2
0.9551569506726457
lib: SelectKBest with mutual_info_classif
0.9443946188340807
lib: Logistic Regression
0.9623318385650225
