In [103]:
import pandas as pd
import numpy as np

# Предобработка

In [104]:
df = pd.read_csv('../data/spam.csv', encoding = 'latin-1')
df.head(10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [105]:
df.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [106]:
df.drop(columns = ['Unnamed: 2', 'Unnamed: 3','Unnamed: 4'], inplace = True)

In [107]:
df.head(10)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [108]:
df.rename(columns = {'v1': 'status', 'v2': 'raw_text'}, inplace = True)
df['status'] = df['status'].map({'ham': 1, 'spam': 0})

In [109]:
import re

In [110]:
df['processed_text'] = df['raw_text'].apply(
    lambda x: re.sub('[^a-zA-Z]', ' ', x).lower()   
)

In [111]:
df.head(20)

Unnamed: 0,status,raw_text,processed_text
0,1,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only ...
1,1,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,0,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup fina...
3,1,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,1,"Nah I don't think he goes to usf, he lives aro...",nah i don t think he goes to usf he lives aro...
5,0,FreeMsg Hey there darling it's been 3 week's n...,freemsg hey there darling it s been week s n...
6,1,Even my brother is not like to speak with me. ...,even my brother is not like to speak with me ...
7,1,As per your request 'Melle Melle (Oru Minnamin...,as per your request melle melle oru minnamin...
8,0,WINNER!! As a valued network customer you have...,winner as a valued network customer you have...
9,0,Had your mobile 11 months or more? U R entitle...,had your mobile months or more u r entitle...


# Токенезация

In [112]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [113]:
df['tockens'] = df['processed_text'].apply(
    lambda x: [token for token in nlp(x)]
)

In [114]:
df.head()

Unnamed: 0,status,raw_text,processed_text,tockens
0,1,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only ...,"[go, until, jurong, point, , crazy, , avail..."
1,1,Ok lar... Joking wif u oni...,ok lar joking wif u oni,"[ok, lar, , joking, wif, u, oni, ]"
2,0,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup fina...,"[free, entry, in, , a, wkly, comp, to, win, ..."
3,1,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, , u, c, alrea..."
4,1,"Nah I don't think he goes to usf, he lives aro...",nah i don t think he goes to usf he lives aro...,"[nah, i, don, t, think, he, goes, to, usf, , ..."


# Удаление стоп-слов

In [115]:
stopwords = nlp.Defaults.stop_words

In [116]:
df['clear_tockenlist'] = df['tockens'].apply(
    lambda x: [token for token in x if token.lower_ not in stopwords]
)

In [117]:
df

Unnamed: 0,status,raw_text,processed_text,tockens,clear_tockenlist
0,1,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only ...,"[go, until, jurong, point, , crazy, , avail...","[jurong, point, , crazy, , available, bugis..."
1,1,Ok lar... Joking wif u oni...,ok lar joking wif u oni,"[ok, lar, , joking, wif, u, oni, ]","[ok, lar, , joking, wif, u, oni, ]"
2,0,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup fina...,"[free, entry, in, , a, wkly, comp, to, win, ...","[free, entry, , wkly, comp, win, fa, cup, fi..."
3,1,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, , u, c, alrea...","[u, dun, early, hor, , u, c, ]"
4,1,"Nah I don't think he goes to usf, he lives aro...",nah i don t think he goes to usf he lives aro...,"[nah, i, don, t, think, he, goes, to, usf, , ...","[nah, don, t, think, goes, usf, , lives]"
...,...,...,...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...,this is the nd time we have tried contact u...,"[this, is, the, , nd, time, we, have, tried, ...","[ , nd, time, tried, , contact, u, , u, won..."
5568,1,Will Ì_ b going to esplanade fr home?,will b going to esplanade fr home,"[will, , b, going, to, esplanade, fr, home]","[ , b, going, esplanade, fr, home]"
5569,1,"Pity, * was in mood for that. So...any other s...",pity was in mood for that so any other s...,"[pity, , was, in, mood, for, that, , so, ...","[pity, , mood, , , suggestions]"
5570,1,The guy did some bitching but I acted like i'd...,the guy did some bitching but i acted like i d...,"[the, guy, did, some, bitching, but, i, acted,...","[guy, bitching, acted, like, d, interested, bu..."


# Лемматизация

In [118]:
df['normalized_tockenlist'] = df['clear_tockenlist'].apply(
    lambda x: [token.lemma_ for token in x]
)

In [119]:
df

Unnamed: 0,status,raw_text,processed_text,tockens,clear_tockenlist,normalized_tockenlist
0,1,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only ...,"[go, until, jurong, point, , crazy, , avail...","[jurong, point, , crazy, , available, bugis...","[jurong, point, , crazy, , available, bugis..."
1,1,Ok lar... Joking wif u oni...,ok lar joking wif u oni,"[ok, lar, , joking, wif, u, oni, ]","[ok, lar, , joking, wif, u, oni, ]","[ok, lar, , joke, wif, u, oni, ]"
2,0,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup fina...,"[free, entry, in, , a, wkly, comp, to, win, ...","[free, entry, , wkly, comp, win, fa, cup, fi...","[free, entry, , wkly, comp, win, fa, cup, fi..."
3,1,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, , u, c, alrea...","[u, dun, early, hor, , u, c, ]","[u, dun, early, hor, , u, c, ]"
4,1,"Nah I don't think he goes to usf, he lives aro...",nah i don t think he goes to usf he lives aro...,"[nah, i, don, t, think, he, goes, to, usf, , ...","[nah, don, t, think, goes, usf, , lives]","[nah, don, t, think, go, usf, , live]"
...,...,...,...,...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...,this is the nd time we have tried contact u...,"[this, is, the, , nd, time, we, have, tried, ...","[ , nd, time, tried, , contact, u, , u, won...","[ , nd, time, try, , contact, u, , u, win, ..."
5568,1,Will Ì_ b going to esplanade fr home?,will b going to esplanade fr home,"[will, , b, going, to, esplanade, fr, home]","[ , b, going, esplanade, fr, home]","[ , b, go, esplanade, fr, home]"
5569,1,"Pity, * was in mood for that. So...any other s...",pity was in mood for that so any other s...,"[pity, , was, in, mood, for, that, , so, ...","[pity, , mood, , , suggestions]","[pity, , mood, , , suggestion]"
5570,1,The guy did some bitching but I acted like i'd...,the guy did some bitching but i acted like i d...,"[the, guy, did, some, bitching, but, i, acted,...","[guy, bitching, acted, like, d, interested, bu...","[guy, bitching, act, like, d, interested, buy,..."


# Векторизация

In [120]:
corpus = []
for i in df['normalized_tockenlist']:
    text = ' '.join([str(row) for row in i])
    corpus.append(text)

In [121]:
print(corpus[:10])

['jurong point   crazy    available bugis n great world la e buffet     cine get amore wat   ', 'ok lar     joke wif u oni   ', 'free entry    wkly comp win fa cup final tkts    st        text fa        receive entry question std txt rate t c s apply                s', 'u dun early hor     u c   ', 'nah don t think go usf   live', 'freemsg hey darle s    week s word   d like fun   tb ok   xxx std chgs send          rcv', 'brother like speak   treat like aids patent', 'request   melle melle   oru minnaminunginte nurungu vettam    set callertune caller   press     copy friend callertune', 'winner    value network customer select receivea        prize reward   claim               claim code kl      valid     hour', 'mobile     month   u r entitle update late colour mobile camera free   mobile update co free            ']


# Bag of words

In [122]:
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(ngram_range=(1,4))

In [123]:
bow = count_vectorizer.fit_transform(corpus)
bow.shape

(5572, 79901)

In [124]:
bow.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

# TD-IDF

In [125]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [126]:
model = TfidfVectorizer(ngram_range=(1,4))
tfidf = model.fit_transform(corpus)
tfidf.shape

(5572, 79901)

In [127]:
tfidf.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

# Понижение размерности

In [128]:
y = df['status']

In [129]:
from sklearn.feature_selection import SelectKBest
skb_bow = SelectKBest(k=60)
X_bow = skb_bow.fit_transform(bow, y)
X_bow.shape

(5572, 60)

In [130]:
skb_td = SelectKBest(k=60)
X_td = skb_td.fit_transform(tfidf, y)
X_td.shape

(5572, 60)

In [131]:
#from sklearn.decomposition import PCA

#pca_bow = PCA(n_components=60)
#X_bow = pca_bow.fit_transform(bow, y)

#pca_td = PCA(n_components=60)
#X_td = pca_td.fit_transform(tfidf, y)

# Train/test split

In [132]:
from sklearn.model_selection import train_test_split

In [133]:
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(X_bow,y, test_size=0.2)
X_train_td, X_test_td, y_train_td, y_test_td = train_test_split(X_td, y, test_size=0.2)

# Обучение моделей

In [134]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report

In [135]:
classifiers = {
    'KNN': KNeighborsClassifier(n_neighbors=4),
    'SVC' : SVC(),
    'Bagging': BaggingClassifier()
}

def opt_estimator(X_train, X_test, y_train, y_test):
    for index, (name, classifier) in enumerate(classifiers.items()):
        classifier.fit(X_train.toarray(), np.ravel(y_train))
        y_pred = classifier.predict(X_test.toarray())
        current_ac = accuracy_score(y_test, y_pred)
        print(classification_report(y_test_bow, y_pred), '\n\n')

## Bag of words

In [136]:
opt_estimator(X_train_bow, X_test_bow, y_train_bow, y_test_bow)

              precision    recall  f1-score   support

           0       0.90      0.79      0.84       145
           1       0.97      0.99      0.98       970

    accuracy                           0.96      1115
   macro avg       0.93      0.89      0.91      1115
weighted avg       0.96      0.96      0.96      1115
 


              precision    recall  f1-score   support

           0       0.88      0.88      0.88       145
           1       0.98      0.98      0.98       970

    accuracy                           0.97      1115
   macro avg       0.93      0.93      0.93      1115
weighted avg       0.97      0.97      0.97      1115
 


              precision    recall  f1-score   support

           0       0.92      0.81      0.86       145
           1       0.97      0.99      0.98       970

    accuracy                           0.97      1115
   macro avg       0.95      0.90      0.92      1115
weighted avg       0.97      0.97      0.97      1115
 




## TD-IDF

In [137]:
opt_estimator(X_train_td, X_test_td, y_train_td, y_test_td)

              precision    recall  f1-score   support

           0       0.14      0.14      0.14       145
           1       0.87      0.87      0.87       970

    accuracy                           0.77      1115
   macro avg       0.51      0.51      0.51      1115
weighted avg       0.78      0.77      0.78      1115
 


              precision    recall  f1-score   support

           0       0.13      0.13      0.13       145
           1       0.87      0.87      0.87       970

    accuracy                           0.77      1115
   macro avg       0.50      0.50      0.50      1115
weighted avg       0.77      0.77      0.77      1115
 


              precision    recall  f1-score   support

           0       0.13      0.14      0.13       145
           1       0.87      0.86      0.87       970

    accuracy                           0.77      1115
   macro avg       0.50      0.50      0.50      1115
weighted avg       0.77      0.77      0.77      1115
 




# Тематическое моделирование

In [138]:
from sklearn.decomposition import LatentDirichletAllocation

# Bag of words

In [139]:
lda_bow = LatentDirichletAllocation(n_components=5, max_iter=20,
                                learning_method='online',
                                learning_offset=50)

In [140]:
lda_top_bow = lda_bow.fit_transform(bow)

In [147]:
for i,topic in enumerate(lda_top_bow[0]):
  print("Тема ",i,": ",topic*100,"%")

Тема  0 :  0.4279582285280306 %
Тема  1 :  0.427667384837064 %
Тема  2 :  0.42969024484301777 %
Тема  3 :  0.4322933225128951 %
Тема  4 :  98.282390819279 %


In [148]:
for i, comp in enumerate(lda_bow.components_):
    vocab_comp = zip(count_vectorizer.get_feature_names_out(), comp)
    sorted_words = sorted(vocab_comp, key= lambda x:x[1], reverse=True)[:10]
    print("Содержание темы "+str(i)+": ")
    for t in sorted_words:
        print(t[0],end=" ")
    print("\n")

Содержание темы 0: 
free text txt stop mobile win reply ur new www 

Содержание темы 1: 
gt lt lt gt want like get time send tell don 

Содержание темы 2: 
love come day good not miss like sorry get need 

Содержание темы 3: 
go ok lor home time ur not wat da get 

Содержание темы 4: 
know let send yeah not ask number ur get text 



# TF-IDF

In [143]:
lda_tfidf = LatentDirichletAllocation(n_components=5, max_iter=20,
                                learning_method='online',
                                learning_offset=50)

In [144]:
lda_top_tfidf = lda_tfidf.fit_transform(tfidf)

In [149]:
for i,topic in enumerate(lda_top_tfidf[5000]):
  print("Тема ",i,": ",topic*100,"%")

Тема  0 :  4.274925682548418 %
Тема  1 :  82.8632108637139 %
Тема  2 :  4.297487048821645 %
Тема  3 :  4.30727808983062 %
Тема  4 :  4.257098315085401 %


In [150]:
for i, comp in enumerate(lda_tfidf.components_):
    vocab_comp = zip(model.get_feature_names_out(), comp)
    sorted_words = sorted(vocab_comp, key= lambda x:x[1], reverse=True)[:10]
    print("Содержание темы "+str(i)+": ")
    for t in sorted_words:
        print(t[0],end=" ")
    print("\n")

Содержание темы 0: 
lt gt lt gt free send message min mobile text phone 

Содержание темы 1: 
thank wait special great sir day work lol sent nope 

Содержание темы 2: 
ok ll lor later sorry come home go wat get 

Содержание темы 3: 
come time know not day dear ur love think good 

Содержание темы 4: 
check mean door fine get library story simple person story answer 

