<h2>Загрузка и первичная предобработка данных</h2>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('train_spam.csv')
df.head(20)

Unnamed: 0,text_type,text
0,ham,make sure alex knows his birthday is over in f...
1,ham,a resume for john lavorato thanks vince i will...
2,spam,plzz visit my website moviesgodml to get all m...
3,spam,urgent your mobile number has been awarded wit...
4,ham,overview of hr associates analyst project per ...
5,spam,i am so greatful to mrs valerie dodson for hel...
6,ham,url url date not supplied government employees...
7,spam,welcome to jefferson pilot s gateway to wealth...
8,ham,looks like your ham corpus by and large has to...
9,ham,how are you with moneyas in to youmoney aint a...


In [3]:
df.tail(20)

Unnamed: 0,text_type,text
16258,spam,urgent your 4 costa del sol holiday or £5000 a...
16259,ham,preface for book julie the introduction looks ...
16260,ham,download progress indicator i guess
16261,spam,𝑰 𝒘𝒂𝒔 𝒅𝒐𝒘𝒏 𝒂𝒏𝒅 𝒖𝒏𝒅𝒆𝒓 𝒂𝒕 𝒂 𝒑𝒐𝒊𝒏𝒕 𝒖𝒏𝒕𝒊𝒍 𝑰 𝒘𝒂𝒔 𝒊𝒏...
16262,ham,california update 5 4 01 if you have any quest...
16263,ham,just checked out heading out to drop off my st...
16264,spam,i just created my youtube channel 3 days ago t...
16265,ham,i am sorry
16266,ham,crypto nympho is a asshole his video on youtub...
16267,spam,your e mail to anvasetc 1111 groups msn com ca...


In [4]:
print(f"sample size: {df.shape[0]}")
print()
print(f"sample size for each class: \n{df['text_type'].value_counts()}")

sample size: 16278

sample size for each class: 
ham     11469
spam     4809
Name: text_type, dtype: int64


In [5]:
df_spam = df[df['text_type'].values == 'spam']
text_values = df_spam.loc[:, 'text'].values[:5]

for value in text_values:
    print(f"{value}\n")

plzz visit my website moviesgodml to get all movies for free and also i provide direct download links no redirect and ads😊😊😊😊😁

urgent your mobile number has been awarded with a £2000 prize guaranteed call 09061790121 from land line claim 3030 valid 12hrs only 150ppm

i am so greatful to mrs valerie dodson for helping change my life she gives free bitcoins to newly subscribed members and her trading signals are accurate just click on the link below

welcome to jefferson pilot s gateway to wealth as one of the industry s premier producers you must continually seek qualified cutting edge services financial profiles concept profiles professional is a personal and business analysis tool that analyzes a client s insurance investment and financial planning goals to help them see their situation today compared to their objectives market profiles professional is an ideal tool for true financial planning it not only provides a thorough analysis including asset allocation but it can calculate ta

In [6]:
df.isnull().values.any()

False

In [7]:
duplicateRows = df[df.duplicated ()]

duplicateRows

Unnamed: 0,text_type,text
4145,ham,SPAM ALERT 🚔 User: Username: @DillyBubbl...
4463,ham,SPAM ALERT 🚔 User: Username: @DillyBubbl...
5640,ham,/ban
6933,ham,SPAM ALERT 🚔 User: Username: @DillyBubbl...
7663,ham,SPAM ALERT 🚔 User: Username: @DillyBubbl...
7918,ham,SPAM ALERT 🚔 User: Username: @DillyBubbl...
9790,ham,SPAM ALERT 🚔 User: Username: @DillyBubbl...
10290,ham,SPAM DETECTION User: Username: @DillyBubb...
10811,ham,SPAM DETECTION User: Username: @DillyBubb...
13210,ham,SPAM DETECTION User: Username: @DillyBubb...


In [8]:
df2 = df.drop_duplicates().reset_index()
del df2['index']
print(df2.head())
df2['text_type'] = df2['text_type'].where(df2.loc[:, 'text_type'].values == "spam", 0)
df2['text_type'] = df2['text_type'].where(df2.loc[:, 'text_type'].values != "spam", 1)
df2.head()

  text_type                                               text
0       ham  make sure alex knows his birthday is over in f...
1       ham  a resume for john lavorato thanks vince i will...
2      spam  plzz visit my website moviesgodml to get all m...
3      spam  urgent your mobile number has been awarded wit...
4       ham  overview of hr associates analyst project per ...


Unnamed: 0,text_type,text
0,0,make sure alex knows his birthday is over in f...
1,0,a resume for john lavorato thanks vince i will...
2,1,plzz visit my website moviesgodml to get all m...
3,1,urgent your mobile number has been awarded wit...
4,0,overview of hr associates analyst project per ...


<h2>Предобработка данных</h2>

In [9]:
from sklearn.model_selection import train_test_split
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import word_tokenize

In [10]:
def df_preprocess(text):  
    regrex_pattern = re.compile('[^a-zA-Z0-9 ]')
    
    # Mathematical Alphanumeric Symbols
    regrex_pattern_alphanumeric = re.compile(pattern = "[" u"\U0001D400-\U0001D7FF" "]+", flags = re.UNICODE)
    
    regrex_pattern_emoticons = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)

    text = text.lower()
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+)|(https[^\s]+))|(http[^\s]+)|(www[^\s]+)', 'url', text)
    text = regrex_pattern_alphanumeric.sub('alphanumeric', text)
    text = regrex_pattern_emoticons.sub(' emoticon ', text)
    text = re.sub('@[^\s]+', 'user', text)
    text = regrex_pattern.sub(' ', text)
    text = ' '.join(text.split())

    lemmatizer = WordNetLemmatizer()
    text = [lemmatizer.lemmatize(word) for word in text.split()]
    stop_words = set(stopwords.words("english"))
    text = ' '.join(word for word in text if not word in stop_words)

    return text

In [11]:
df2['text'] = df2['text'].apply(df_preprocess)

In [12]:
df2.head()

Unnamed: 0,text_type,text
0,0,make sure alex know birthday fifteen minute fa...
1,0,resume john lavorato thanks vince get moving r...
2,1,plzz visit website moviesgodml get movie free ...
3,1,urgent mobile number ha awarded 2000 prize gua...
4,0,overview hr associate analyst project per davi...


In [13]:
df2[df2['text'] == '']

Unnamed: 0,text_type,text
3235,0,
3300,0,
3546,0,
5572,0,
6353,0,
6885,0,
7437,0,
9311,1,
9763,0,
10729,0,


In [14]:
print(df.iloc[3300])

text_type                                                  ham
text         কলার চিন্তা পরে ফোন নিয়ে প্যারায় আছি সবাই মন...
Name: 3300, dtype: object


In [15]:
df3 = df2[df2['text'] != '']
print(f'df2 shape: {df2.shape}')
print(f'df3 shape: {df3.shape}')

df2 shape: (16267, 2)
df3 shape: (16250, 2)


<h2>"Векторизация"</h2>

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import  TfidfVectorizer
import gensim.downloader as api

In [17]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import ComplementNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [18]:
y, X = list(zip(*df3.to_numpy()))
X = np.array(X)
y = np.array(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=21)
print(X_train[:10])

['sorry ill call later' 'breaking 95000 holder' 'filter'
 'okie wan meet bishan co bishan im driving today'
 'santa calling would little one like call santa xmas eve call 09077818151 book time calls150ppm last 3mins 30 tc url'
 'ace boogie welcome emoticon girl boy group emoticon group emoticon unlimited emoticon user user user user emoticon dating group emoticon user user user user user user user user ace boogie download 1million video emoticon new video desi indian video'
 'guzzle like fountain spur rock customer speaks girlfriend really enjoying making homemade erotic film get pretending like porn star even though ever two u see one thing wa really missing movie wa money shot frank wa lucky money shot wa worth dollar ordered spur home movie end gigantic cum shot would make even veteran porn star jealous thanks spur helping spice sex life anthony ky spur really work ha improved sperm motility morphology point girlfriend pregnant fertility blend really doe help improve male fertility 

In [64]:
#Векторизация с помощью: Bag of words
vectorizer_BOW = CountVectorizer(ngram_range=(1, 3))

X_train_BOW = vectorizer_BOW.fit_transform(X_train)
X_test_BOW = vectorizer_BOW.transform(X_test)

In [66]:
print(X_train_BOW.shape, X_test_BOW.shape)
X_train_BOW[200]

(11375, 544343) (4875, 544343)


<1x544343 sparse matrix of type '<class 'numpy.int64'>'
	with 203 stored elements in Compressed Sparse Row format>

In [65]:
#Векторизация с помощью: TF-IDF
vectorizer_TFIDF = TfidfVectorizer(ngram_range=(1, 3))

X_train_TFIDF = vectorizer_TFIDF.fit_transform(X_train)
X_test_TFIDF = vectorizer_TFIDF.transform(X_test)

In [22]:
print(X_train_TFIDF.shape, X_test_TFIDF.shape)

(11375, 544343) (4875, 544343)


In [23]:
X_train_TFIDF[200]

<1x544343 sparse matrix of type '<class 'numpy.float64'>'
	with 203 stored elements in Compressed Sparse Row format>

In [24]:
#Векторизация с помощью: GloVe
glove_model = api.load("glove-twitter-100")  
glove_model.most_similar("cat")

[('dog', 0.875208854675293),
 ('kitty', 0.8015091419219971),
 ('pet', 0.7986468076705933),
 ('cats', 0.797942578792572),
 ('kitten', 0.7936834096908569),
 ('puppy', 0.7702749967575073),
 ('monkey', 0.758426308631897),
 ('bear', 0.7507943511009216),
 ('dogs', 0.7460062503814697),
 ('pig', 0.7117346525192261)]

In [25]:
def replace_digital(text):
    text = re.sub('[0-9]+', 'number', text)
    return text

In [68]:
X_train_tokens = [replace_digital(row).split() for row in X_train]
X_train_glove = []

word_unknown = glove_model['unknown']
for row in X_train_tokens:
    new_row = list(map(lambda x: glove_model[x] if x in glove_model else word_unknown, row))
    X_train_glove.append(np.concatenate(new_row))


max_len = 0
for row in X_train_glove:
    if len(row) >= max_len:
        max_len = len(row)

len_x_train = len(X_train_glove)
for i in range(len_x_train):
    if len(X_train_glove[i]) < max_len:
        buf = np.array([0] * (max_len - len(X_train_glove[i])))
        X_train_glove[i] = np.concatenate([X_train_glove[i], buf])

X_train_glove = np.array(X_train_glove)

X_test_tokens = [replace_digital(row).split() for row in X_test]
X_test_glove = []

for row in X_test_tokens:
    new_row = list(map(lambda x: glove_model[x] if x in glove_model else word_unknown, row))
    X_test_glove.append(np.concatenate(new_row))

len_x_train = len(X_test_glove)
for i in range(len_x_train):
    if len(X_test_glove[i]) < max_len:
        buf = np.array([0] * (max_len - len(X_test_glove[i])))
        X_test_glove[i] = np.concatenate([X_test_glove[i], buf])

X_test_glove = np.array(X_test_glove)

  X_test_glove = np.array(X_test_glove)


<h2>Обучение классификаторов</h2>

In [27]:
knn = KNeighborsClassifier(n_neighbors = 10, metric = 'cosine')
knn.fit(X_train_BOW, y_train)

knn_TFIDF = KNeighborsClassifier(n_neighbors = 10, metric = 'cosine')
knn_TFIDF.fit(X_train_TFIDF, y_train)

knn_glove = KNeighborsClassifier(n_neighbors = 10, metric = 'cosine')
knn_glove.fit(X_train_glove, y_train)

KNeighborsClassifier(metric='cosine', n_neighbors=10)

In [28]:
log_regr = LogisticRegression(class_weight = 'balanced', max_iter = 200)
log_regr.fit(X_train_BOW, y_train)

log_regr_TFIDF = LogisticRegression(class_weight = 'balanced', max_iter = 200)
log_regr_TFIDF.fit(X_train_TFIDF, y_train)

log_regr_glove = LogisticRegression(class_weight = 'balanced', max_iter = 400)
log_regr_glove.fit(X_train_glove, y_train)

LogisticRegression(class_weight='balanced', max_iter=400)

In [29]:
svc = SVC(class_weight = 'balanced')
svc.fit(X_train_BOW, y_train)

svc_TFIDF = SVC(class_weight = 'balanced')
svc_TFIDF.fit(X_train_TFIDF, y_train)

svc_glove = SVC(class_weight = 'balanced', max_iter = 100)
svc_glove.fit(X_train_glove, y_train)



SVC(class_weight='balanced', max_iter=100)

In [32]:
nb = ComplementNB()
nb.fit(X_train_BOW, y_train)

nb_TFIDF = ComplementNB()
nb_TFIDF.fit(X_train_TFIDF, y_train)

ComplementNB()

In [33]:
rand_forest = RandomForestClassifier()
rand_forest.fit(X_train_BOW, y_train)

rand_forest_TFIDF = RandomForestClassifier()
rand_forest_TFIDF.fit(X_train_TFIDF, y_train)

rand_forest_glove = RandomForestClassifier()
rand_forest_glove.fit(X_train_glove, y_train)

RandomForestClassifier()

In [34]:
xgb = XGBClassifier()
xgb.fit(X_train_BOW, y_train)

xgb_TFIDF = XGBClassifier()
xgb_TFIDF.fit(X_train_TFIDF, y_train)

xgb_glove = XGBClassifier()
xgb_glove.fit(X_train_glove, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)

In [None]:
# catboost = CatBoostClassifier(
#     iterations = 1000,
#     learning_rate = 0.1,
#     custom_loss = ['AUC', 'Accuracy'])

# catboost.fit(X_train_BOW, y_train,
#         eval_set = (X_test_BOW, y_test),
#         verbose = 200)

0:	learn: 0.6488584	test: 0.6489202	best: 0.6489202 (0)	total: 1.94s	remaining: 32m 23s
200:	learn: 0.1862464	test: 0.2279813	best: 0.2279813 (200)	total: 5m 50s	remaining: 23m 13s
400:	learn: 0.1452739	test: 0.2072736	best: 0.2072736 (400)	total: 11m 36s	remaining: 17m 20s
600:	learn: 0.1215847	test: 0.1976670	best: 0.1976670 (600)	total: 17m 23s	remaining: 11m 33s
800:	learn: 0.1042544	test: 0.1917579	best: 0.1917532 (799)	total: 23m 10s	remaining: 5m 45s
999:	learn: 0.0904869	test: 0.1864115	best: 0.1864022 (997)	total: 28m 52s	remaining: 0us

bestTest = 0.1864021983
bestIteration = 997

Shrink model to first 998 iterations.


<catboost.core.CatBoostClassifier at 0x1de18a8f100>

In [None]:
#catboost.save_model('catboost_BOW')

In [36]:
catboost_BOW = CatBoostClassifier().load_model('catboost_BOW')

In [None]:
# catboost_TFIDF = CatBoostClassifier(
#     iterations = 1000,
#     learning_rate = 0.1,
#     custom_loss = ['AUC', 'Accuracy']
# )

# catboost_TFIDF.fit(X_train_TFIDF, y_train,
#         eval_set = (X_test_TFIDF, y_test),
#         verbose = 200
# )

0:	learn: 0.6268112	test: 0.6212080	best: 0.6212080 (0)	total: 1.82s	remaining: 30m 16s
200:	learn: 0.1683011	test: 0.2229566	best: 0.2228808 (199)	total: 5m 50s	remaining: 23m 13s
400:	learn: 0.1229462	test: 0.2053478	best: 0.2053478 (400)	total: 11m 38s	remaining: 17m 23s
600:	learn: 0.0978043	test: 0.1984136	best: 0.1984136 (600)	total: 17m 24s	remaining: 11m 33s
800:	learn: 0.0799562	test: 0.1946916	best: 0.1946740 (799)	total: 23m 12s	remaining: 5m 45s
999:	learn: 0.0662885	test: 0.1937758	best: 0.1930122 (893)	total: 28m 58s	remaining: 0us

bestTest = 0.1930121513
bestIteration = 893

Shrink model to first 894 iterations.


<catboost.core.CatBoostClassifier at 0x1de26602490>

In [None]:
#catboost_TFIDF.save_model('catboost_TFIDF')

In [37]:
catboost_TFIDF = CatBoostClassifier().load_model('catboost_TFIDF')

<h2>Тестирование</h2>

In [38]:
def h_max(row):
	is_max = row == row.max()
	return ['color: green' if cell else '' for cell in is_max]

def h_min(row):
	is_min = row == row.min()
	return ['color: red' if cell else '' for cell in is_min]

def calc_score(models, models_name, X_train, y_train, X_test, y_test):

    data = dict()
    for i, model in enumerate(models):
        score = []

        y_predict_train = model.predict(X_train)
        y_predict_test = model.predict(X_test)

        score.append(round(accuracy_score(y_predict_train, y_train), 3))
        score.append(round(accuracy_score(y_predict_test, y_test), 3))
        score.append(round(recall_score(y_predict_train, y_train), 3))
        score.append(round(recall_score(y_predict_test, y_test), 3))
        score.append(round(roc_auc_score(y_predict_train, y_train), 3))
        score.append(round(roc_auc_score(y_predict_test, y_test), 3))

        data[models_name[i]] = score
    
    index = ['accuracy_train', 'accuracy_test', 'recall_train', 'recall_test', 'roc_auc_train', 'roc_auc_test']
    df_score = pd.DataFrame(data, index = index)
    return df_score.style.apply(h_max, axis = 1).apply(h_min, axis = 1).format('{:.3f}')

In [39]:
models = [knn, log_regr, svc, nb, rand_forest, xgb, catboost_BOW]
models_name = ['KNN', 'Log_Regr', 'SVM', 'NB', 'Rand_forest', 'xgb', 'catboost']

param = {'models': models,
         'models_name': models_name,
         'X_train': X_train_BOW,
         'y_train': y_train,
         'X_test': X_test_BOW,
         'y_test': y_test}

result = calc_score(**param)
result

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Unnamed: 0,KNN,Log_Regr,SVM,NB,Rand_forest,xgb,catboost
accuracy_train,0.875,0.998,0.973,0.973,0.999,0.959,0.706
accuracy_test,0.865,0.949,0.934,0.917,0.898,0.934,0.697
recall_train,0.94,0.997,0.977,0.916,0.999,0.976,0.219
recall_test,0.917,0.942,0.938,0.829,0.971,0.929,0.25
roc_auc_train,0.9,0.998,0.974,0.958,0.999,0.965,0.463
roc_auc_test,0.884,0.947,0.935,0.895,0.925,0.933,0.473


In [40]:
models_TFIDF = [knn_TFIDF, log_regr_TFIDF, svc_TFIDF, nb_TFIDF, rand_forest_TFIDF, xgb_TFIDF, catboost_TFIDF]
models_name_TFIDF = ['KNN', 'Log_Regr', 'SVM', 'NB', 'Rand_forest', 'xgb', 'catboost']

param_TFIDF = {'models': models_TFIDF,
         'models_name': models_name_TFIDF,
         'X_train': X_train_TFIDF,
         'y_train': y_train,
         'X_test': X_test_TFIDF,
         'y_test': y_test}

result_TFIDF = calc_score(**param_TFIDF)
result_TFIDF

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Unnamed: 0,KNN,Log_Regr,SVM,NB,Rand_forest,xgb,catboost
accuracy_train,0.891,0.984,0.996,0.988,0.999,0.966,0.707
accuracy_test,0.877,0.933,0.945,0.899,0.905,0.92,0.697
recall_train,0.941,0.955,0.988,0.979,0.999,0.982,0.0
recall_test,0.922,0.855,0.913,0.892,0.977,0.921,0.0
roc_auc_train,0.909,0.976,0.994,0.986,0.999,0.971,0.354
roc_auc_test,0.893,0.913,0.936,0.897,0.931,0.92,0.348


In [62]:
#нужно доделать (проблема в размерности X_test_glove)
models_glove = [knn_glove, log_regr_glove, svc_glove, rand_forest_glove, xgb_glove]
models_name_glove = ['KNN', 'Log_Regr', 'SVM', 'Rand_forest', 'xgb']

param_glove = {'models': models_glove,
         'models_name': models_name_glove,
         'X_train': X_train_glove,
         'y_train': y_train,
         'X_test': X_test_glove,
         'y_test': y_test}

result_glove = calc_score(**param_glove)
result_glove

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  return array(a, dtype, copy=False, order=order)


ValueError: Expected 2D array, got 1D array instead:
array=[array([ 0.38609001,  0.36987999, -0.64570999, ...,  0.        ,
         0.        ,  0.        ])
 array([ 0.69809002, -0.4294    ,  0.029514  , ...,  0.        ,
         0.        ,  0.        ])
 array([ 0.55792999,  0.10748   , -0.57490999, ...,  0.        ,
         0.        ,  0.        ])                               ...
 array([ 0.034845,  0.072968, -0.10695 , ...,  0.      ,  0.      ,
         0.      ])
 array([ 0.091552,  0.093336, -0.028113, ...,  0.      ,  0.      ,
         0.      ])
 array([ 0.29028001,  0.13014001, -0.0052254 , ...,  0.        ,
         0.        ,  0.        ])                              ].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

<h2>"Скорринг" тестовых данных</h2>

В качестве классификатора была выбрана логистическая регрессия с векторизацией "Bag of words" по результатам сравнения целевой метрики "ROC AUC"

In [100]:
from sklearn import metrics
import matplotlib.pyplot as plt
import plotly.graph_objs as go

In [101]:
y_pred_proba = log_regr.predict_proba(X_test_BOW)[:,1]
fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

trace = go.Scatter(x=fpr, y=tpr, mode='lines', name='AUC = %0.2f' % auc,
                   line=dict(color='darkorange', width=2))
reference_line = go.Scatter(x=[0,1], y=[0,1], mode='lines', name='Reference Line',
                            line=dict(color='navy', width=2, dash='dash'))
fig = go.Figure(data=[trace, reference_line])
fig.update_layout(title='Interactive ROC Curve',
                  xaxis_title='False Positive Rate',
                  yaxis_title='True Positive Rate')
fig.show()

In [87]:
df_test = pd.read_csv('test_spam.csv')
df_test.head()

Unnamed: 0,text
0,j jim whitehead ejw cse ucsc edu writes j you ...
1,original message from bitbitch magnesium net p...
2,java for managers vince durasoft who just taug...
3,there is a youtuber name saiman says
4,underpriced issue with high return on equity t...


In [88]:
print(f"sample size: {df.shape[0]}")

sample size: 16278


In [85]:
df_test2 = df_test.copy()
df_test2['text'] = df_test2['text'].apply(df_preprocess)
df_test2.head()

Unnamed: 0,text
0,j jim whitehead ejw cse ucsc edu writes j open...
1,original message bitbitch magnesium net people...
2,java manager vince durasoft taught java class ...
3,youtuber name saiman say
4,underpriced issue high return equity oil gas a...


In [86]:
X = df_test2['text'].to_numpy()
print(X[100])

ill let know kick


In [89]:
X_BOW = vectorizer_BOW.transform(X)

In [92]:
y = log_regr.predict(X_BOW)
print(y[:10])

[0 0 0 0 1 0 0 0 0 0]


In [93]:
y = np.where(y == 1, 'spam', 'ham')
df_test['score'] = y
df_test.head()

Unnamed: 0,text,score
0,j jim whitehead ejw cse ucsc edu writes j you ...,ham
1,original message from bitbitch magnesium net p...,ham
2,java for managers vince durasoft who just taug...,ham
3,there is a youtuber name saiman says,ham
4,underpriced issue with high return on equity t...,spam


In [94]:
df_test.to_csv('result_spam.csv')