In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

# Предобработка данных

In [29]:
# File names of the training and test CSVs, resolved relative to the working directory.
train_data_path, test_data_path = 'train_spam.csv', 'test_spam.csv'

# Read both files into DataFrames with pandas' default CSV settings.
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

In [30]:
# Preview the loaded training frame (rich display of the cell's last expression).
train_data

Unnamed: 0,text_type,text
0,ham,make sure alex knows his birthday is over in f...
1,ham,a resume for john lavorato thanks vince i will...
2,spam,plzz visit my website moviesgodml to get all m...
3,spam,urgent your mobile number has been awarded wit...
4,ham,overview of hr associates analyst project per ...
...,...,...
16273,spam,if you are interested in binary options tradin...
16274,spam,dirty pictureblyk on aircel thanks you for bei...
16275,ham,or you could do this g on mon 1635465 sep 1635...
16276,ham,insta reels par 80 गंद bhara pada hai 👀 kuch b...


Преобразуем столбец text_type: значение "spam" заменим на 1, а "ham" — на 0, и создадим из этих значений новый столбец label

In [34]:
# Encode the target as integers: spam -> 1, ham -> 0.
label_codes = {'spam': 1, 'ham': 0}
encoded_labels = train_data['text_type'].map(label_codes)

# Series.map yields NaN for every value that is not a key of the dict.
# Fail here with a clear message instead of letting NaN labels reach the models.
unknown_types = train_data.loc[encoded_labels.isna(), 'text_type'].unique()
if len(unknown_types) > 0:
    raise ValueError(f"Unexpected text_type values: {list(unknown_types)!r}")

# All values are mapped at this point, so the column can be stored as int64.
train_data['label'] = encoded_labels.astype('int64')

Выведем краткую статистику по длинам приведенных текстов

In [35]:
# Character count of every message. The vectorised .str accessor replaces the
# per-row Python call and yields NaN for a missing text instead of raising.
train_data['text_length'] = train_data['text'].str.len()

# Summary statistics (count, mean, std, min, quartiles, max) of the lengths.
text_length_stats = train_data['text_length'].describe()

text_length_stats

count    16278.000000
mean       310.383524
std        287.818154
min          1.000000
25%         60.000000
50%        157.000000
75%        639.000000
max        800.000000
Name: text_length, dtype: float64

Самыми распространенными методами для решения такой задачи классификации текста являются наивный Байес, логистическая регрессия и XGBoost. Применим каждый из них и выберем метод с наибольшим значением ROC-AUC

# Наивный Байес

In [36]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split reproducible. The later model cells reuse these four objects.
X_train, X_val, y_train, y_val = train_test_split(
    train_data['text'],
    train_data['label'],
    test_size=0.2,
    random_state=42,
)

# TF-IDF features (English stop words removed) feeding a multinomial naive Bayes model.
pipeline_NB = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('classifier', MultinomialNB()),
]).fit(X_train, y_train)

# Column 1 of predict_proba is the probability of label 1 (spam).
probabilities_NB = pipeline_NB.predict_proba(X_val)[:, 1]

In [37]:
# ROC-AUC of the naive Bayes scores on the validation split.
roc_auc_NB = roc_auc_score(y_true=y_val, y_score=probabilities_NB)
roc_auc_NB

0.9546788563845106

# Логистическая регрессия

In [10]:
# TF-IDF features (English stop words removed) feeding a logistic regression
# with default hyperparameters; fitted on the training split in one expression.
pipeline_LR = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('classifier', LogisticRegression()),
]).fit(X_train, y_train)

# Column 1 of predict_proba is the probability of label 1 (spam).
probabilities_LR = pipeline_LR.predict_proba(X_val)[:, 1]

In [11]:
# ROC-AUC of the logistic-regression scores on the validation split.
roc_auc_LR = roc_auc_score(y_true=y_val, y_score=probabilities_LR)
roc_auc_LR

0.9768876129826024

# XGBoost

In [19]:
pipeline_XGB = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
])

pipeline_XGB.fit(X_train, y_train)

probabilities_LR = pipeline_XGB.predict_proba(X_val)[:, 1]

In [21]:
roc_auc_xgb = roc_auc_score(y_val, probabilities_LR)
roc_auc_xgb

0.9714616832593364

# Применение логистической регрессии к тестовым данным

Так как значение ROC-AUC у логистической регрессии оказалось наибольшим, применим ее к тестовому набору данных

In [23]:
# Score every test message with the fitted logistic-regression pipeline;
# column 1 of predict_proba is the probability of label 1 (spam).
test_probabilities = pipeline_LR.predict_proba(test_data['text'])[:, 1]

# Two-column result: the original text plus its spam score.
output_LR = test_data[['text']].assign(score=test_probabilities)

# Persist the predictions without the DataFrame index.
output_csv_path = 'spam_predictions.csv'
output_LR.to_csv(output_csv_path, index=False)

In [25]:
# Preview the predictions frame that was written to the CSV file.
output_LR

Unnamed: 0,text,score
0,j jim whitehead ejw cse ucsc edu writes j you ...,0.156657
1,original message from bitbitch magnesium net p...,0.123235
2,java for managers vince durasoft who just taug...,0.089381
3,there is a youtuber name saiman says,0.156820
4,underpriced issue with high return on equity t...,0.396997
...,...,...
4065,husband to wifetum meri zindagi hoorwifeor kya...,0.194229
4066,baylor enron case study cindy yes i shall co a...,0.001815
4067,boring as compared to tp,0.211398
4068,hellogorgeous hows u my fone was on charge lst...,0.058747
