In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from matplotlib import pyplot as plt
import seaborn as sns

In [3]:
# Training data; the cells below read its 'text' and 'text_type' columns.
train_csv_path = '/kaggle/input/spam-detection-vk/train_spam.csv'
train = pd.read_csv(train_csv_path)

In [4]:
# Encode the target as integers: 1 where the label is exactly 'spam', 0 for
# every other value (including missing ones).
#
# The dtype guard keeps this cell safe to re-run: once the column already holds
# numbers, repeating the string comparison would silently turn every label
# into 0 and any model fitted afterwards would see a single class.
if not pd.api.types.is_numeric_dtype(train['text_type']):
    train['text_type'] = train['text_type'].eq('spam').astype(int)

In [5]:
# Character-level TF-IDF features: every character n-gram of length 1 to 5.
# min_df is a float, so it is a document-frequency *proportion*; max_features
# caps the size of the learned vocabulary.
#
# No `stop_words` argument is passed: scikit-learn only applies stop words when
# analyzer == 'word', so with analyzer='char' it had no effect on the features
# and only produced a UserWarning when the vectorizer was fitted.
title_transformer = TfidfVectorizer(
    ngram_range=(1, 5),
    analyzer='char',
    min_df=0.0001,
    max_features=100000,
)

# Logistic regression on the TF-IDF matrix; random_state is fixed so repeated
# runs build the same estimator, and max_iter is raised to 500 iterations.
logit = LogisticRegression(
    random_state=17,
    C=10,
    solver='lbfgs',
    max_iter=500,
)

# Vectorizer and classifier are chained so raw text can be passed straight to
# model.fit / model.predict_proba.
model = Pipeline([('tfidf', title_transformer), ('logreg', logit)])

In [6]:
# Fit the whole pipeline (vectorizer + classifier) on the full training frame.
model.fit(X=train['text'], y=train['text_type'])



In [7]:
# Rows to be scored; only the 'text' column is read by the cells below.
test_csv_path = '/kaggle/input/spam-detection-vk/test_spam.csv'
test = pd.read_csv(test_csv_path)

In [8]:
# Build the submission frame in a single constructor call: 'score' is the
# predicted probability of class 1 for each test text, 'text' is the text
# itself. Column order (score, text) and the row index (taken from
# test['text']) match what the previous fill-an-empty-frame approach produced,
# without depending on pandas adopting the index of the first assigned column.
submit = pd.DataFrame({
    'score': model.predict_proba(test['text'])[:, 1],
    'text': test['text'],
})

In [9]:
# In-sample ROC AUC: the model is scored on the same rows it was fitted on,
# so this number does not measure performance on unseen data.
train_spam_proba = model.predict_proba(train['text'])[:, 1]
roc_auc_score(train['text_type'], train_spam_proba)

0.9998077941929624

In [10]:
# Preview the first five rows of the submission frame.
submit.head(n=5)

Unnamed: 0,score,text
0,0.02429,j jim whitehead ejw cse ucsc edu writes j you ...
1,0.014705,original message from bitbitch magnesium net p...
2,0.055647,java for managers vince durasoft who just taug...
3,0.030902,there is a youtuber name saiman says
4,0.723091,underpriced issue with high return on equity t...


In [11]:
# Write the submission file; index=False keeps the row index out of the CSV.
submission_path = '/kaggle/working/submission.csv'
submit.to_csv(submission_path, index=False)