In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from joblib import dump

In [None]:
# Load the cleaned dataset and keep only its leading rows for this run.
# NOTE(review): `.loc[:1000]` slices by index label and includes the end label,
# so with the default integer index produced by `read_csv` this keeps up to
# 1001 rows (labels 0..1000) — confirm that is the intended sample size.
data = pd.read_csv('data_clean.csv').loc[:1000]
data.info()

In [None]:
# Drop rows that lack a label or a text, then remove exact duplicate rows.
# This is written as one non-mutating chain: calling
# `drop_duplicates(inplace=True)` on a frame obtained by boolean filtering
# mutates a derived frame in place and can emit SettingWithCopyWarning,
# while the chained form returns a fresh frame with the same rows.
# The original index labels are kept (no reset), as before.
data = (
    data
    .dropna(subset=['target', 'clear_text'])
    .drop_duplicates()
)
data.info()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
train_df, test_df = train_test_split(data, random_state=17, test_size=0.2)

# Separate the text column (model input) from the label column for each split.
train_data, train_target = train_df['clear_text'], train_df['target']
test_data, test_target = test_df['clear_text'], test_df['target']

In [None]:
# Learn the TF-IDF vocabulary from the training texts only (the test split is
# never seen during fitting), then persist the fitted vectorizer to disk.
# NOTE(review): no `analyzer` is passed, so the vectorizer's default applies;
# if that default is word-level, ngram_range=(5, 12) means 5- to 12-word
# n-grams, which is unusual — confirm character n-grams were not intended.
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(5, 12)).fit(train_data)
dump(tfidf, 'tfidf.joblib')

In [None]:
# Vectorize both splits with the vectorizer fitted on the training texts.
train_features, test_features = (
    tfidf.transform(texts) for texts in (train_data, test_data)
)

In [None]:
# Train a class-weighted logistic regression on the TF-IDF features and
# persist the fitted model next to the vectorizer.
lr = LogisticRegression(
    solver='saga',
    C=0.655,
    class_weight='balanced',
    max_iter=1000,
    random_state=12345,
).fit(train_features, train_target.values)
dump(lr, 'logreg.joblib')

In [None]:
# Evaluate on the held-out split. The metric is ROC AUC (a score: higher is
# better), so the printed label names it as such rather than as an "error".
# Column 1 of `predict_proba` is used as the positive-class score.
# NOTE(review): this assumes a binary target — confirm.
test_auc = roc_auc_score(test_target.values, lr.predict_proba(test_features)[:, 1])
print("Test ROC AUC: %.3f" % test_auc)