In [47]:
import pandas as pd
from pathlib import Path
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [5]:
work_path = Path('.')

In [6]:
df = pd.read_csv(work_path.joinpath('full_data.csv').__str__())

In [9]:
target_class_map = {target: n for n, target in enumerate(df.topic.unique())}

In [85]:
target_class_map

{'Наука и техника': 0,
 'Экономика': 1,
 'Силовые структуры': 2,
 'Туризм/Путешествия': 3,
 'Общество/Россия': 4,
 'Спорт': 5,
 'Бывший СССР': 6}

In [13]:
df.topic = df.topic.map(target_class_map)

In [18]:
test_idx = []
for target in df.topic.unique():
    test_idx += df[df.topic == target].sample(frac=0.25, random_state=42).index.tolist()

In [21]:
train_df = df.drop(index=test_idx)
test_df = df.loc[test_idx]

In [26]:
test_df.groupby('topic').agg({'url': 'count'}) / len(test_df)

Unnamed: 0_level_0,url
topic,Unnamed: 1_level_1
0,0.094008
1,0.323519
2,0.191632
3,0.123795
4,0.235882
5,0.004477
6,0.026687


In [28]:
train_df.groupby('topic').agg({'url': 'count'}) / len(train_df)

Unnamed: 0_level_0,url
topic,Unnamed: 1_level_1
0,0.094049
1,0.323521
2,0.191542
3,0.123716
4,0.235898
5,0.004533
6,0.02674


# base_line

In [32]:
X_train = train_df.content.to_numpy()
y_train = train_df.topic.to_numpy()

X_test = test_df.content.to_numpy()
y_test = test_df.topic.to_numpy()

In [34]:
vec = CountVectorizer() # подбор гиперпараметров очень помогает
vec.fit(X_train)

bow = vec.transform(X_train)  # bow — bag of words (мешок слов)
bow_test = vec.transform(X_test)

In [35]:
scaler = MaxAbsScaler()
bow = scaler.fit_transform(bow)
bow_test = scaler.transform(bow_test)

In [36]:
clf = LogisticRegression(max_iter=200, random_state=42)
clf.fit(bow, y_train)
pred = clf.predict(bow_test)

In [46]:
f1_score(y_test, pred, average='weighted')

np.float64(0.9106877884200845)

In [51]:
with open(work_path.joinpath('models/base_line_log_reg_model.pickle').__str__(),'wb') as f:
    pickle.dump(clf, f)

# params search

## stop_words and lemmatize

In [52]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [53]:
mystem = Mystem() 
russian_stopwords = stopwords.words("russian")

In [54]:
def preprocess_text(text):
    tokens = mystem.lemmatize(text.lower())
    tokens = [token for token in tokens if token not in russian_stopwords\
              and token != " " \
              and token.strip() not in punctuation]
    
    text = " ".join(tokens)
    
    return text

In [55]:
X_train = train_df.content.apply(preprocess_text).to_numpy()
y_train = train_df.topic.to_numpy()

X_test = test_df.content.apply(preprocess_text).to_numpy()
y_test = test_df.topic.to_numpy()

In [56]:
vec = CountVectorizer() 
vec.fit(X_train)

bow = vec.transform(X_train)
bow_test = vec.transform(X_test)

In [57]:
scaler = MaxAbsScaler()
bow = scaler.fit_transform(bow)
bow_test = scaler.transform(bow_test)

In [58]:
clf = LogisticRegression(max_iter=200, random_state=42)
clf.fit(bow, y_train)
pred = clf.predict(bow_test)

In [59]:
f1_score(y_test, pred, average='weighted')

np.float64(0.9139257939919727)

**Вывод**
Лемматизация и удалиение стоп слов улучшает score

### LogisticRegression params

In [76]:
from sklearn.model_selection import GridSearchCV

In [77]:
parameters = {
    'penalty': ['l2'],
    'class_weight': [None, 'balanced'],
    'max_iter': [100, 200, 300]
}

In [72]:
clf = LogisticRegression(random_state=42)

In [73]:
gscv = GridSearchCV(clf, parameters)

In [74]:
gscv.fit(bow, y_train)

In [75]:
pd.DataFrame(gscv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_max_iter,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,23.846641,2.372425,0.059079,0.043597,,100,l2,"{'class_weight': None, 'max_iter': 100, 'penal...",0.918818,0.919105,0.915925,0.901578,0.771306,0.885346,0.057383,4
1,23.421624,2.450737,0.040975,0.04389,,200,l2,"{'class_weight': None, 'max_iter': 200, 'penal...",0.918818,0.919105,0.915925,0.901578,0.771306,0.885346,0.057383,4
2,23.576924,2.048472,0.023129,0.035481,,300,l2,"{'class_weight': None, 'max_iter': 300, 'penal...",0.918818,0.919105,0.915925,0.901578,0.771306,0.885346,0.057383,4
3,21.421451,1.64735,0.076616,0.034822,balanced,100,l2,"{'class_weight': 'balanced', 'max_iter': 100, ...",0.925416,0.926277,0.918795,0.906456,0.754663,0.886321,0.06621,1
4,21.404652,1.299862,0.075054,0.034515,balanced,200,l2,"{'class_weight': 'balanced', 'max_iter': 200, ...",0.925416,0.926277,0.918795,0.906456,0.754663,0.886321,0.06621,1
5,22.339177,1.277196,0.040398,0.043767,balanced,300,l2,"{'class_weight': 'balanced', 'max_iter': 300, ...",0.925416,0.926277,0.918795,0.906456,0.754663,0.886321,0.06621,1


In [78]:
pred = gscv.best_estimator_.predict(bow_test)

In [79]:
f1_score(y_test, pred, average='weighted')

np.float64(0.9199695387639771)