In [68]:
import numpy as np 
import pandas as pd 
import nltk 
import sklearn 
from sklearn import*
import re 
from tqdm import tqdm 
import catboost as ctb
import joblib
from collections import*
tqdm.pandas()
import pymystem3
import pymorphy2
import plotly.express as px

In [69]:
df = pd.read_csv('clean_df.csv')

In [70]:
def hatred_upper(text):
    upper = len([i for i in text if i.isupper()])
    lower = len([i for i in text if i.islower()])
    return upper/(upper+lower+1e-6)

def hatred_punc(text):
    punct = len(re.findall(r'[^\w\s]',text))
    text = len(re.findall(r'\w+',text))
    return punct/(text+1e-6)

In [71]:
def FE(df):
    df = pd.DataFrame(df)
    df.columns=['text']
    df['upper'] = df.text.apply(hatred_upper)
    df['punct'] = df.text.apply(hatred_punc)
    df['title'] = df.text.apply(lambda x: len([i for i in x if i.istitle()==True]))
    df['len'] = df.text.apply(len)
    df['count'] = df.text.apply(lambda x: len(x.split()))
    df['average_len'] = df.text.apply(lambda x: np.mean([len(i) for i in x.split()])).fillna(0)
    df['pos_sc'] = df.text.apply(lambda text: len(re.findall(r'\)|D',text)))
    df['neg_sc'] = df.text.apply(lambda text: len(re.findall(r'\(|C|c|С|c',text)))
    df = df.drop('text',axis=1)
    return df

In [72]:
def re_clear(df):
    df=pd.Series(df,name='text')
    compilers = [re.compile(r'(#|@)\w+'),\
                        re.compile(r'htt(ps|p)\S+'),\
                        re.compile(r'_+')]
    for comp in compilers:
        df = df.apply(lambda line: re.sub(comp,'',line))
    return df

In [73]:
def pymy(text):
    morph = pymorphy2.MorphAnalyzer()
    normal_text=' '.join([morph.parse(x)[0].normal_form for x in text.split()])
    return normal_text

In [74]:
 stopwords = nltk.corpus.stopwords.words('russian')
 def stopwords_clear (text):
        return ' '.join([w for w in text.split() if w.lower() not in stopwords])

In [75]:
def lemmize(df,parts=5):
    m = pymystem3.Mystem()
    df=[' '.join(re.findall(r'[А-я]+',line)) for line in df]
    batch=len(df)//parts
    for i in tqdm(range(parts)):
        batch_df='|'.join(df[i*batch:(i+1)*batch])
        lem_batch_df=''.join(m.lemmatize(batch_df)).split('|')
        df[i*batch:(i+1)*batch] = lem_batch_df           
    lem_df=pd.Series(df,name='text')
    lem_df = lem_df.apply(stopwords_clear)
    return lem_df

In [76]:
text=re_clear(df['ttext'])

In [77]:
features=FE(text)

In [78]:
morph = pymorphy2.MorphAnalyzer()

In [79]:
%%time
lemmized_st = lemmize(text)

100%|██████████| 5/5 [02:15<00:00, 27.03s/it]
Wall time: 2min 19s


In [80]:
idf_vec = feature_extraction.text.TfidfVectorizer(max_features=1000,ngram_range=(1,1))
clf = naive_bayes.MultinomialNB(alpha=5)
pipe = pipeline.Pipeline([('idf',idf_vec),('clf',clf)])
prms = {'idf__max_features':np.arange(100,5000,500),
'idf__ngram_range':[(1,1),(1,2),(2,2),(1,3),(2,3),(1,4)],
'clf__alpha': [1,5,10,15,20,25,30,40,50,80]}

In [81]:
rs=model_selection.RandomizedSearchCV(pipe,prms,cv=5,scoring='f1',verbose=10,n_jobs=-1)

In [82]:
rs.fit(text,df['ttype'])

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   30.0s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   34.4s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   49.5s
[Parallel(n_jobs=-1)]: Done  33 out of  50 | elapsed:  1.0min remaining:   31.2s
[Parallel(n_jobs=-1)]: Done  39 out of  50 | elapsed:  1.3min remaining:   21.3s
[Parallel(n_jobs=-1)]: Done  45 out of  50 | elapsed:  1.3min remaining:    8.7s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.4min finished


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('idf',
                                              TfidfVectorizer(max_features=1000)),
                                             ('clf', MultinomialNB(alpha=5))]),
                   n_jobs=-1,
                   param_distributions={'clf__alpha': [1, 5, 10, 15, 20, 25, 30,
                                                       40, 50, 80],
                                        'idf__max_features': array([ 100,  600, 1100, 1600, 2100, 2600, 3100, 3600, 4100, 4600]),
                                        'idf__ngram_range': [(1, 1), (1, 2),
                                                             (2, 2), (1, 3),
                                                             (2, 3), (1, 4)]},
                   scoring='f1', verbose=10)

In [83]:
opt_pipe = pipe.set_params(**rs.best_params_)

In [84]:
opt_pipe.fit(text,df.ttype)

Pipeline(steps=[('idf', TfidfVectorizer(max_features=4100, ngram_range=(1, 2))),
                ('clf', MultinomialNB(alpha=40))])

In [85]:
f"CV F1 score : {np.mean(model_selection.cross_val_score(opt_pipe,text,df.ttype,cv=3,scoring='f1')):0.4%}"

'CV roc-auc score : 71.5345%'

In [86]:
def nlp(text,model,lemmize=False):
    text=re_clear(text)
    if lemmize==True:
        text = text.apply(pymy)
    pred = model.predict_proba(text)
    return print(f" Negative {pred[0][0]:.2%}, Positive {pred[0][1]:.2%}")

In [87]:
new_fe=preprocessing.FunctionTransformer(FE)

In [88]:
fu = pipeline.FeatureUnion([('idf',idf_vec),('fe',new_fe)])

In [94]:
pipe_fe = pipeline.Pipeline([('fu',fu),('clf',clf)])
prms_fe = {'fu__idf__max_features':np.arange(100,5000,500),
'fu__idf__ngram_range':[(1,1),(1,2),(2,2),(1,3),(2,3),(1,4)],
'clf__alpha': [1,5,10,15,20,25,30,40,50,80]}
rs_fe = model_selection.RandomizedSearchCV(pipe_fe,prms_fe,cv=3,scoring='f1',verbose=10,n_jobs=-1)
rs_fe.fit(text,df.ttype)
opt_pipe_fe = pipe_fe.set_params(**rs_fe.best_params_)
opt_pipe_fe.fit(text,df.ttype)
f"CV f1 score : {np.mean(model_selection.cross_val_score(opt_pipe_fe,text,df.ttype,cv=3,scoring='f1')):0.4%}"

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   33.8s
[Parallel(n_jobs=-1)]: Done  11 out of  30 | elapsed:   52.5s remaining:  1.5min
[Parallel(n_jobs=-1)]: Done  15 out of  30 | elapsed:  1.3min remaining:  1.3min
[Parallel(n_jobs=-1)]: Done  19 out of  30 | elapsed:  1.7min remaining:  1.0min
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed:  1.9min remaining:   34.2s
[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed:  2.0min remaining:   13.4s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  2.1min finished


'CV f1 score : 97.8975%'

In [95]:
sgd = linear_model.SGDClassifier()
prms_sgd = {'idf__max_features':np.arange(100,5000,500),
'idf__ngram_range':[(1,1),(1,2),(2,2),(1,3),(2,3),(1,4)],
'clf__alpha': [0.00001,0.0001,0.001,0.01],
'clf__penalty':['l2','l1','elasticnet'],
'clf__loss':['perceptron','modified_huber']}
pipe_sgd = pipeline.Pipeline([('idf',idf_vec),('clf',sgd)])
rs_sgd = model_selection.RandomizedSearchCV(pipe_sgd,prms_sgd,cv=3,scoring='f1',verbose=10,n_jobs=-1,n_iter=20)
rs_sgd.fit(text,df.ttype)
opt_pipe_sgd = pipe_sgd.set_params(**rs_sgd.best_params_)
opt_pipe_sgd.fit(text,df.ttype)
f"CV f1 score : {np.mean(model_selection.cross_val_score(opt_pipe_sgd,text,df.ttype,cv=5,scoring='f1')):0.4%}"


Fitting 3 folds for each of 20 candidates, totalling 60 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   25.9s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   43.6s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   53.4s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  44 out of  60 | elapsed:  1.5min remaining:   32.7s
[Parallel(n_jobs=-1)]: Done  51 out of  60 | elapsed:  1.7min remaining:   17.8s
[Parallel(n_jobs=-1)]: Done  58 out of  60 | elapsed:  1.8min remaining:    3.5s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.8min finished


'CV f1 score : 72.6992%'

In [105]:
for i in ['ужас какой то!','ну что за день(',"эхх, жаль что ты вообще появился :C","мда, неудачненько вышло..."," ну тут без комментариев"]:
    nlp(i,opt_pipe_sgd)
    print('---'*40)

Negative 100.00%, Positive 0.00%
------------------------------------------------------------------------------------------------------------------------
 Negative 96.64%, Positive 3.36%
------------------------------------------------------------------------------------------------------------------------
 Negative 80.67%, Positive 19.33%
------------------------------------------------------------------------------------------------------------------------
 Negative 77.38%, Positive 22.62%
------------------------------------------------------------------------------------------------------------------------
 Negative 52.99%, Positive 47.01%
------------------------------------------------------------------------------------------------------------------------


In [99]:
for i in ['это хорошо','позитив',"понравился вечер"," это просто моя любовь","милота"]:
    nlp(i,opt_pipe_sgd)
    print('---'*40)

Negative 0.00%, Positive 100.00%
------------------------------------------------------------------------------------------------------------------------
 Negative 46.33%, Positive 53.67%
------------------------------------------------------------------------------------------------------------------------
 Negative 23.89%, Positive 76.11%
------------------------------------------------------------------------------------------------------------------------
 Negative 27.62%, Positive 72.38%
------------------------------------------------------------------------------------------------------------------------
 Negative 46.33%, Positive 53.67%
------------------------------------------------------------------------------------------------------------------------
