In [1]:
import numpy as np 
import pandas as pd 
import nltk 
import sklearn 
from sklearn import*
import re 
from tqdm import tqdm 
import catboost as ctb
import joblib
from collections import*
tqdm.pandas()
import pymystem3
import pymorphy2
import plotly.express as px

In [2]:
df = pd.read_csv('clean_df.csv')
df = df.dropna()

In [3]:
class Clean:
    def __init__(self,idf_max_features=3000,idf_ngram=(1,3),parts=10):
        self.compilers = [re.compile(r'(#|@)\w+'),\
                          re.compile(r'htt(ps|p)\S+'),\
                          re.compile(r'_+')]
        self.parts=parts
        self.morph = pymorphy2.MorphAnalyzer()
        self.m = pymystem3.Mystem()
        self.idf_vec = feature_extraction.text.TfidfVectorizer(max_features=idf_max_features,ngram_range=idf_ngram)
        self.stopwords = nltk.corpus.stopwords.words('russian')
    def re_clear(self,re_df):
        for comp in self.compilers:
            re_df = pd.Series([re.sub(comp,'',line) for line in re_df])
        return re_df
    
    def pymy(self,text):
        normal_text=''.join([self.morph.parse(x)[0].normal_form for x in text if len(x)>3])
        return normal_text
    
    def stopwords_clear (self,text):
        return ' '.join([w for w in text.split() if w not in self.stopwords])

    def lemmize(self,df):
        df=[' '.join(re.findall(r'[А-я]+',line)) for line in df]
        if len(df)<self.parts:
            self.lem_df= [self.pymy(line) for line in df]
        else:
            batch=len(df)//self.parts
            for i in range(self.parts):
                batch_df='|'.join(df[i*batch:(i+1)*batch])
                lem_batch_df=''.join(self.m.lemmatize(batch_df)).split('|')
                df[i*batch:(i+1)*batch] = lem_batch_df           
            self.lem_df=pd.DataFrame(df,columns=['text'])
            self.lem_df['text'] = self.lem_df['text'].apply(self.stopwords_clear)
        return self.lem_df

    def fix(self,df):
        self.fix_df = self.lemmize(self.re_clear(df))
        return self.fix_df


    def hatred_upper(self,text):
        upper = len([i for i in text if i.isupper()])
        lower = len([i for i in text if i.islower()])
        return upper/(upper+lower)

    def hatred_punc(self,text):
        punct = len(re.findall(r'[^\w\s]',text))
        text = len(re.findall(r'\w+',text))
        return punct/text

    def FE(self,df):
        self.df = pd.DataFrame(df)
        self.df.columns=['text']
        self.df['upper'] = self.df.text.apply(self.hatred_upper)
        self.df['punct'] = self.df.text.apply(self.hatred_punc)
        self.df['title'] = self.df.text.apply(lambda x: len([i for i in x if i.istitle()==True]))
        #self.df['pos_sc'] = self.df.text.apply(lambda text: len(re.findall(r'\)|D',text)))
        #self.df['neg_sc'] = self.df.text.apply(lambda text: len(re.findall(r'\(|C|c|С|c',text)))
        self.df = self.df.drop('text',axis=1)
        return self.df

    def idf_text(self,df):
        self.idf_sparse = self.idf_vec.fit_transform(df['text'])
        self.idf_txt=self.idf_sparse.toarray()
        return self.idf_txt

    def ml_prepare_df(self,df):
        self.idf_text(self.fix(df))
        self.FE(df)
        result = np.hstack([self.idf_txt,self.df.values])
        return result

    def ml_transform_text(self,df):
        self.idf_=self.idf_vec.transform(self.fix(df))
        self.FE(df)
        result = np.hstack([self.idf_.toarray(),self.df.values])
        return result

In [4]:
clean = Clean(parts=4)

In [5]:
end_df = clean.ml_prepare_df(df['ttext'])

In [6]:
clf_mnb =naive_bayes.MultinomialNB(alpha=10)

In [7]:
clf_mnb.fit(end_df,df['ttype'])

MultinomialNB(alpha=10)

In [8]:
np.mean(model_selection.cross_val_score(clf_mnb,end_df,df.ttype,cv=3,scoring='roc_auc'))

0.7407429467791653

In [9]:
def nlp_ex(text):
    # some feature engineering
    pred = clf_mnb.predict_proba(clean.ml_transform_text([text]))
    
    return print(f" Negative {pred[0][0]:.2%}, Positive {pred[0][1]:.2%}")

In [10]:
for i in ['это хорошо','позитив',"понравился вечер","давай сегодня зажжем","милота"]:
    nlp_ex(i)
    print('---'*40)

Negative 49.34%, Positive 50.66%
------------------------------------------------------------------------------------------------------------------------
 Negative 49.34%, Positive 50.66%
------------------------------------------------------------------------------------------------------------------------
 Negative 49.34%, Positive 50.66%
------------------------------------------------------------------------------------------------------------------------
 Negative 49.34%, Positive 50.66%
------------------------------------------------------------------------------------------------------------------------
 Negative 49.34%, Positive 50.66%
------------------------------------------------------------------------------------------------------------------------


In [11]:
for i in ['ужас какой то!','ну что за день....',"эхх, жаль что ты вообще появился =/","мда, неудачненько вышло...","это конечно фиаско..."]:
    nlp_ex(i)
    print('---'*40)

Negative 50.93%, Positive 49.07%
------------------------------------------------------------------------------------------------------------------------
 Negative 54.09%, Positive 45.91%
------------------------------------------------------------------------------------------------------------------------
 Negative 51.72%, Positive 48.28%
------------------------------------------------------------------------------------------------------------------------
 Negative 55.66%, Positive 44.34%
------------------------------------------------------------------------------------------------------------------------
 Negative 54.09%, Positive 45.91%
------------------------------------------------------------------------------------------------------------------------


Topic Modeling

In [12]:
tfv_tweets = clean.idf_sparse
tf_svd = sklearn.decomposition.TruncatedSVD(3).fit_transform(tfv_tweets.T)

In [13]:
neg_idx=df.reset_index().query('ttype==-1').index
pos_idx=df.reset_index().query('ttype==1').index

In [14]:
tf_svd_pos = sklearn.decomposition.TruncatedSVD(3).fit_transform(tfv_tweets[pos_idx].T)
tf_svd_neg = sklearn.decomposition.TruncatedSVD(3).fit_transform(tfv_tweets[neg_idx].T)

In [15]:
px.scatter_3d(pd.DataFrame(tf_svd,columns=['x','y','z']),x='x',y='y',z='z',hover_name=clean.idf_vec.get_feature_names())

In [16]:
px.scatter_3d(pd.DataFrame(tf_svd_pos,columns=['x','y','z']),x='x',y='y',z='z',hover_name=clean.idf_vec.get_feature_names())

In [17]:
px.scatter_3d(pd.DataFrame(tf_svd_neg,columns=['x','y','z']),x='x',y='y',z='z',hover_name=clean.idf_vec.get_feature_names())

Generation samples

In [18]:
gen_df=clean.fix(df['ttext'])

In [19]:
def simple_generator(gen_df,tone='all',n_words=10,n_grams = 2): 

    if tone=='positive':
        text=' '.join(gen_df[pos_idx]).split()
    elif tone=='negative':
        text=' '.join(gen_df[neg_idx]).split()
    else:
        text=' '.join(gen_df).split()

    d = defaultdict(list)
    for i in tqdm(range(len(text))):
        d[text[i]].extend(text[i+1:i+n_grams])

    sample= np.random.choice(list(d.keys()))

    for i in range(n_words):
        next_word = np.random.choice(d[sample.split()[-1]])
        sample = sample+' '+next_word
    return sample

In [20]:
gen_df=clean.re_clear(df.ttext)

In [23]:
simple_generator(gen_df,'negative',n_grams=3)

100%|██████████| 1280081/1280081 [00:01<00:00, 1019411.14it/s]


'врятли! не заболеть( RT задолбали хочу все равно встречу Галкиным, и'