In [1]:
import sklearn
import numpy as np
import pandas as pd
import pymorphy2
import csv
from pymystem3 import Mystem

In [2]:
p_d = pd.read_csv('positive.csv',sep=';',header=None)
p_d = p_d[[3,4]]
p_d.columns = ['text','label']
n_d = pd.read_csv('negative.csv',sep=';',header=None)
n_d = n_d[[3,4]]
n_d.columns = ['text','label']

In [3]:
morph = pymorphy2.MorphAnalyzer()
mystem = Mystem()

In [4]:
def text_cleaner_pymorphy(text):
    text = text.lower()
    alph = 'абвгдеёжзийклмнопрстуфчцчшщъыьэюя'
    cleaned_text = ''
    for char in text:
        if(char.isalpha() and char[0] in alph) or (char == ' '):
            cleaned_text += char
    result = []
    for word in cleaned_text.split():
        result.append(morph.parse(word)[0].normal_form)
    return ' '.join(result)                    

In [5]:
p_d['text'] = p_d['text'].apply(text_cleaner_pymorphy)

In [6]:
n_d['text'] = n_d['text'].apply(text_cleaner_pymorphy)

In [7]:
p_d.to_csv('cleaned_positiv.csv',index=False)
n_d.to_csv('cleaned_negative.csv',index=False)
p_count = len(p_d.index)/2
n_count = len(n_d.index)/2
p_d_teach = p_d['text'][:int(p_count)]
n_d_teach = n_d['text'][:int(n_count)]

#p_d_teach = p_d['text'][:10000]
#n_d_teach = n_d['text'][:10000]

p_d_teach.to_csv('cleaned_positiv_text.csv',index = False, header = False)
n_d_teach.to_csv('cleaned_negative_text.csv',index = False, header = False)

In [5]:
def get_probabilities(file,h):
    import csv
    import codecs
    import pymorphy2.tokenizers
    
    Read_file = csv.DictReader(codecs.open(file,"r","utf-8"),delimiter = ';')
    words = []
    for row in Read_file:
        words += pymorphy2.tokenizers.simple_word_tokenize(str(row))
    
    unique_words = set(words)
    
    p = dict.fromkeys(unique_words, 0)
    for item in words:
        p[item]+=1./len(words)
    
    if h == 0:
        Write_file = open("prob1.csv", "w")
    if h == 1:
        Write_file = open("prob2.csv", "w")
    
    Write_file.write("word;probability\n")
    
    for key2,value2 in p.items():
        result = str(key2) + ";" + str(value2) +"\n"
        Write_file.write(result) 
        
    return (p)

In [6]:
def errors(f_set,f_dict):
    for value,key in f_dict.items():
        flag=0;
        for item in f_set:
            if(item==value):
                flag=1
        if flag==0:
            del f_dict[value]
        return(f_dict)

In [7]:
prob1 = get_probabilities("cleaned_positiv_text.csv",0)
prob2 = get_probabilities("cleaned_negative_text.csv",1)

general = set(dict.keys(prob1)) & set(dict.keys(prob2))

prob1 = errors(general,prob1)
prob2 = errors(general,prob2)

In [8]:
import codecs
Write_file = codecs.open("dictionary.csv", "w", "utf-8")
Write_file.write("word;weirdness\n")
for key1, value1 in prob1.items():
    for key2, value2 in prob2.items():
        if(key1==key2):
            #weirdness = value1 / value2
            if value1 < value2:
                weirdness1 = (-1) * (value1 / value2)
            else:
                weirdness1 =  value2 / value1
            result = str(key1) +";" + str(weirdness1)+"\n"
            Write_file.write(result)

In [10]:
import codecs
with codecs.open("dictionary.csv", "r","utf-8") as fin:
    finy = pd.read_csv(fin,sep=';')
    for row in finy:
        row[0].encode('cp1251').decode('utf-8')
    finy.to_csv('dict_to_kurs.csv',index = False)
finy

Unnamed: 0,word,weirdness
0,ураааа,0.065887
1,объяснение,0.602400
2,рэпчик,-0.474294
3,начинаеться,-0.474294
4,вкладывать,0.602400
5,ном,-0.711440
6,учиться,-0.595807
7,упругий,0.527100
8,резина,0.527100
9,вау,0.283823


In [13]:
def classificator(text,p_dict):
    import csv
    import codecs
    import pymorphy2.tokenizers
    import sklearn
    
    morph = pymorphy2.MorphAnalyzer()
    
    with codecs.open(text,"r","utf-8") as text_t:
        
        text_t1 = csv.reader(text_t, delimiter = ';')

        new_f = codecs.open("finish.csv", "w","utf-8")
        new_f.write("text;W;class;class_right\n")

        for row in text_t1:
            W=0
            txt_class = 0
            words = pymorphy2.tokenizers.simple_word_tokenize(str(row[0]))
            j = 0
            finish = len(words)
            while j < finish:
                try:
                    words[j] = morph.parse(words[j])[0].normal_form
                    if len(words[j]) < 4:
                        del words[j]
                        finish = finish - 1
                    else:
                        with codecs.open(p_dict,"r","utf-8") as p_dict1:
                            dict1 = csv.reader(p_dict1, delimiter = ';')
                            for row_dict in dict1:
                                if words[j]==row_dict[0]:
                                    W += float(row_dict[1])
                    if W < -0.5:
                        txt_class=-1
                    if W > 0.5:
                        txt_class=1
                    if W>-0.5 and W<0.5:
                        txt_class=0
                    j = j + 1
                except UnicodeDecodeError:
                    del words[j]
                    finish = finish - 1
                except IndexError:
                    del words[j]
                    finish = finish - 1
            result = str(row[0]) +";" + str(W) +";" + str(txt_class)+ ";" + str(row[1]) +"\n"
            new_f.write(result)

In [15]:
import csv
p_d_test = p_d.tail(int(p_count))
#p_d_test = p_d.tail(100)
p_d_test.to_csv('positiv_test.csv', sep =';', index = False, header = False)

#n_d_test = n_d.tail(int(n_count))

classificator("positiv_test.csv","dictionary.csv")

fin = pd.read_csv("finish.csv",sep=';')
fin
#total=pd.pivot_table(fin, aggfunc=sum,columns="class",value="class_right")

PermissionError: [Errno 13] Permission denied

In [16]:
import codecs
with codecs.open("finish.csv","r","utf-8") as text_t:
    text_t1 = csv.reader(text_t, delimiter = ';')
    N = 0
    M = 0
    L = 0
    for row in text_t1:
        if str(row[2]) == "1":
            N = N + 1
        if str(row[2]) == "-1":
            M = M + 1
        if str(row[2]) == "0":
            L = L + 1
    print(N,M,L)


3717 3511 3067
