In [1]:
import logging
logging.root.handlers = []  # Jupyter messes up logging so needs a reset
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from smart_open import smart_open
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model
from nltk.corpus import stopwords
import xml.etree.ElementTree as ET
from nltk import word_tokenize
import re
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.cross_validation import KFold
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
import pymorphy2
%matplotlib inline

2017-06-18 23:48:08,672 : INFO : 'pattern' package not found; tag filters are not available for English


In [2]:
def tree_df(tree_name):
    tree = ET.parse(tree_name)
    sentences = []
    label = []
    data_item = []
    data = []
    polarity=[]
    root = tree.getroot()
    for sentence in root.iter('sentence'):
        for text in sentence.iter('text'):
            data_item.append(text.text)
        for opinion in sentence.iter('Opinion'):
            label.append(opinion.get('category'))
        label = list(set(label))
        data_item.append(label)
        label=[]
        data.append(data_item)
        data_item=[]
    df = pd.DataFrame(data, columns = ['sentence', 'category'])
    return df

In [3]:
name = 'se16_ru_rest_train.xml'
df = tree_df(name)
test_name = 'se16_ru_rest_test.xml'
df_test = tree_df(test_name)

In [4]:
from stop_words import get_stop_words
stop_words = get_stop_words('russian')

In [5]:
mlb = mlb = MultiLabelBinarizer()
df_all = pd.concat([df, df_test], axis = 0)
y_all = mlb.fit_transform(df_all.iloc[:,1])
y_train = y_all[:len(df)]
y_test = y_all[len(df):]

# Word2Vec

In [6]:
sen_clear = []
for sentence in np.array(df['sentence']):
    sen_cl = re.sub(r'[^\w\d\s\-\n\{\}]', " ", sentence, flags=re.M | re.U | re.I)
    #words = sen_cl.lower().split()
    words = sen_cl.split()
    words_c = []
    for word in words:
        if word != '-':
            words_c.append(word)
    sen_clear.append(words_c)
sen_clear_test = []
for sentence in np.array(df_test['sentence']):
    sen_cl = re.sub(r'[^\w\d\s\-\n\{\}]', " ", sentence, flags=re.M | re.U | re.I)
    #words = sen_cl.lower().split()
    words = sen_cl.split()
    words_c = []
    for word in words:
        if word != '-':
                words_c.append(word)
    words_stop = []
    for word in words_c:
        for stop_word in stop_words:
            if word != stop_word:
                words_stop.append(word)
                break
    sen_clear_test.append(words_c)

In [7]:
model = gensim.models.Word2Vec(sen_clear,size=1000,window=5,min_count=5, workers=4)

2017-06-18 23:48:18,230 : INFO : collecting all words and their counts
2017-06-18 23:48:18,232 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-06-18 23:48:18,251 : INFO : collected 8579 word types from a corpus of 41494 raw words and 3655 sentences
2017-06-18 23:48:18,252 : INFO : Loading a fresh vocabulary
2017-06-18 23:48:18,273 : INFO : min_count=5 retains 1330 unique words (15% of original 8579, drops 7249)
2017-06-18 23:48:18,274 : INFO : min_count=5 leaves 30900 word corpus (74% of original 41494, drops 10594)
2017-06-18 23:48:18,283 : INFO : deleting the raw counts dictionary of 8579 items
2017-06-18 23:48:18,289 : INFO : sample=0.001 downsamples 56 most-common words
2017-06-18 23:48:18,290 : INFO : downsampling leaves estimated 24335 word corpus (78.8% of prior 30900)
2017-06-18 23:48:18,291 : INFO : estimated required memory for 1330 words and 1000 dimensions: 11305000 bytes
2017-06-18 23:48:18,304 : INFO : resetting layer weights
2017-06-18 23:

In [8]:
def makeFeatureVec(words, model, num_features):
    # Function to average all of the word vectors in a given
    # paragraph
    #
    # Pre-initialize an empty numpy array (for speed)
    featureVec = np.zeros((num_features,))
    #
    nwords = 0.
    # 
    # Index2word is a list that contains the names of the words in 
    # the model's vocabulary. Convert it to a set, for speed 
    index2word_set = set(model.wv.index2word)
    #
    # Loop over each word in the review and, if it is in the model's
    # vocaublary, add its feature vector to the total
    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            featureVec = np.add(featureVec,model[word])
    # 
    # Divide the result by the number of words to get the average
    if nwords != 0.0:
        featureVec = np.true_divide(featureVec,nwords)
    else:
        featureVec = np.zeros((num_features,))
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):
    # Given a set of reviews (each one a list of words), calculate 
    # the average feature vector for each one and return a 2D numpy array 
    # 
    # Initialize a counter
    counter = 0.
    # 
    # Preallocate a 2D numpy array, for speed
    reviewFeatureVecs = np.zeros((len(reviews),num_features))
    #reviewFeatureVecs = np.array((len(reviews),num_features))
    # 
    # Loop through the reviews
    for i in range(len(reviews)):
        reviewFeatureVecs[i] = makeFeatureVec(reviews[i], model, num_features)
        
    #for review in reviews:

       # Call the function (defined above) that makes average feature vectors
       #reviewFeatureVecs[counter] = makeFeatureVec(review, model, \
       #    num_features)
       #
       # Increment the counter
       #counter = counter + 1.
    
    return reviewFeatureVecs

In [9]:
# ****************************************************************
# Calculate average feature vectors for training and testing sets,
# using the functions we defined above. Notice that we now use stop word
# removal.


trainDataVecs = getAvgFeatureVecs( sen_clear, model, 1000 )

print ("Creating average feature vecs for test reviews")

testDataVecs = getAvgFeatureVecs( sen_clear_test, model, 1000 )

Creating average feature vecs for test reviews


In [28]:
def classify_random_forest(X_train, y_train, X_test, y_test):
    clf = OneVsRestClassifier(RandomForestClassifier( n_estimators = 100 ))
    clf.fit(X_train,y_train)
    clf.predict_proba(X_test)
    classes = list(mlb.classes_)
    classes_final = []
    for line in clf.predict_proba(X_test):
        cl = []
        for i in range(len(classes)):
            if line[i] >= 0.2:
                cl.append(classes[i])
        classes_final.append(cl)
    classes_final_s = pd.Series(classes_final)
    df_test_f = pd.concat([df_test,classes_final_s], axis = 1)
    df_test_f.rename(columns={0:"predicted classes"})
    #accuracy=clf.score(X_test,y_test)
    y_pred = clf.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred, average = 'macro')
    recall = metrics.recall_score(y_test, y_pred, average = 'macro')
    f1=metrics.f1_score(y_test, y_pred, average = 'macro')
    return df_test_f, accuracy, precision, recall, f1,clf.predict_proba(X_test)


In [29]:
df_test_f_w2v_logreg, accuracy_w2v_logreg, precision_w2v_logreg, recall_w2v_logreg, f1_w2v_logreg,probabilities = classify_random_forest(trainDataVecs,y_train,testDataVecs,y_test)
print('accuracy = {:0.5f}, precision = {:0.5f}, recall = {:0.5f}, f1 = {:0.5f}'.format(accuracy_w2v_logreg, precision_w2v_logreg, recall_w2v_logreg, f1_w2v_logreg))

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy = 0.26882, precision = 0.23822, recall = 0.01369, f1 = 0.02542


In [30]:
classes = list(mlb.classes_)
classes_final=[]
for line in probabilities:
    cl = []
    for i in range(len(classes)):
        if line[i] >= 0.2:
            cl.append(classes[i])
    classes_final.append(cl)
    cl=[]
print(classes_final)
y_pred = mlb.transform(classes_final) 

[['RESTAURANT#GENERAL', 'SERVICE#GENERAL'], ['SERVICE#GENERAL'], ['FOOD#QUALITY', 'SERVICE#GENERAL'], ['RESTAURANT#GENERAL', 'SERVICE#GENERAL'], ['SERVICE#GENERAL'], ['SERVICE#GENERAL'], [], [], ['RESTAURANT#GENERAL', 'SERVICE#GENERAL'], ['AMBIENCE#GENERAL', 'FOOD#QUALITY', 'SERVICE#GENERAL'], ['FOOD#QUALITY', 'SERVICE#GENERAL'], ['RESTAURANT#GENERAL'], ['FOOD#QUALITY', 'SERVICE#GENERAL'], ['FOOD#QUALITY', 'SERVICE#GENERAL'], ['AMBIENCE#GENERAL', 'SERVICE#GENERAL'], ['FOOD#QUALITY', 'RESTAURANT#GENERAL', 'SERVICE#GENERAL'], ['FOOD#QUALITY', 'RESTAURANT#GENERAL', 'SERVICE#GENERAL'], ['FOOD#QUALITY', 'FOOD#STYLE_OPTIONS', 'RESTAURANT#GENERAL', 'SERVICE#GENERAL'], ['AMBIENCE#GENERAL', 'FOOD#QUALITY', 'SERVICE#GENERAL'], ['AMBIENCE#GENERAL', 'FOOD#QUALITY', 'RESTAURANT#GENERAL', 'SERVICE#GENERAL'], ['AMBIENCE#GENERAL', 'FOOD#QUALITY', 'RESTAURANT#GENERAL', 'SERVICE#GENERAL'], ['AMBIENCE#GENERAL', 'FOOD#QUALITY', 'RESTAURANT#GENERAL', 'SERVICE#GENERAL'], ['AMBIENCE#GENERAL', 'FOOD#QUALITY',

In [31]:
metrics.f1_score(y_test, y_pred, average = 'macro')

  'precision', 'predicted', average, warn_for)


0.13327870878362186

In [32]:
metrics.precision_score(y_test, y_pred, average = 'macro')

  'precision', 'predicted', average, warn_for)


0.16857014184022912

In [33]:
metrics.recall_score(y_test, y_pred, average = 'macro')

0.22024495680688957

In [34]:
metrics.accuracy_score(y_test, y_pred)

0.11497105045492143

In [12]:
print('accuracy = {:0.5f}, precision = {:0.5f}, recall = {:0.5f}, f1 = {:0.5f}'.format(accuracy_w2v, precision_w2v, recall_w2v, f1_w2v))

NameError: name 'accuracy_w2v' is not defined

In [13]:
def classify_SVC2(X_train, y_train, X_test, y_test):
    clf = OneVsRestClassifier(SVC(kernel='linear', random_state=241, probability=True))
    clf.fit(X_train,y_train)
    clf.predict_proba(X_test)
    classes = list(mlb.classes_)
    classes_final = []
    for line in clf.predict_proba(X_test):
        cl = []
        for i in range(len(classes)):
            if line[i] >= 0.2:
                cl.append(classes[i])
        classes_final.append(cl)
        cl=[]
    classes_final_s = pd.Series(classes_final)
    df_test_f = pd.concat([df_test,classes_final_s], axis = 1)
    df_test_f.rename(columns={0:"predicted classes"})
    #accuracy=clf.score(X_test,y_test)
    y_pred = clf.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred, average = 'macro')
    recall = metrics.recall_score(y_test, y_pred, average = 'weighted')
    f1=metrics.f1_score(y_test, y_pred, average = 'macro')
    #coverage_error = metrics.coverage_error(y_test, clf.predict_proba(X_test))
    return df_test_f, accuracy, precision, recall, f1, clf.predict_proba(X_test)


In [14]:
df_test_f_w2v_SVC, accuracy_w2v_SVC, precision_w2v_SVC, recall_w2v_SVC, f1_w2v_SVC, probabilities = classify_SVC2(trainDataVecs,y_train,testDataVecs,y_test)
print('accuracy = {:0.5f}, precision = {:0.5f}, recall = {:0.5f}, f1 = {:0.5f}'.format(accuracy_w2v_SVC, precision_w2v_SVC, recall_w2v_SVC, f1_w2v_SVC))

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy = 0.24897, precision = 0.00000, recall = 0.00000, f1 = 0.00000


In [17]:
from sklearn.naive_bayes import GaussianNB
def classify_NB(X_train, y_train, X_test, y_test):
    clf = OneVsRestClassifier(GaussianNB())
    clf.fit(X_train,y_train)
    clf.predict_proba(X_test)
    classes = list(mlb.classes_)
    classes_final = []
    for line in clf.predict_proba(X_test):
        cl = []
        for i in range(len(classes)):
            if line[i] >= 0.2:
                cl.append(classes[i])
        classes_final.append(cl)
    classes_final_s = pd.Series(classes_final)
    df_test_f = pd.concat([df_test,classes_final_s], axis = 1)
    df_test_f.rename(columns={0:"predicted classes"})
    #accuracy=clf.score(X_test,y_test)
    y_pred = clf.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred, average = 'micro')
    recall = metrics.recall_score(y_test, y_pred, average = 'micro')
    f1=metrics.f1_score(y_test, y_pred, average = 'micro')
    #coverage_error = metrics.coverage_error(y_test, clf.predict_proba(X_test))
    return df_test_f, accuracy, precision, recall, f1

In [18]:
df_test_f_w2v_NB, accuracy_w2v_NB, precision_w2v_NB, recall_w2v_NB, f1_w2v_NB = classify_NB(trainDataVecs,y_train,testDataVecs,y_test)
print('accuracy = {:0.5f}, precision = {:0.5f}, recall = {:0.5f}, f1 = {:0.5f}'.format(accuracy_w2v_NB, precision_w2v_NB, recall_w2v_NB, f1_w2v_NB))

accuracy = 0.00000, precision = 0.08839, recall = 0.71243, f1 = 0.15726


In [26]:
df_test_f_w2v_logreg, accuracy_w2v_logreg, precision_w2v_logreg, recall_w2v_logreg, f1_w2v_logreg = classify_logreg(trainDataVecs,y_train,testDataVecs,y_test)
print('accuracy = {:0.5f}, precision = {:0.5f}, recall = {:0.5f}, f1 = {:0.5f}'.format(accuracy_w2v_logreg, precision_w2v_logreg, recall_w2v_logreg, f1_w2v_logreg))

accuracy = 0.43342, precision = 0.75168, recall = 0.31169, f1 = 0.44066


In [19]:
df_test_f_w2v_NB

Unnamed: 0,sentence,category,0
0,"Очень милый, уютный ресторанчик со скромными ц...","[FOOD#QUALITY, FOOD#STYLE_OPTIONS, AMBIENCE#GE...","[AMBIENCE#GENERAL, DRINKS#QUALITY, DRINKS#STYL..."
1,"Мы отмечали день рожденья,нам разрешили принес...",[],"[AMBIENCE#GENERAL, DRINKS#PRICES, DRINKS#QUALI..."
2,"Салаты со свежайшей зеленью, мясо и курица неж...",[FOOD#QUALITY],"[AMBIENCE#GENERAL, DRINKS#PRICES, DRINKS#QUALI..."
3,Официантки вежливые и улыбчивые.,[SERVICE#GENERAL],"[RESTAURANT#GENERAL, RESTAURANT#MISCELLANEOUS,..."
4,"Единственное,что немного не понравилось - долг...",[SERVICE#GENERAL],"[AMBIENCE#GENERAL, DRINKS#PRICES, DRINKS#QUALI..."
5,Но это мелочи.,[],"[AMBIENCE#GENERAL, DRINKS#QUALITY, DRINKS#STYL..."
6,Общее впечатление прекрасное.,[RESTAURANT#GENERAL],"[RESTAURANT#MISCELLANEOUS, RESTAURANT#PRICES]"
7,Советуем!,[RESTAURANT#GENERAL],"[RESTAURANT#MISCELLANEOUS, RESTAURANT#PRICES]"
8,"Отличный ресторан, была здесь уже не раз и вот...",[RESTAURANT#GENERAL],"[AMBIENCE#GENERAL, FOOD#QUALITY, FOOD#STYLE_OP..."
9,"И я, и все гости остались очень довольны.",[RESTAURANT#GENERAL],"[AMBIENCE#GENERAL, FOOD#QUALITY, FOOD#STYLE_OP..."


In [20]:
probabilities

array([[ 0.62002954,  0.00518712,  0.03771394, ...,  0.01475069,
         0.02198549,  0.44977116],
       [ 0.37177547,  0.00482966,  0.03799881, ...,  0.01486335,
         0.02227119,  0.5337725 ],
       [ 0.4285837 ,  0.00425641,  0.04183853, ...,  0.01607707,
         0.02309483,  0.38660734],
       ..., 
       [ 0.32046957,  0.00500334,  0.04075292, ...,  0.01591645,
         0.02411613,  0.39244433],
       [ 0.41038624,  0.00383288,  0.05981479, ...,  0.02426867,
         0.02887394,  0.40158368],
       [ 0.61803899,  0.00439827,  0.06091878, ...,  0.01975709,
         0.02759593,  0.47499334]])

In [22]:
classes = list(mlb.classes_)
print(classes)

['AMBIENCE#GENERAL', 'DRINKS#PRICES', 'DRINKS#QUALITY', 'DRINKS#STYLE_OPTIONS', 'FOOD#PRICES', 'FOOD#QUALITY', 'FOOD#STYLE_OPTIONS', 'LOCATION#GENERAL', 'RESTAURANT#GENERAL', 'RESTAURANT#MISCELLANEOUS', 'RESTAURANT#PRICES', 'SERVICE#GENERAL']


In [27]:
classes_final=[]
for line in probabilities:
    cl = []
    for i in range(len(classes)):
        if line[i] >= 0.2:
            cl.append(classes[i])
    classes_final.append(cl)
    cl=[]
print(classes_final)
y_pred = mlb.transform(classes_final) 

[['AMBIENCE#GENERAL', 'FOOD#QUALITY', 'FOOD#STYLE_OPTIONS', 'RESTAURANT#GENERAL', 'SERVICE#GENERAL'], ['AMBIENCE#GENERAL', 'FOOD#QUALITY', 'RESTAURANT#GENERAL', 'SERVICE#GENERAL'], ['AMBIENCE#GENERAL', 'FOOD#QUALITY', 'FOOD#STYLE_OPTIONS', 'RESTAURANT#GENERAL', 'SERVICE#GENERAL'], ['AMBIENCE#GENERAL', 'FOOD#QUALITY', 'SERVICE#GENERAL'], ['AMBIENCE#GENERAL', 'FOOD#QUALITY', 'SERVICE#GENERAL'], ['AMBIENCE#GENERAL', 'FOOD#QUALITY', 'RESTAURANT#GENERAL', 'SERVICE#GENERAL'], ['RESTAURANT#GENERAL', 'SERVICE#GENERAL'], [], ['AMBIENCE#GENERAL', 'FOOD#QUALITY', 'RESTAURANT#GENERAL', 'SERVICE#GENERAL'], ['AMBIENCE#GENERAL', 'FOOD#QUALITY', 'FOOD#STYLE_OPTIONS', 'RESTAURANT#GENERAL', 'SERVICE#GENERAL'], ['AMBIENCE#GENERAL', 'FOOD#QUALITY', 'FOOD#STYLE_OPTIONS', 'RESTAURANT#GENERAL', 'SERVICE#GENERAL'], ['AMBIENCE#GENERAL', 'FOOD#QUALITY', 'RESTAURANT#GENERAL', 'SERVICE#GENERAL'], ['AMBIENCE#GENERAL', 'FOOD#QUALITY', 'RESTAURANT#GENERAL', 'SERVICE#GENERAL'], ['AMBIENCE#GENERAL', 'FOOD#QUALITY', 'R

In [28]:
accuracy = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred, average = 'micro')
recall = metrics.recall_score(y_test,y_pred, average = 'micro')
f1=metrics.f1_score(y_test, y_pred, average = 'micro')

In [29]:
accuracy

0.024813895781637719

In [30]:
precision

0.19266819412622371

In [31]:
recall

0.85807050092764381

In [32]:
f1

0.31467936723932644