In [None]:
import logging
logging.root.handlers = []  # Jupyter messes up logging so needs a reset
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from smart_open import smart_open
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model
from nltk.corpus import stopwords
import xml.etree.ElementTree as ET
from nltk import word_tokenize
import re
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.cross_validation import KFold
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
import pymorphy2
%matplotlib inline

In [None]:
def tree_df(tree_name):
    tree = ET.parse(tree_name)
    sentences = []
    label = []
    data_item = []
    data = []
    polarity=[]
    root = tree.getroot()
    for sentence in root.iter('sentence'):
        for text in sentence.iter('text'):
            for opinion in sentence.iter('Opinion'):
                data_item.append(text.text)
                data_item.append(opinion.get('category'))
                data_item.append(opinion.get('polarity'))
                data.append(data_item)
                data_item=[]
    df = pd.DataFrame(data, columns = ['sentence', 'category', 'polarity'])
    return df

In [None]:
def tokenizator(text):
    sen_clear = []
    for sentence in np.array(text):
        sen_cl = re.sub(r'[^\w\d\s\-\n\{\}]', " ", sentence, flags=re.M | re.U | re.I)
        words = sen_cl.lower().split()
        words_c = []
        for word in words:
            if word != '-':
                words_c.append(morph.parse(word)[0].normal_form)
        sen_clear.append(words_c)
    return sen_clear

In [None]:
def tokenizator_2(sentence):
    sen_cl = re.sub(r'[^\w\d\s\-\n\{\}]', " ", sentence, flags=re.M | re.U | re.I)
    words = sen_cl.lower().split()
    words_c = []
    for word in words:
        if word != '-':
            words_c.append(morph.parse(word)[0].normal_form)
    return words_c

In [None]:
name = 'se16_ru_rest_train.xml'
df = tree_df(name)
test_name = 'se16_ru_rest_test.xml'
df_test = tree_df(test_name)
morph = pymorphy2.MorphAnalyzer()

In [None]:
from stop_words import get_stop_words
stop_words = get_stop_words('russian')

In [None]:
def classify_SVC(X_train, y_train, X_test, y_test):
    clf = SVC(kernel='linear', random_state=241, probability=True, C=1e5)
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    df_pred = pd.Series(y_pred)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred, average = 'micro')
    recall = metrics.recall_score(y_test, y_pred, average = 'micro')
    f1=metrics.f1_score(y_test, y_pred, average = 'micro')
    return df_pred, accuracy, precision, recall, f1


In [None]:
def classify_logreg(X_train, y_train, X_test, y_test):
    clf = linear_model.LogisticRegression(n_jobs=1, C=1e5)
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    df_pred = pd.Series(y_pred)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred, average = 'micro')
    recall = metrics.recall_score(y_test, y_pred, average = 'micro')
    f1=metrics.f1_score(y_test, y_pred, average = 'micro')
    return df_pred, accuracy, precision, recall, f1


In [None]:
def classify_NB(X_train, y_train, X_test, y_test):
    clf = MultinomialNB()
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    df_pred = pd.Series(y_pred)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred, average = 'micro')
    recall = metrics.recall_score(y_test, y_pred, average = 'micro')
    f1=metrics.f1_score(y_test, y_pred, average = 'micro')
    return df_pred, accuracy, precision, recall, f1

In [None]:
def classify_random_forest(X_train, y_train, X_test, y_test):
    clf = RandomForestClassifier( n_estimators = 100, random_state=241 )
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    df_pred = pd.Series(y_pred)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred, average = 'micro')
    recall = metrics.recall_score(y_test, y_pred, average = 'micro')
    f1=metrics.f1_score(y_test, y_pred, average = 'micro')
    return df_pred, accuracy, precision, recall, f1

# BAG OF WORDS model

In [None]:
%%time
# training
#count_vectorizer = CountVectorizer(
#    analyzer="word", tokenizer=nltk.word_tokenize,
#    preprocessor=None, stop_words=stop_words, max_features=1000) 
count_vectorizer = CountVectorizer(stop_words=stop_words, max_features=1000,analyzer="word")
train_sentences = count_vectorizer.fit_transform(df['sentence'])
test_sentences = count_vectorizer.transform(df_test['sentence'])
train_1 = train_sentences.toarray()
test_1 = test_sentences.toarray()
mlb = mlb = MultiLabelBinarizer()
df_all = pd.concat([df, df_test], axis = 0)
categories = mlb.fit_transform(df_all['category'])
train_2 = categories[:len(df)]
test_2 = categories[len(df):]
y_train = df['polarity']
y_test = df_test['polarity']
X_test_bow = np.concatenate((test_1,test_2), axis=1)
X_train_bow = np.concatenate((train_1,train_2), axis=1)

In [None]:
df_pred_svc, accuracy_bow_SVC, precision_bow_SVC, recall_bow_SVC, f1_bow_SVC = classify_SVC(X_train_bow,y_train,X_test_bow,y_test)
print('accuracy = {:0.5f}, precision = {:0.5f}, recall = {:0.5f}, f1 = {:0.5f}'.format(accuracy_bow_SVC, precision_bow_SVC, recall_bow_SVC, f1_bow_SVC))
print(df_bow_svc = pd.concat([df_test,df_pred_svc], axis = 1))

In [None]:
df_pred_logreg, accuracy_bow_logreg, precision_bow_logreg, recall_bow_logreg, f1_bow_logreg = classify_logreg(X_train_bow,y_train,X_test_bow,y_test)
print('accuracy = {:0.5f}, precision = {:0.5f}, recall = {:0.5f}, f1 = {:0.5f}'.format(accuracy_bow_logreg, precision_bow_logreg, recall_bow_logreg, f1_bow_logreg))
print(df_bow_logreg = pd.concat([df_test,df_pred_logreg], axis = 1))

In [None]:
df_pred_NB, accuracy_bow_NB, precision_bow_NB, recall_bow_NB, f1_bow_NB = classify_NB(X_train_bow,y_train,X_test_bow,y_test)
print('accuracy = {:0.5f}, precision = {:0.5f}, recall = {:0.5f}, f1 = {:0.5f}'.format(accuracy_bow_NB, precision_bow_NB, recall_bow_NB, f1_bow_NB))
print(df_bow_NB = pd.concat([df_test,df_pred_NB], axis = 1))

In [None]:
df_pred_random_forest, accuracy_bow_random_forest, precision_bow_random_forest, recall_bow_random_forest, f1_bow_random_forest = classify_random_forest(X_train_bow,y_train,X_test_bow,y_test)
print('accuracy = {:0.5f}, precision = {:0.5f}, recall = {:0.5f}, f1 = {:0.5f}'.format(accuracy_bow_random_forest, precision_bow_random_forest, recall_bow_random_forest, f1_bow_random_forest))
print(df_bow_random_forest = pd.concat([df_test,df_pred_random_forest], axis = 1))

# TF.IDF

In [None]:
tfidfv = TfidfVectorizer(stop_words=stop_words, max_features=1000,analyzer="word")
train_sentences_tfidf = tfidf.fit_transform(df['sentence'])
test_sentences_tfidf = tfidf.transform(df_test['sentence'])
train_1_tfidf = train_sentences_tfidf.toarray()
test_1_tfidf = test_sentences_tfidf.toarray()
train_2 = categories[:len(df)]
test_2 = categories[len(df):]
X_test_tfidf = np.concatenate((test_1_tfidf,test_2), axis=1)
X_train_tfidf = np.concatenate((train_1_tfidf,train_2), axis=1)

In [None]:
df_pred_tfidf_SVC, accuracy_tfidf_SVC, precision_tfidf_SVC, recall_tfidf_SVC, f1_tfidf_SVC = classify_SVC(X_train_tfidf,y_train,X_test_tfidf,y_test)
print('accuracy = {:0.5f}, precision = {:0.5f}, recall = {:0.5f}, f1 = {:0.5f}'.format(accuracy_tfidf_SVC, precision_tfidf_SVC, recall_tfidf_SVC, f1_tfidf_SVC))
print(df_tfidf_random_forest = pd.concat([df_test,df_pred_tfidf_SVC], axis = 1))

In [None]:
df_pred_tfidf_logreg, accuracy_tfidf_logreg, precision_tfidf_logreg, recall_tfidf_logreg, f1_tfidf_logreg = classify_logreg(X_train_tfidf,y_train,X_test_tfidf,y_test)
print('accuracy = {:0.5f}, precision = {:0.5f}, recall = {:0.5f}, f1 = {:0.5f}'.format(accuracy_tfidf_logreg, precision_tfidf_logreg, recall_tfidf_logreg, f1_tfidf_logreg))
print(df_tfidf_logreg = pd.concat([df_test,df_pred_tfidf_logreg], axis = 1))

In [None]:
df_pred_tfidf_NB, accuracy_tfidf_NB, precision_tfidf_NB, recall_tfidf_NB, f1_tfidf_NB = classify_NB(X_train_tfidf,y_train,X_test_tfidf,y_test)
print('accuracy = {:0.5f}, precision = {:0.5f}, recall = {:0.5f}, f1 = {:0.5f}'.format(accuracy_tfidf_NB, precision_tfidf_NB, recall_tfidf_NB, f1_tfidf_NB))
print(df_tfidf_NB = pd.concat([df_test,df_pred_tfidf_NB], axis = 1))

In [None]:
df_pred_tfidf_forest, accuracy_tfidf_forest, precision_tfidf_forest, recall_forest_NB, f1_tfidf_forest = classify_random_forest(X_train_tfidf,y_train,X_test_tfidf,y_test)
print('accuracy = {:0.5f}, precision = {:0.5f}, recall = {:0.5f}, f1 = {:0.5f}'.format(accuracy_tfidf_forest, precision_tfidf_forest, recall_forest_NB, f1_tfidf_forest))
print(df_tfidf_forest = pd.concat([df_test,df_pred_tfidf_forest], axis = 1))

# Word2Vec

In [36]:
sen_clear = []
for sentence in np.array(df['sentence']):
    sen_cl = re.sub(r'[^\w\d\s\-\n\{\}]', " ", sentence, flags=re.M | re.U | re.I)
    #words = sen_cl.lower().split()
    words = sen_cl.split()
    words_c = []
    for word in words:
        if word != '-':
            words_c.append(word)
    sen_clear.append(words_c)
sen_clear_test = []
for sentence in np.array(df_test['sentence']):
    sen_cl = re.sub(r'[^\w\d\s\-\n\{\}]', " ", sentence, flags=re.M | re.U | re.I)
    #words = sen_cl.lower().split()
    words = sen_cl.split()
    words_c = []
    for word in words:
        if word != '-':
            words_c.append(word)
    sen_clear_test.append(words_c)

In [37]:
model = gensim.models.Word2Vec(sen_clear,size=1000,window=5,min_count=5, workers=4)

2017-06-17 20:19:16,905 : INFO : collecting all words and their counts
2017-06-17 20:19:17,554 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-06-17 20:19:17,580 : INFO : collected 8579 word types from a corpus of 41494 raw words and 3655 sentences
2017-06-17 20:19:17,613 : INFO : Loading a fresh vocabulary
2017-06-17 20:19:17,755 : INFO : min_count=5 retains 1330 unique words (15% of original 8579, drops 7249)
2017-06-17 20:19:17,757 : INFO : min_count=5 leaves 30900 word corpus (74% of original 41494, drops 10594)
2017-06-17 20:19:17,788 : INFO : deleting the raw counts dictionary of 8579 items
2017-06-17 20:19:18,761 : INFO : sample=0.001 downsamples 56 most-common words
2017-06-17 20:19:18,762 : INFO : downsampling leaves estimated 24335 word corpus (78.8% of prior 30900)
2017-06-17 20:19:18,842 : INFO : estimated required memory for 1330 words and 1000 dimensions: 11305000 bytes
2017-06-17 20:19:18,851 : INFO : resetting layer weights
2017-06-17 20:

In [38]:
def makeFeatureVec(words, model, num_features):
    # Function to average all of the word vectors in a given
    # paragraph
    #
    # Pre-initialize an empty numpy array (for speed)
    featureVec = np.zeros((num_features,))
    #
    nwords = 0.
    # 
    # Index2word is a list that contains the names of the words in 
    # the model's vocabulary. Convert it to a set, for speed 
    index2word_set = set(model.wv.index2word)
    #
    # Loop over each word in the review and, if it is in the model's
    # vocaublary, add its feature vector to the total
    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            featureVec = np.add(featureVec,model[word])
    # 
    # Divide the result by the number of words to get the average
    if nwords != 0.0:
        featureVec = np.true_divide(featureVec,nwords)
    else:
        featureVec = np.zeros((num_features,))
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):
    # Given a set of reviews (each one a list of words), calculate 
    # the average feature vector for each one and return a 2D numpy array 
    # 
    # Initialize a counter
    counter = 0.
    # 
    # Preallocate a 2D numpy array, for speed
    reviewFeatureVecs = np.zeros((len(reviews),num_features))
    #reviewFeatureVecs = np.array((len(reviews),num_features))
    # 
    # Loop through the reviews
    for i in range(len(reviews)):
        reviewFeatureVecs[i] = makeFeatureVec(reviews[i], model, num_features)
        
    #for review in reviews:

       # Call the function (defined above) that makes average feature vectors
       #reviewFeatureVecs[counter] = makeFeatureVec(review, model, \
       #    num_features)
       #
       # Increment the counter
       #counter = counter + 1.
    
    return reviewFeatureVecs

In [39]:
# ****************************************************************
# Calculate average feature vectors for training and testing sets,
# using the functions we defined above. Notice that we now use stop word
# removal.

sen_clear_test = []
for sentence in np.array(df_test['sentence']):
    sen_cl = re.sub(r'[^\w\d\s\-\n\{\}]', " ", sentence, flags=re.M | re.U | re.I)
    #words = sen_cl.lower().split()
    words = sen_cl.split()
    words_c = []
    for word in words:
        if word != '-':
            words_c.append(word)
    sen_clear_test.append(words_c)

trainDataVecs = getAvgFeatureVecs( sen_clear, model, 1000 )

print ("Creating average feature vecs for test reviews")

testDataVecs = getAvgFeatureVecs( sen_clear_test, model, 1000 )

Creating average feature vecs for test reviews


In [40]:
df_test_f_w2v, accuracy_w2v, precision_w2v, recall_w2v, f1_w2v = classify_random_forest(trainDataVecs,y_train,testDataVecs,y_test)

In [42]:
print('accuracy = {:0.5f}, precision = {:0.5f}, recall = {:0.5f}, f1 = {:0.5f}'.format(accuracy_w2v, precision_w2v, recall_w2v, f1_w2v))

accuracy = 0.27626, precision = 0.70968, recall = 0.04082, f1 = 0.07719


In [47]:
def classify_SVC2(X_train, y_train, X_test, y_test):
    clf = OneVsRestClassifier(SVC(kernel='linear', random_state=241, probability=True))
    clf.fit(X_train,y_train)
    clf.predict_proba(X_test)
    classes = list(mlb.classes_)
    classes_final = []
    for line in clf.predict_proba(X_test):
        cl = []
        for i in range(len(classes)):
            if line[i] >= 0.2:
                cl.append(classes[i])
        classes_final.append(cl)
    classes_final_s = pd.Series(classes_final)
    df_test_f = pd.concat([df_test,classes_final_s], axis = 1)
    df_test_f.rename(columns={0:"predicted classes"})
    #accuracy=clf.score(X_test,y_test)
    y_pred = clf.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred, average = 'samples')
    recall = metrics.recall_score(y_test, y_pred, average = 'samples')
    f1=metrics.f1_score(y_test, y_pred, average = 'samples')
    #coverage_error = metrics.coverage_error(y_test, clf.predict_proba(X_test))
    return df_test_f, accuracy, precision, recall, f1


In [48]:
df_test_f_w2v_SVC, accuracy_w2v_SVC, precision_w2v_SVC, recall_w2v_SVC, f1_w2v_SVC = classify_SVC2(trainDataVecs,y_train,testDataVecs,y_test)
print('accuracy = {:0.5f}, precision = {:0.5f}, recall = {:0.5f}, f1 = {:0.5f}'.format(accuracy_w2v_SVC, precision_w2v_SVC, recall_w2v_SVC, f1_w2v_SVC))

accuracy = 0.24897, precision = 0.00000, recall = 0.00000, f1 = 0.00000


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [44]:
df_test_f_w2v_logreg, accuracy_w2v_logreg, precision_w2v_logreg, recall_w2v_logreg, f1_w2v_logreg = classify_logreg(trainDataVecs,y_train,testDataVecs,y_test)
print('accuracy = {:0.5f}, precision = {:0.5f}, recall = {:0.5f}, f1 = {:0.5f}'.format(accuracy_w2v_logreg, precision_w2v_logreg, recall_w2v_logreg, f1_w2v_logreg))

accuracy = 0.44417, precision = 0.75109, recall = 0.31911, f1 = 0.44792


In [45]:
df_test_f_w2v_NB, accuracy_w2v_NB, precision_w2v_NB, recall_w2v_NB, f1_w2v_NB = classify_NB(trainDataVecs,y_train,testDataVecs,y_test)
print('accuracy = {:0.5f}, precision = {:0.5f}, recall = {:0.5f}, f1 = {:0.5f}'.format(accuracy_w2v_NB, precision_w2v_NB, recall_w2v_NB, f1_w2v_NB))

ValueError: Input X must be non-negative