In [1]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from skmultilearn.problem_transform import LabelPowerset
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

nlp = spacy.load('en_coref_lg')

In [552]:
def replace_pronouns(text):
    input_actual = text
    text = nlp(text)
    text_resolved = text._.coref_resolved
    if text_resolved:
        text = text_resolved
        return text
    else:
        return input_actual

annotated_reviews_df = pd.read_csv('/Users/gielderks/PycharmProjects/NLP_V2/Laptop_Train_Data.csv')
annotated_reviews_df["text_pro"] = annotated_reviews_df['ReviewText'].map(lambda x: replace_pronouns(x))

test_data = pd.read_csv('/Users/gielderks/PycharmProjects/NLP_V2/Laptop_Test_Data.csv')
test_data["text_pro"] = test_data['ReviewText'].map(lambda x: replace_pronouns(x))

In [553]:
def eval3(x):
    x = eval(x)
    return x

annotated_reviews_df['aspects'] = annotated_reviews_df['aspects'].apply(eval3)
test_data['aspects'] = test_data['aspects'].apply(eval3)

annotated_reviews_df = pd.concat([annotated_reviews_df, test_data])

annotated_reviews_df.head()

Unnamed: 0.1,Unnamed: 0,ReviewText,aspects,terms,text_pro
0,0,This computer is absolutely AMAZING!!!,[LAPTOP#GENERAL],,This computer is absolutely AMAZING!!!
1,1,10 plus hours of battery...,[BATTERY#OPERATION_PERFORMANCE],,10 plus hours of battery...
2,2,super fast processor and really nice graphics ...,"[CPU#OPERATION_PERFORMANCE, GRAPHICS#GENERAL]",,super fast processor and really nice graphics ...
3,3,and plenty of storage with 250 gb(though I wil...,[HARD_DISC#DESIGN_FEATURES],,and plenty of storage with 250 gb(though I wil...
4,4,This computer is really fast and I'm shocked a...,"[LAPTOP#OPERATION_PERFORMANCE, LAPTOP#USABILITY]",,This computer is really fast and I'm shocked a...


In [554]:
len(annotated_reviews_df["text_pro"])

2612

In [723]:
### Mapping only first part of aspect term
temp = ['BATTERY',
         'COMPANY',
         'CPU',
         'GENERAL',
         'KEYBOARD',
         'MEMORY',
         'MISCELLANEOUS',
         'MOUSE',
         'PRICE',
         'QUALITY',
         'SOFTWARE']

replace = {'OPERATION_PERFORMANCE' : 'SPEED',
          'HARD_DISC' : 'HARDDISK', 
           'OS' : 'SOFTWARE',
           'DISPLAY' : 'SCREEN'}

def remove_aspects(x):
    
    new_list = []

    for y in x: # [BATTERY#OPERATION_PERFORMANCE, GRAPHICS#GENERAL]
        

        y = y.split("#") #[BATTERY, OPERATION_PERFORMANCE]
        

        if y[0] in temp: #BATTERY                        
            new_list.append(y[0])
            new_list.append(y[1])
            
        elif y[1] in temp: #BATTERY
                        
            new_list.append(y[1])
                
        elif y[0] == 'LAPTOP':
            
            new_list.append(y[0])
            new_list.append(y[1])
            
        elif y[0] in replace.keys():
            new_list.append(replace[y[0]])
            
        else:
            return "REMOVE"
        
    return new_list

In [724]:
### Mapping only first part of aspect term
def only_first_part(x):
    
    temp = []
    
    for y in x:
        
        if 'LAPTOP' in y:
            
            adder = y.split("#")
            
            temp.append(adder[1])
        
        else:

            adder = y.split("#")

            temp.append(adder[0])
            
        
    return list(set(temp))

### Mapping only first part of aspect term
def only_first_partv2(x):
    
    temp = []
    
    for y in x:
        
        if 'LAPTOP' in y:
            
            temp.append(y)
        
        else:

            adder = y.split("#")

            temp.append(adder[0])
            
        
    return list(set(temp))


#OR

### Mapping only first part of aspect term
def only_second_part(x):
    
    temp = []
    
    for y in x:

        adder = y.split("#")

        temp.append(adder[1])
        
    return list(set(temp))

annotated_reviews_df['aspect_final'] = annotated_reviews_df['aspects'].apply(remove_aspects)
annotated_reviews_df['aspect_first'] = annotated_reviews_df['aspects'].apply(only_first_part)
annotated_reviews_df['aspect_firstv2'] = annotated_reviews_df['aspects'].apply(only_first_partv2)
annotated_reviews_df['aspect_second'] = annotated_reviews_df['aspects'].apply(only_second_part)

In [725]:
annotated_reviews_df.head()

Unnamed: 0.1,Unnamed: 0,ReviewText,aspects,terms,text_pro,aspect_final,aspect_first,aspect_firstv2,aspect_second
0,0,This computer is absolutely AMAZING!!!,[LAPTOP#GENERAL],,This computer is absolutely AMAZING!!!,[GENERAL],[GENERAL],[LAPTOP#GENERAL],[GENERAL]
1,1,10 plus hours of battery...,[BATTERY#OPERATION_PERFORMANCE],,10 plus hours of battery...,"[BATTERY, OPERATION_PERFORMANCE]",[BATTERY],[BATTERY],[OPERATION_PERFORMANCE]
2,2,super fast processor and really nice graphics ...,"[CPU#OPERATION_PERFORMANCE, GRAPHICS#GENERAL]",,super fast processor and really nice graphics ...,"[CPU, OPERATION_PERFORMANCE, GENERAL]","[CPU, GRAPHICS]","[CPU, GRAPHICS]","[OPERATION_PERFORMANCE, GENERAL]"
3,3,and plenty of storage with 250 gb(though I wil...,[HARD_DISC#DESIGN_FEATURES],,and plenty of storage with 250 gb(though I wil...,[HARDDISK],[HARD_DISC],[HARD_DISC],[DESIGN_FEATURES]
4,4,This computer is really fast and I'm shocked a...,"[LAPTOP#OPERATION_PERFORMANCE, LAPTOP#USABILITY]",,This computer is really fast and I'm shocked a...,"[LAPTOP, OPERATION_PERFORMANCE, LAPTOP, USABIL...","[OPERATION_PERFORMANCE, USABILITY]","[LAPTOP#USABILITY, LAPTOP#OPERATION_PERFORMANCE]","[OPERATION_PERFORMANCE, USABILITY]"


In [726]:
annotated_reviews_df = annotated_reviews_df[annotated_reviews_df['aspect_final'] != 'REMOVE']
len(annotated_reviews_df)

2572

In [727]:
#Unique aspects
unique_f = []
unique_s = []
unique_final = []

for x in annotated_reviews_df['aspect_first']:
    for y in x:
        unique_f.append(y)
        
for x in annotated_reviews_df['aspect_final']:
    for y in x:
        unique_final.append(y)
        
for x in annotated_reviews_df['aspect_second']:
    for y in x:
        unique_s.append(y)
        
print(list(set(unique_f)))
print("")
print(list(set(unique_s)))
print("")
print(list(set(unique_final)))

['MEMORY', 'OPTICAL_DRIVES', 'BATTERY', 'PRICE', 'MOUSE', 'PORTABILITY', 'CONNECTIVITY', 'DESIGN_FEATURES', 'KEYBOARD', 'POWER_SUPPLY', 'OPERATION_PERFORMANCE', 'QUALITY', 'OS', 'SUPPORT', 'USABILITY', 'MISCELLANEOUS', 'MULTIMEDIA_DEVICES', 'DISPLAY', 'CPU', 'WARRANTY', 'MOTHERBOARD', 'SOFTWARE', 'COMPANY', 'FANS_COOLING', 'GENERAL', 'HARD_DISC', 'HARDWARE', 'SHIPPING', 'PORTS', 'GRAPHICS']

['CONNECTIVITY', 'OPERATION_PERFORMANCE', 'MISCELLANEOUS', 'PRICE', 'QUALITY', 'PORTABILITY', 'USABILITY', 'GENERAL', 'DESIGN_FEATURES']

['CONNECTIVITY', 'OPERATION_PERFORMANCE', 'MISCELLANEOUS', 'CPU', 'MEMORY', 'BATTERY', 'PRICE', 'QUALITY', 'SOFTWARE', 'SCREEN', 'MOUSE', 'PORTABILITY', 'COMPANY', 'USABILITY', 'GENERAL', 'LAPTOP', 'DESIGN_FEATURES', 'KEYBOARD', 'HARDDISK']


In [728]:
print(len(annotated_reviews_df))

2572


In [729]:
# Convert the multi-labels into arrays
mlb = MultiLabelBinarizer()

# choose between predicting first or second part
y = mlb.fit_transform(annotated_reviews_df.aspect_final)
#y = mlb.fit_transform(annotated_reviews_df.aspect_second)

X = annotated_reviews_df["text_pro"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25)

X_test = X_test.reset_index(drop=True)
X_train = X_train.reset_index(drop=True)

print(len(X_train), len(y_train), len(y_test), len(X_test))

# LabelPowerset allows for multi-label classification
# Multi-label classification is the supervised classification task where
# each data instance may be associated with multiple class labels.
# Build a pipeline for multinomial naive bayes classification
# Label Powerset (LP): every labelset is a single class-label in a multi-class problem
text_clf = Pipeline([('vect', CountVectorizer(stop_words = "english",ngram_range=(1, 1))),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', LabelPowerset(MultinomialNB(alpha=1e-1, fit_prior=False))),])

#This explains how pipelines work: https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines

text_clf = text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)

predict_proba = text_clf.predict_proba(X_test)

# # Calculate accuracy
print(accuracy_score(y_test, predicted))
print(f1_score(y_test, predicted, average='micro'))

# # Test if SVM performs better
# from sklearn.linear_model import SGDClassifier
# text_clf = Pipeline([('vect', CountVectorizer()),
#                          ('tfidf', TfidfTransformer()),
#                          ('clf-svm', LabelPowerset(
#                              SGDClassifier(loss='hinge', penalty='l2',
#                                            alpha=1e-3, max_iter=6)))])
# _ = text_clf.fit(X_train, y_train)
# predicted = text_clf.predict(X_test)

# #Calculate accuracy
# print('SVM', np.mean(predicted == y_test))
# print(f1_score(y_test, predicted, average='micro'))
text_clf.steps[2][1].classifier.classes_

1929 1929 643 643
0.4214618973561431
0.6003445305770886


array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [730]:
import pickle
# Train naive bayes on full dataset and save model
text_clf = Pipeline([('vect', CountVectorizer(stop_words = "english",ngram_range=(1, 1))),
                     ('tfidf', TfidfTransformer(use_idf=False)),
                     ('clf', LabelPowerset(MultinomialNB(alpha=1e-1))),])
text_clf = text_clf.fit(X, y)

# save the model to disk
filename = 'naive_model1.pkl'
pickle.dump(text_clf, open(filename, 'wb'))

# save the the fitted binarizer labels
# This is important: it contains the how the multi-label was binarized, so you need to
# load this in the next folder in order to undo the transformation for the correct labels.
filename = 'mlb.pkl'
pickle.dump(mlb, open(filename, 'wb'))

In [731]:
#type(predict_proba.toarray().tolist()[1])
check_row = 36
print(mlb.classes_[predicted.toarray().tolist()[check_row].index(1)])
print(mlb.classes_[y_test.tolist()[check_row].index(1)])

LAPTOP
BATTERY


In [732]:
print('predicted = ', mlb.classes_[predicted.toarray().tolist()[check_row].index(1)])
print(' ')
print('ACTUAL = ', mlb.classes_[y_test.tolist()[check_row].index(1)])
print([y_test[check_row].tolist().index(1)])
print(' ')
print('Review text = ', X_test[check_row])
print(' ')
df = pd.DataFrame({'aspect' : mlb.classes_, 'proba' : predict_proba.toarray().tolist()[check_row]}).sort_values('proba', ascending=False)
print(df)
print(' ')
print(' ')

# print('predicted array = ', predicted.toarray()[check_row])
# print('Actual array = ', y_test[check_row])

# print(' ')
# print(mlb.classes_)
# print(' ')

print('Review text = ', X_test[check_row])
print(' ')

g = [i for i, e in enumerate(y_test[check_row]) if e == 1]
print('multiple classes actual =', g)
g = [i for i, e in enumerate(predicted.toarray().tolist()[check_row]) if e == 1]
print('multiple classes predicted=', g)
print(' ')

predicted =  LAPTOP
 
ACTUAL =  BATTERY
[0]
 
Review text =  It's light, fast and has great battery!!
 
                   aspect     proba
8                  LAPTOP  0.656549
12  OPERATION_PERFORMANCE  0.569654
4         DESIGN_FEATURES  0.395344
5                 GENERAL  0.319401
15                QUALITY  0.274347
0                 BATTERY  0.187239
18              USABILITY  0.175170
10          MISCELLANEOUS  0.095044
17               SOFTWARE  0.093887
14                  PRICE  0.092621
7                KEYBOARD  0.083709
13            PORTABILITY  0.068938
16                 SCREEN  0.056682
11                  MOUSE  0.049526
2            CONNECTIVITY  0.040566
3                     CPU  0.038961
6                HARDDISK  0.037590
1                 COMPANY  0.031970
9                  MEMORY  0.018845
 
 
Review text =  It's light, fast and has great battery!!
 
multiple classes actual = [0, 4, 8, 12]
multiple classes predicted= [8, 12]
 


In [733]:
predicted_mapped = []

actual_mapped = []

for row in list(predicted.toarray()):
    
    g = [i for i, e in enumerate(list(row)) if e == 1]
    
    leng = len(g)
    
    add = []
    
    for number in g:
        
        add.append(mlb.classes_[number])
    
    predicted_mapped.append(add)
    
for row in list(y_test):
    
    g = [i for i, e in enumerate(list(row)) if e == 1]
    
    leng = len(g)
    
    add = []
    
    for number in g:
        
        add.append(mlb.classes_[number])
    
    actual_mapped.append(add)


In [734]:
validation_df = pd.DataFrame({'predicted' : predicted_mapped, 'actual' : actual_mapped, 'ReviewText' : X_test})
len(validation_df)

643

In [735]:
data_list = []
partial_recall = []

for index, row in validation_df.iterrows():
    
    ad_list = []
    ad_p_list = []
    
    for item in row['predicted']:
        
        if item in row['actual']:
            ad_list.append(item)
            
        splitted = item.split('#')
        item_split = splitted[0]
        
        for thing in row['actual']:
            t_splitted = thing.split('#')
            t_split = t_splitted[0]
            
            if t_split == item_split:
                if t_split not in ad_p_list:    
                    ad_p_list.append(t_split)
                else:
                    continue

    partial_recall.append(ad_p_list)
    data_list.append(ad_list)
                

validation_df['Test_full_recall'] = data_list
validation_df['Test_partial_recall'] = partial_recall


In [736]:
len(validation_df[validation_df['Test_partial_recall'].astype(str) == '[]'])

155

In [737]:
len(validation_df[validation_df['Test_full_recall'].astype(str) == '[]'])

155

In [738]:
len(validation_df)

643

In [740]:
validation_df[0:100]

Unnamed: 0,predicted,actual,ReviewText,Test_full_recall,Test_partial_recall
0,[MISCELLANEOUS],[MISCELLANEOUS],"It works great for general internet use, Micro...",[MISCELLANEOUS],[MISCELLANEOUS]
1,[GENERAL],[GENERAL],Hate Windows 8.1!,[GENERAL],[GENERAL]
2,[QUALITY],[QUALITY],3) Horrible customer support-they lost my lapt...,[QUALITY],[QUALITY]
3,[GENERAL],[GENERAL],"I am pleased with my decision, however I have ...",[GENERAL],[GENERAL]
4,[GENERAL],[GENERAL],"At first when I got this product, I loved this...",[GENERAL],[GENERAL]
5,"[GENERAL, PRICE]","[LAPTOP, OPERATION_PERFORMANCE, PRICE]",It wont wow you with It speed but who can comp...,[PRICE],[PRICE]
6,[QUALITY],[QUALITY],When it come time for warranty service to Tosh...,[QUALITY],[QUALITY]
7,"[OPERATION_PERFORMANCE, SOFTWARE, USABILITY]",[GENERAL],5) Cut my losses and sold my losses for parts,[],[]
8,"[GENERAL, QUALITY]",[GENERAL],There is a chance mine is defective but regard...,[GENERAL],[GENERAL]
9,"[DESIGN_FEATURES, LAPTOP, USABILITY]","[DESIGN_FEATURES, LAPTOP, USABILITY]",the hinge design forced you to place various c...,"[DESIGN_FEATURES, LAPTOP, USABILITY]","[DESIGN_FEATURES, LAPTOP, USABILITY]"
