In [1]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import plotly.plotly as py
import matplotlib
%matplotlib inline

matplotlib.rcParams['figure.figsize'] = (10.0, 8.0)


## Import raw data


In [2]:
raw_data = pd.read_csv("Consumer_Coded_Comments_2017.csv", index_col=None)

In [3]:
#raw_data.columns

In [4]:
#raw_data.head(3)

## TF-IDF based Vectorization

In [5]:
import nltk
import string
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer

token_dict = {}
stemmer = PorterStemmer()
Y = [] # Sensiment labels

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

for i in range(raw_data.shape[0]):
    record = raw_data.iloc[i]
    if not pd.isnull(record.Comment) and not pd.isnull(record.Sentiment):
        lowers = record.Comment.lower()

        no_punctuation = lowers.translate(None, string.punctuation)
        no_punctuation = unicode(no_punctuation, errors='replace')

        token_dict[i] = no_punctuation
        
        # the corresponding y
        Y.append(record.Sentiment)
## so far: token_dict is {index:cleaned comment, ...}, Y is the corresponding sentiment.

if not os.path.isfile('TF-IDF.csv'): 
    #this can take some time
    tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english', ngram_range=(1,1))
    tfs = tfidf.fit_transform(token_dict.values())

    df = pd.DataFrame(data=tfs.toarray(), 
                  index = token_dict.keys(), 
                  columns=tfidf.get_feature_names())
    df.to_csv('TF-IDF.csv', encoding='utf-8')
else:
    df = pd.read_csv('TF-IDF.csv')    
    
df_sentiment = pd.DataFrame()
df_sentiment['sentiment'] = Y
df_sentiment.to_csv("Sentiments.csv")

In [11]:
from sklearn.model_selection import train_test_split
print "here"
all_X_array = df.as_matrix()
print "here too"
all_Y_array = Y
print "here three"
data = train_test_split(all_X_array, all_Y_array, test_size=0.2, random_state=10)
X_train, X_test, Y_train, Y_test = data
print X_train[0], X_test[0], Y_train[0], Y_test[0]

here
here too
here three
[17954.     0.     0. ...     0.     0.     0.] [12114.     0.     0. ...     0.     0.     0.] Positive Positive


In [12]:
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils.extmath import density
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier


def benchmark(clf, data):
    X_train, X_test, Y_train, Y_test = data
    print X_train[0], X_test[0], Y_train[0], Y_test[0]
    clf.fit(X_train, Y_train)
    pred = clf.predict(X_test)
    precision, recall, fbeta_score, support = metrics.precision_recall_fscore_support(
        Y_test, pred, average=None)
    print metrics.classification_report(Y_test, pred)



clf_knn = KNeighborsClassifier(n_neighbors=10)

clf_MLP = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(100, 5, 3, ), random_state=1)

clf_rf= RandomForestClassifier(
            n_estimators=100,
            max_depth=None,
            min_samples_split=2,
            random_state=0)

clf_lsvc = LinearSVC()
clf_dt = DecisionTreeClassifier(
            max_depth=None, min_samples_split=6, random_state=0)
clf_adaboost = AdaBoostClassifier(n_estimators=200)

X_train, X_test, Y_train, Y_test = data

print "Training data size:", len(Y_train)
print "Test data size:", len(Y_test)
labels = list(set(Y_test))
print "#of classes:", len(labels)

print('=' * 50)
print("Neural Network-MLP")
benchmark(clf_MLP, data)

print('=' * 50)
print("SVM")
benchmark(clf_lsvc, data)

print('=' * 50)
print("Decision Tree")
benchmark(clf_dt, data)

print('=' * 50)
print("Random Forest")
benchmark(clf_rf, data)

print('=' * 50)
print("AdaBoost")
benchmark(clf_adaboost, data)

Training data size: 18722
Test data size: 4681
#of classes: 3
Neural Network-MLP
[17954.     0.     0. ...     0.     0.     0.] [12114.     0.     0. ...     0.     0.     0.] Positive Positive
             precision    recall  f1-score   support

   Negative       0.93      1.00      0.96       902
    Neutral       1.00      0.63      0.77       589
   Positive       0.96      1.00      0.98      3190

avg / total       0.96      0.95      0.95      4681

SVM
[17954.     0.     0. ...     0.     0.     0.] [12114.     0.     0. ...     0.     0.     0.] Positive Positive
             precision    recall  f1-score   support

   Negative       1.00      0.86      0.92       902
    Neutral       0.15      1.00      0.26       589
   Positive       0.00      0.00      0.00      3190

avg / total       0.21      0.29      0.21      4681

Decision Tree
[17954.     0.     0. ...     0.     0.     0.] [12114.     0.     0. ...     0.     0.     0.] Positive Positive
             precision 

In [34]:
import csv



def show_uncorrect(classifier, test_X, y_real, raw_data):
    y_predict = classifier.predict(test_X)
    writer = csv.writer(open("mislabel.csv", 'w'))
    for i in xrange(len(y_predict)):
        if y_predict[i] != y_real[i]:
            
            writer.writerow([y_real[i], y_predict[i], raw_data.iloc[i].Comment])
            print "{},{},{} \n".format(y_real[i], y_predict[i], raw_data.iloc[i].Comment)

mis_dict = show_uncorrect(clf_MLP, X_test, Y_test, raw_data)


Neutral,Positive,By what for now I didn't of virus problem, which is positive. Nevertheless, although I have a two year subscription, I have received several very incentives messages from McAfee to resubscribe me after a year only claiming that I didn't have any antivirus! Fortunately, I am not been fooled, otherwise, I would have paid a year for nothing. 

Neutral,Negative,because I renewed with you at the given time but you keep telling me I am no longer covered and you are too expensive. 

Neutral,Positive,I like the way you can use McAfee on several devices however I kept getting alerts saying I wasn't up to date but I was!  Which was frustrating. 

Neutral,Positive,I'm tired of all the McAfee pop-ups every time I open the internet, telling me my firewall is down.  I paid a hefty price for one year's coverage, and yet you keep trying to sell me more coverage. SERIOUSLY  looking at getting rid of this product, and company; I am looking for a replacement. 

Neutral,Positive,Bad custo

In [23]:
from sklearn.externals import joblib
joblib.dump(clf_adaboost, "1.0adaboost.pk1")

['1.0adaboost.pk1']

In [None]:
#loaded_one = joblib.load("0.82_lsvc.pk1")
#loaded_one.predict(asdasdasdasdasd)

In [None]:
def clear_input(list_string):
    cleared_token = {}
    for i in xrange(len(list_string)):
        comment = list_string[i]
        comment = comment.lower() # lower case
        comment = lowers.translate(None, string.punctuation) # remove puncu
        comment = unicode(comment, errors='replace')
        cleared_token[i] = comment
    
    tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english', ngram_range=(1,1))
    tfs = tfidf.fit_transform(cleared_token.values())
    
    df = pd.DataFrame(data=tfs.toarray(), 
                  index = cleared_token.keys(), 
                  columns=tfidf.get_feature_names())
    print "customize input: ", df
    
    df = df.as_matrix()
    return df
print "old one", df
answer = clear_input(["i have no idea this can work"])
clf_lsvc.predict([X_train[0]])