### Sentiment Analysis on Movie Reviews

In [127]:
import pandas as pd
import numpy as np

from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier
from sklearn.cross_validation import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

### 1. Pre processing

In [2]:
read_file = pd.read_csv('./resource/train.tsv', sep='\t')
test_file = pd.read_csv('./resource/test.tsv', sep='\t')

corpus_file = pd.read_excel('./resource/harvard_copus_excel.xlsx')

### 하버드 감정사전
#### 어떻게 이용할것인가?

In [13]:
corpus_file.tail()

Unnamed: 0,Entry,Source,Positiv,Negativ,Pstv,Affil,Ngtv,Hostile,Strong,Power,...,Anomie,NegAff,PosAff,SureLw,If,NotLw,TimeSpc,FormLw,Othtags,Defined
11783,ZENITH,H4,Positiv,,,,,,,,...,,,,,,,,,Noun,|
11784,ZERO,H4Lvd,,,,,,,,,...,,,,,,,,,DET,|
11785,ZEST,H4,Positiv,,,,,,Strong,,...,,,,,,,,,Noun,|
11786,ZINC,H4Lvd,,,,,,,,,...,,,,,,,,,Noun,|
11787,ZONE,H4Lvd,,,,,,,,,...,,,,,,,,,Noun,|


In [3]:
print(type(read_file))

<class 'pandas.core.frame.DataFrame'>


### Phrase -> dfx , Sentiment -> dfy

In [3]:
dfx = read_file.ix[:,2]
dfy = read_file.ix[:,3]

df_test = read_file.ix[:,2:]
print(dfx.shape)

(156060,)


In [6]:
read_file.tail()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
156055,156056,8544,Hearst 's,2
156056,156057,8544,forced avuncular chortles,1
156057,156058,8544,avuncular chortles,3
156058,156059,8544,avuncular,2
156059,156060,8544,chortles,2


In [7]:
test_file.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


### 2.Feature Engneering

#### 2.0 tokenizer

In [180]:
from nltk.stem import WordNetLemmatizer 
import re
stemmer = WordNetLemmatizer()

def stem_tokens(tokens, stemmer):
       
    stemming = [stemmer.lemmatize(phrase) for phrase in tokens]
    
    return stemming

def tokenizer1(words):

    filter_words = re.sub(r'[^a-zA-Z]', " ", words)
    tokens = nltk.word_tokenize(filter_words)
    stems = stem_tokens(tokens, stemmer)
    return stems

In [4]:
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

def tokenizer_list(doc):
    tagged_list = pos_tag(word_tokenize(doc))
    return tagged_list

#### 2.0.1 Train, Test set 나누기.

In [178]:
x_train, x_test, y_train, y_test = train_test_split(dfx, dfy, train_size = 40000, test_size = 10000)

In [39]:
read_file.ix[:,2:].head()

Unnamed: 0,Phrase,Sentiment
0,A series of escapades demonstrating the adage ...,1
1,A series of escapades demonstrating the adage ...,2
2,A series,2
3,A,2
4,series,2


In [6]:
tk_list =[(tokenizer_list(row)) for row in x_train]

In [12]:
print(tk_list[10])

[('a', 'DT'), ('delightful', 'JJ'), ('romantic', 'JJ'), ('comedy', 'NN'), ('with', 'IN'), ('plenty', 'NN'), ('of', 'IN'), ('bite', 'NN'), ('.', '.')]


#### 2.1 Count Vectorizer

In [30]:
print(x_train)

124232                         routine Hollywood frightfest
79985     Rarely has skin looked as beautiful , desirabl...
106474                             Schneider 's performance
127221                             delicious and delicately
126353                                  with terrific flair
140985    does n't this film have that an impressionable...
36526       rough-around-the-edges , low-budget constraints
124620                                           getting at
68655         to your lover when you wake up in the morning
47420                               stand up in the theater
110538                                  is a convincing one
21538                                  titular character 's
5545                                         things so nice
25410                                               are n't
113974                                               Irwins
141234                              to retrieve her husband
49251                                   

In [48]:
cv = CountVectorizer(ngram_range=(1,2))
phrase_corpus = cv.fit_transform(dfx)


In [49]:
x_train_cv, x_test_cv, y_train_cv, y_test_cv = train_test_split(phrase_corpus, dfy, train_size = 10000, test_size = 10000)

#### 2.2 TF-IDF

In [50]:
import re, nltk
tv = TfidfVectorizer(analyzer='word')

In [51]:
tfidv = tv.fit(dfx)

In [14]:
tfidv.vocabulary_

{u'writings': 9831,
 u'foul': 3488,
 u'four': 3492,
 u'woods': 9783,
 u'hanging': 3943,
 u'woody': 9784,
 u'comically': 1677,
 u'conjure': 1785,
 u'crooned': 2010,
 u'wizardry': 9760,
 u'originality': 6065,
 u'superficially': 8521,
 u'lore': 5181,
 u'lord': 5180,
 u'immature': 4325,
 u'bile': 893,
 u'screaming': 7565,
 u'scholar': 7531,
 u'wooden': 9782,
 u'succession': 8473,
 u'stereotypical': 8298,
 u'straight': 8350,
 u'sturm': 8422,
 u'tired': 8896,
 u'switchblade': 8607,
 u'bacon': 689,
 u'270': 52,
 u'elegant': 2771,
 u'second': 7602,
 u'valiant': 9391,
 u'shrugging': 7820,
 u'admire': 201,
 u'ruthless': 7422,
 u'fingers': 3322,
 u'negated': 5823,
 u'payoff': 6287,
 u'groupie': 3857,
 u'succumb': 8475,
 u'shocks': 7779,
 u'widget': 9693,
 u'hero': 4062,
 u'avert': 651,
 u'intentioned': 4562,
 u'divertissement': 2497,
 u'here': 4059,
 u'china': 1475,
 u'cult': 2037,
 u'natured': 5797,
 u'transfixes': 9004,
 u'substance': 8448,
 u'uplifting': 9352,
 u'pretensions': 6665,
 u'elabora

In [52]:
tfidv_trans = tfidv.transform(dfx)

In [17]:
tfidv_trans.data

array([ 0.56053582,  0.44600286,  0.69776862, ...,  0.44127075,
        0.48108947,  1.        ])

In [53]:
x_train_tf, x_test_tf, y_train_tf, y_test_tf = train_test_split(tfidv_trans, dfy, train_size = 10000, test_size = 5000)

### 3.Model

### 3-1.SVC ( with countvectorizer)

In [55]:
model1= SVC(kernel='linear', probability=True).fit(x_train_cv, y_train_cv)

 #### svc kernel 다른 모델은 precision 값이 낮아 사용할 수 없다.

In [61]:
#model2= SVC(kernel='rbf').fit(x_train_cv, y_train_cv)

### 3-1.Predict


In [58]:
# SVM Report
print("svc : linear Model")
y_pred = model1.predict(x_test_cv)
print(classification_report(y_test_cv, y_pred))
print('accuracy :'+ str(accuracy_score(y_test_cv, y_pred)))
print('*' * 50)

svc : linear Model
             precision    recall  f1-score   support

          0       0.36      0.26      0.30       438
          1       0.42      0.29      0.34      1724
          2       0.64      0.81      0.72      5105
          3       0.47      0.36      0.41      2129
          4       0.43      0.27      0.33       604

avg / total       0.54      0.57      0.55     10000

accuracy :0.5707
**************************************************


In [62]:
# SVM Report
print("svc : rbf Model")
y_pred2 = model2.predict(x_test_cv)
print(classification_report(y_test_cv, y_pred2))
print('accuracy : ' + accuracy_score(y_test_cv, y_pred2))
print('*' * 50)

svc : rbf Model
             precision    recall  f1-score   support

          0       0.00      0.00      0.00       234
          1       0.00      0.00      0.00       865
          2       0.51      1.00      0.67      2545
          3       0.00      0.00      0.00      1051
          4       0.00      0.00      0.00       305

avg / total       0.26      0.51      0.34      5000

**************************************************


### 3-2.SVC (with tf-idf)

In [67]:
model1_tf = SVC(kernel='linear', probability=True).fit(x_train_tf, y_train_tf)

### 3-2.Predict

In [69]:
# SVM Report
print("svc : linear Model with tf-idf")
y_pred_tf = model1_tf.predict(x_test_tf)
print(classification_report(y_test_tf, y_pred_tf))

svc : linear Model with tf-idf
             precision    recall  f1-score   support

          0       0.29      0.04      0.07       216
          1       0.47      0.22      0.30       902
          2       0.60      0.87      0.71      2584
          3       0.47      0.35      0.40      1015
          4       0.54      0.11      0.18       283

avg / total       0.54      0.57      0.52      5000



In [71]:
print accuracy_score(y_test_tf, y_pred_tf)

0.5696


### 4. SGDClassifier

#### 4-1 SGD(with CounterVectorizer)

In [34]:
sgd = SGDClassifier(loss='log', n_iter=500, random_state=0).fit(x_train_cv, y_train_cv)

In [35]:
y_pred_cv = sgd.predict(x_test_cv)
print(classification_report(y_test_cv, y_pred_cv))

             precision    recall  f1-score   support

          0       0.52      0.17      0.25       245
          1       0.45      0.25      0.32       812
          2       0.62      0.89      0.73      2584
          3       0.49      0.32      0.39      1059
          4       0.45      0.14      0.21       300

avg / total       0.55      0.58      0.54      5000



In [39]:
sgd1 = SGDClassifier(loss='perceptron', n_iter=500, random_state=0).fit(x_train_cv, y_train_cv)

In [40]:
y_pred1 = sgd1.predict(x_test_cv)
print(classification_report(y_test_cv, y_pred1))

             precision    recall  f1-score   support

          0       0.35      0.26      0.30       245
          1       0.38      0.25      0.30       812
          2       0.64      0.82      0.72      2584
          3       0.44      0.33      0.38      1059
          4       0.33      0.20      0.25       300

avg / total       0.52      0.56      0.53      5000



In [42]:
sgd2 = SGDClassifier(loss='modified_huber', n_iter=1000, random_state=0).fit(x_train_cv, y_train_cv)

In [47]:
y_pred2 = sgd1.predict(x_test_cv)
print(classification_report(y_test_cv, y_pred2))

             precision    recall  f1-score   support

          0       0.35      0.26      0.30       245
          1       0.38      0.25      0.30       812
          2       0.64      0.82      0.72      2584
          3       0.44      0.33      0.38      1059
          4       0.33      0.20      0.25       300

avg / total       0.52      0.56      0.53      5000



#### 4-2 SGD(with Tf - Idf)

In [48]:
sgd_tf = SGDClassifier(loss='log', n_iter = 1000, random_state=0).fit(x_train_tf, y_train_tf)

In [49]:
y_pred2_tf = sgd_tf.predict(x_test_tf)
print(classification_report(y_test_tf, y_pred2_tf))

             precision    recall  f1-score   support

          0       0.67      0.02      0.04       222
          1       0.48      0.13      0.21       898
          2       0.57      0.94      0.71      2569
          3       0.44      0.22      0.29      1025
          4       0.67      0.07      0.13       286

avg / total       0.54      0.56      0.47      5000



In [50]:
print(accuracy_score(y_test_tf, y_pred2_tf))

0.5556


### 5. Pipeline

#### 5.1 CountVecotorizer Pipeline

In [24]:
clf_1 = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenizer_list)),
        (('clf', SVC(kernel='linear')))
    ])

In [25]:
clf_1.fit(x_train, y_train)

Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        st...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [32]:
print(classification_report(y_test, clf_1.predict(x_test)))

             precision    recall  f1-score   support

          0       0.34      0.27      0.30       421
          1       0.44      0.32      0.37      1725
          2       0.66      0.82      0.73      5197
          3       0.50      0.38      0.43      2091
          4       0.40      0.24      0.30       566

avg / total       0.56      0.59      0.56     10000



In [33]:
print(accuracy_score(y_test, clf_1.predict(x_test)))

0.5853


#### 5.2 Tf-Idf Pipeliine

In [40]:
clf_2 = Pipeline([
        ('tv' , TfidfVectorizer(analyzer = 'word', tokenizer=tokenizer_list)),
        (('clf', SVC(kernel='linear')))
    ])

In [41]:
clf_2.fit(x_train, y_train)

Pipeline(steps=[('tv', TfidfVectorizer(analyzer='word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
 ...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [42]:
print(classification_report(y_test, clf_2.predict(x_test)))

             precision    recall  f1-score   support

          0       0.29      0.05      0.08       421
          1       0.42      0.23      0.30      1725
          2       0.60      0.86      0.71      5197
          3       0.46      0.32      0.38      2091
          4       0.48      0.11      0.18       566

avg / total       0.52      0.56      0.51     10000



In [43]:
print(accuracy_score(y_test, clf_2.predict(x_test)))

0.5612


In [44]:
clf_3 = Pipeline([
        ('vect', CountVectorizer(ngram_range=(1,2))),
        (('sgd', SGDClassifier(loss='log', n_iter = 500, random_state=0)))
    ])

In [45]:
clf_3.fit(x_train, y_train)

Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        st...      penalty='l2', power_t=0.5, random_state=0, shuffle=True, verbose=0,
       warm_start=False))])

In [46]:
print(classification_report(y_test, clf_3.predict(x_test)))

             precision    recall  f1-score   support

          0       0.36      0.14      0.20       421
          1       0.46      0.25      0.32      1725
          2       0.62      0.89      0.73      5197
          3       0.49      0.29      0.37      2091
          4       0.50      0.19      0.28       566

avg / total       0.55      0.58      0.53     10000



In [47]:
print(accuracy_score(y_test, clf_3.predict(x_test)))

0.5813


In [65]:
clf_4 = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenizer_list)),
        (('sgd', SGDClassifier(loss='log', n_iter = 1000, random_state=0)))
    ])

In [66]:
clf_4.fit(x_train, y_train)

Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        st...      penalty='l2', power_t=0.5, random_state=0, shuffle=True, verbose=0,
       warm_start=False))])

In [67]:
print(classification_report(y_test, clf_4.predict(x_test)))

             precision    recall  f1-score   support

          0       0.32      0.12      0.17       421
          1       0.45      0.23      0.30      1725
          2       0.62      0.89      0.73      5197
          3       0.48      0.28      0.35      2091
          4       0.45      0.17      0.24       566

avg / total       0.54      0.58      0.53     10000



In [68]:
print(accuracy_score(y_test, clf_4.predict(x_test)))

0.5765


#### 6. Class model

In [83]:
class_model1 = BaggingClassifier(DecisionTreeClassifier(),
                           bootstrap_features=True,
                           random_state=0).fit(x_train_cv, y_train_cv)

In [84]:
print(classification_report(y_test, class_model1.predict(x_test_cv)))

             precision    recall  f1-score   support

          0       0.03      0.01      0.01       421
          1       0.16      0.08      0.11      1725
          2       0.52      0.75      0.61      5197
          3       0.22      0.13      0.16      2091
          4       0.04      0.01      0.02       566

avg / total       0.34      0.43      0.37     10000



In [87]:
print(accuracy_score(y_test, class_model1.predict(x_test_cv)))

0.4308


In [89]:
rf_model = RandomForestClassifier().fit(x_train_cv, y_train_cv)

In [90]:
print(classification_report(y_test, rf_model.predict(x_test_cv)))

             precision    recall  f1-score   support

          0       0.05      0.02      0.03       421
          1       0.17      0.07      0.10      1725
          2       0.52      0.80      0.63      5197
          3       0.21      0.09      0.13      2091
          4       0.04      0.01      0.02       566

avg / total       0.35      0.45      0.37     10000



In [92]:
print(accuracy_score(y_test, rf_model.predict(x_test_cv)))

0.4476


In [198]:
p_mnb = Pipeline([
        ('cv', CountVectorizer(tokenizer=tokenizer1, ngram_range=(1,2))),
        (('mnb', MultinomialNB(alpha=0.01)))
    ])

In [None]:
p_mnb.fit(x_train, y_train)

In [None]:
print(classification_report(y_test, p_mnb.predict(x_test)))

In [None]:
print(accuracy_score(y_test, p_mnb.predict(x_test)))

In [193]:
p_mnb1 = Pipeline([
        ('tv' , TfidfVectorizer(analyzer = 'word', ngram_range=(1,1), tokenizer=tokenizer1)),
        (('mnb', MultinomialNB()))
    ])

In [194]:
p_mnb1.fit(x_train, y_train)

Pipeline(steps=[('tv', TfidfVectorizer(analyzer='word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
 ...rue,
        vocabulary=None)), ('mnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [195]:
print(classification_report(y_test, p_mnb1.predict(x_test)))

             precision    recall  f1-score   support

          0       0.75      0.01      0.01       437
          1       0.49      0.14      0.21      1787
          2       0.56      0.93      0.70      5059
          3       0.51      0.26      0.34      2121
          4       0.33      0.01      0.01       596

avg / total       0.53      0.55      0.47     10000



In [196]:
print(accuracy_score(y_test, p_mnb1.predict(x_test)))

0.5498
