# Importing the data set


In [2]:
import os
import pandas as pd


pos_path = ".\\data\\train\\pos\\"
neg_path = ".\\data\\train\\neg\\"

neg_review_list = []
pos_review_list = []
review_list = []
test_set = []

for file in os.listdir(neg_path):
    file_path = os.path.join(neg_path, file)
    fh = open(file_path, 'r', encoding="utf8")
    entry = {'class': 0, 'raw_txt': fh.read()}
    neg_review_list.append(entry)
    fh.close()
    
for file in os.listdir(pos_path):
    file_path = os.path.join(pos_path, file)
    fh = open(file_path, 'r', encoding="utf8")
    entry = {'class': 1, 'raw_txt': fh.read()}
    pos_review_list.append(entry)
    fh.close()

review_list = pd.DataFrame(neg_review_list + pos_review_list)

print("Done")

Done


In [3]:
# sample review of 20 for some quick checking
sample_review = pd.DataFrame(neg_review_list[:10]+pos_review_list[:10])

In [92]:
import re
from nltk.corpus import stopwords
import numpy as np
from textblob import Word, TextBlob

stopWords = set(stopwords.words('english'))

def text_processing(txt):
    processed = re.sub(r'[^\w\s]', '', txt.lower())
    # processed = ' '.join([Word(word).lemmatize() for word in processed.split()])
    # processed = negationHandling(processed.split())
    return processed

def negationHandling(word_list):
    """
    given an matrix of individual words in order, combine "negation,"word" into
    "neg_word"
    """
    negation_words = ("not", "hardly", "barely", "never", "neither", "scarcely",
                      "doesn't", "doesnt", "isn't", "isnt", "wasn't", "wasnt",
                      "shouldn't", "shouldnt", "wouldn't", "wouldnt", "couldn't",
                      "couldnt", "won't", "wont", "can't", "cant", "don't", "dont")
    for index in range(len(word_list) - 1):
        if word_list[index] in negation_words:
            word_list[index + 1] = "neg_" + word_list[index + 1]
    new_list = []
    for word in word_list:
        if word not in negation_words:
            new_list.append(word)
    return ' '.join(new_list)

def processing(df):
    # lowering, removing punctuation and lemmatization
    df['processed'] = df['raw_txt'].apply(lambda x: text_processing(x))

    # Removing stopwords    
    df['processed_no_stopwords'] = df['processed'].apply(lambda x: ' '.join([t for t in x.split(' ')if t not in stopWords]))
                                        
    # Sentiment
    df['sentiment'] = df['processed_no_stopwords'].apply(lambda x: TextBlob(x).sentiment[0])
    
    return df

sample = processing(sample_review)
df = processing(review_list)
print(sample.head())


   class                                            raw_txt  \
0      0  Story of a man who has unnatural feelings for ...   
1      0  Airport '77 starts as a brand new luxury 747 p...   
2      0  This film lacked something I couldn't put my f...   
3      0  Sorry everyone,,, I know this is supposed to b...   
4      0  When I was little my parents took me along to ...   

                                           processed  \
0  story of a man who has unnatural feelings for ...   
1  airport 77 starts as a brand new luxury 747 pl...   
2  this film lacked something i couldnt put my fi...   
3  sorry everyone i know this is supposed to be a...   
4  when i was little my parents took me along to ...   

                              processed_no_stopwords  sentiment  
0  story man unnatural feelings pig starts openin...  -0.067593  
1  airport 77 starts brand new luxury 747 plane l...   0.068553  
2  film lacked something couldnt put finger first...   0.025000  
3  sorry everyone kn

In [95]:
from sklearn.model_selection import train_test_split

# splitting up the data set
features = [f for f in df.columns.values if f not in ['class']]
x_train, x_test, y_train, y_test = train_test_split(df[features], df['class'],
                                                    test_size=0.20, random_state=10)
x_set = df[features]
target_set = df['class']
print(x_train.head())

                                                 raw_txt  \
5610   Devil Hunter gained notoriety for the fact tha...   
3742   What can I say about Seven Pounds...well I wat...   
5692   This film was so predictable, that during the ...   
22213  In an attempt to cash in on the success of Uni...   
23165  Dark comedy? Gallows humor? How does one make ...   

                                               processed  \
5610   devil hunter gained notoriety for the fact tha...   
3742   what can i say about seven poundswell i watche...   
5692   this film was so predictable that during the e...   
22213  in an attempt to cash in on the success of uni...   
23165  dark comedy gallows humor how does one make a ...   

                                  processed_no_stopwords  sentiment  
5610   devil hunter gained notoriety fact dpp video n...   0.007029  
3742   say seven poundswell watched flight seattle to...  -0.176136  
5692   film predictable entire time youre hoping obvi...   0.036243 

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin


class Selector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]

In [98]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import GenericUnivariateSelect,mutual_info_classif,SelectPercentile,f_classif
from sklearn.linear_model import Lasso,Ridge

sentiment = Pipeline([
    ('selector', Selector(key='sentiment'))
])

tf_id_feature = Pipeline([
    ('selector', Selector(key='processed_no_stopwords')),
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=100000))
    #('feature_selection', SelectPercentile(score_func=mutual_info_classif, percentile=50))
])

binary_count_feature = Pipeline([
    ('selector', Selector(key='processed_no_stopwords')),            
    # ('count', CountVectorizer(ngram_range=(1, 2), max_features=100000)),
    ('count', CountVectorizer())
    # ('feature_selection', SelectPercentile(score_func=f_classif, percentile=50))
])

In [35]:
from sklearn.pipeline import FeatureUnion

feats_tfid = FeatureUnion([('text', tf_id_feature),
                           ])

feats_count = FeatureUnion([('text', binary_count_feature),
                            ])


In [None]:

def final_features(features_pipeline):
    features_pipeline.fit(x_train, y_train)
    return features_pipeline.transform(x_train), features_pipeline.transform(x_test)


x_train_final, x_test_final = final_features(feats_tfid)
print(x_train_final.shape)
print(x_test_final.shape)


running


0.8952


In [99]:

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn import tree
from sklearn.ensemble import VotingClassifier

svc_pipeline = Pipeline([
    ('feats', tf_id_feature),
    ('classifier', LinearSVC())
])
svc_pipeline_count = Pipeline([
    ('feats', binary_count_feature),
    ('classifier', LinearSVC())
])

stacking_pipeline = Pipeline([
    ('feats', tf_id_feature),
    ('voting', VotingClassifier(estimators=[('svc', LinearSVC()),
                                            ('mnb', MultinomialNB()),
                                            ('lr', LogisticRegression(C=0.5))],voting='hard'))       
])

logistic_pipeline = Pipeline([
    ('feats', binary_count_feature),
    ('classifier', LogisticRegression())
])

decision_tree_pipeline = Pipeline([
    ('feats', binary_count_feature),
    ('classifier', tree.DecisionTreeClassifier())])

naive_bayes_pipeline = Pipeline([
    ('feats', binary_count_feature),
    ('classifier', MultinomialNB())])

lasso = Pipeline([
    ('feats', tf_id_feature),
    ('classifier', Lasso(alpha=0.3,fit_intercept=True))])

ridge = Pipeline([
    ('feats', tf_id_feature),
    ('classifier', Ridge(alpha=10, fit_intercept=True))])


running




0.8608




0.8786


0.8588


0.7202


In [100]:

def fit_predict(model):
    model.fit(x_train, y_train)
    print(model.score(x_test, y_test))

print("running")
fit_predict(svc_pipeline_count)
fit_predict(logistic_pipeline)
fit_predict(naive_bayes_pipeline)
fit_predict(decision_tree_pipeline)


running




0.8608




0.8786


0.8588


0.7202


In [102]:
from sklearn.model_selection import cross_val_score
scores_svc_tfidf = cross_val_score(svc_pipeline, x_set, target_set, cv=4)
scores_logistic = cross_val_score(logistic_pipeline, x_set, target_set, cv=4)
scores_NB = cross_val_score(naive_bayes_pipeline, x_set, target_set, cv=4)
scores_DT = cross_val_score(decision_tree_pipeline, x_set, target_set, cv=4)



In [107]:
print("Accuracy: %0.3f (+/- %0.2f)" % (scores_svc_tfidf.mean(), scores_svc_tfidf.std() * 2))
print("Accuracy: %0.3f (+/- %0.2f)" % (scores_logistic.mean(), scores_logistic.std() * 2))
print("Accuracy: %0.3f (+/- %0.2f)" % (scores_NB.mean(), scores_NB.std() * 2))
print("Accuracy: %0.3f (+/- %0.2f)" % (scores_DT.mean(), scores_DT.std() * 2))


Accuracy: 0.854 (+/- 0.02)
Accuracy: 0.842 (+/- 0.02)
Accuracy: 0.788 (+/- 0.01)
Accuracy: 0.716 (+/- 0.01)


In [28]:
test_path = ".\\data\\test\\"
test_set = []

numbers = re.compile(r'(\d+)')
def numericalSort(value):
    parts = numbers.split(value)
    parts[1::2] = map(int, parts[1::2])
    return parts

dir_list = sorted(os.listdir(test_path), key=numericalSort)

for file in dir_list:
    file_path = os.path.join(test_path, file)
    fh = open(file_path, 'r', encoding="utf8")
    entry = {'raw_txt': fh.read()}
    test_set.append(entry)
    fh.close()

test_set = pd.DataFrame(test_set)
print(test_set.head())
print(test_set.shape)


                                             raw_txt
0  Thinking that it could only get better was the...
1  For most people, RoboCop 3 is the film that re...
2  I'm pretty sure Poe would have considered this...
3  This is one of those made-for-TV B movies that...
4  Wallace & Gromit have been around for some tim...
(25000, 1)


In [29]:
test_set = processing(test_set)
svc_pipeline.fit(x_train, y_train)
test_predictions = svc_pipeline.predict(test_set)

prediction_df = pd.DataFrame(test_predictions)

prediction_df.columns = ['Category']
print(test_set.head())
print(prediction_df.head())

                                             raw_txt  \
0  Thinking that it could only get better was the...   
1  For most people, RoboCop 3 is the film that re...   
2  I'm pretty sure Poe would have considered this...   
3  This is one of those made-for-TV B movies that...   
4  Wallace & Gromit have been around for some tim...   

                                           processed  \
0  thinking that it could only get better wa the ...   
1  for most people robocop 3 is the film that rea...   
2  im pretty sure poe would have considered this ...   
3  this is one of those madefortv b movie that is...   
4  wallace gromit have been around for some time ...   

                              processed_no_stopwords  sentiment  
0  thinking could get better wa worst assumption ...   0.151250  
1  people robocop 3 film really big disgrace robo...   0.060466  
2  im pretty sure poe would considered travesty f...   0.009722  
3  one madefortv b movie awful kind endearsbr br ...  -0.11031

In [30]:
export_csv = prediction_df.to_csv (r'.\test_results.csv', header=True)
