In [2]:
import os
import pandas as pd


# Directories holding one review per file; the label comes from the folder.
pos_path = ".\\data\\train\\pos\\"
neg_path = ".\\data\\train\\neg\\"


def _read_reviews(dir_path, label):
    """Read every file in ``dir_path`` into a list of labelled records.

    Parameters
    ----------
    dir_path : str
        Directory whose entries are each opened as a UTF-8 text file.
    label : int
        Value stored under the ``'class'`` key of every record.

    Returns
    -------
    list of dict
        One ``{'class': label, 'raw_txt': <file contents>}`` per directory
        entry, in the order returned by ``os.listdir``.
    """
    records = []
    for file_name in os.listdir(dir_path):
        file_path = os.path.join(dir_path, file_name)
        # `with` guarantees the handle is closed even if read() raises
        # (e.g. on a decoding error), which the manual open/close did not.
        with open(file_path, 'r', encoding="utf8") as fh:
            records.append({'class': label, 'raw_txt': fh.read()})
    return records


# class 0 = negative reviews, class 1 = positive reviews
neg_review_list = _read_reviews(neg_path, 0)
pos_review_list = _read_reviews(pos_path, 1)

# Negative rows first, then positive rows, in a single frame.
review_list = pd.DataFrame(neg_review_list + pos_review_list)
test_set = []

print("Done")

Done


In [3]:
# Small 20-row frame: the first 10 negative records followed by the first 10
# positive ones.
sample_records = neg_review_list[:10] + pos_review_list[:10]
sample_review = pd.DataFrame(sample_records)

In [4]:
import re
from nltk.corpus import stopwords
import numpy as np
from textblob import Word, TextBlob

# English stop-word vocabulary from NLTK, kept in a set for fast membership tests.
stopWords = set(stopwords.words('english'))


def text_processing(txt):
    """Normalize one raw review into a space-separated string of lemmas.

    Steps: lower-case the text, delete every character that is neither a word
    character nor whitespace, split on whitespace, and run each token through
    textblob's ``Word.lemmatize()``.

    Parameters
    ----------
    txt : str
        Raw review text.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.
    """
    lowered = txt.lower()
    # Punctuation is removed outright (not replaced by a space), so tokens that
    # were separated only by punctuation are fused together.
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    lemmas = (Word(token).lemmatize() for token in without_punctuation.split())
    return ' '.join(lemmas)


def processing(df):
    """Add cleaned-text and hand-crafted numeric feature columns to ``df``.

    The frame is modified in place and also returned. Relies on the
    notebook-level names ``text_processing``, ``stopWords``, ``np`` and
    ``TextBlob``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``raw_txt`` column of strings.

    Returns
    -------
    pandas.DataFrame
        The same object, with these columns written (in this order):
        ``processed``, ``processed_no_stopwords``, ``length``, ``words``,
        ``words_not_stopword``, ``words_stopword``, ``avg_word_length``,
        ``upper``, ``sentiment``.
    """
    # lowering, removing punctuation and lemmatization
    df['processed'] = df['raw_txt'].apply(text_processing)

    # Tokenize and stop-word-filter each document exactly once and reuse the
    # result below. str.split() with no argument returns [] for an empty
    # string, whereas split(' ') returned [''] and made an empty document
    # count as one (non-stop) word.
    tokens = df['processed'].apply(lambda text: text.split())
    content_tokens = tokens.apply(
        lambda toks: [t for t in toks if t not in stopWords])

    # Removing stopwords
    df['processed_no_stopwords'] = content_tokens.apply(' '.join)

    # total length of sentence in characters
    df['length'] = df['processed'].apply(lambda text: float(len(text)))

    # get number of words
    df['words'] = tokens.apply(lambda toks: float(len(toks)))

    # num words that are not stopwords
    df['words_not_stopword'] = content_tokens.apply(
        lambda toks: float(len(toks)))

    # num words that are stopwords
    df['words_stopword'] = tokens.apply(
        lambda toks: float(sum(1 for t in toks if t in stopWords)))

    # get the average word length (over non-stopwords; 0 when there are none)
    df['avg_word_length'] = content_tokens.apply(
        lambda toks: np.mean([len(t) for t in toks]) if toks else 0)

    # number of upper case words, counted on the raw (un-lowered) text
    df['upper'] = df['raw_txt'].apply(
        lambda raw: float(sum(1 for word in raw.split() if word.isupper())))

    # Sentiment: first element of TextBlob's sentiment tuple
    df['sentiment'] = df['processed_no_stopwords'].apply(
        lambda text: TextBlob(text).sentiment[0])

    return df

# Engineer features for the full training frame and for the 20-row sample,
# then preview the sample to eyeball the new columns.
df = processing(review_list)
sample = processing(sample_review)
print(sample.head())


   class                                            raw_txt  \
0      0  Story of a man who has unnatural feelings for ...   
1      0  Airport '77 starts as a brand new luxury 747 p...   
2      0  This film lacked something I couldn't put my f...   
3      0  Sorry everyone,,, I know this is supposed to b...   
4      0  When I was little my parents took me along to ...   

                                           processed  \
0  story of a man who ha unnatural feeling for a ...   
1  airport 77 start a a brand new luxury 747 plan...   
2  this film lacked something i couldnt put my fi...   
3  sorry everyone i know this is supposed to be a...   
4  when i wa little my parent took me along to th...   

                              processed_no_stopwords  length  words  \
0  story man ha unnatural feeling pig start openi...   635.0  112.0   
1  airport 77 start brand new luxury 747 plane lo...  4218.0  773.0   
2  film lacked something couldnt put finger first...   766.0  141.0   


In [10]:
from sklearn.model_selection import train_test_split

# Every column except the label is carried along; the pipelines later pick the
# specific columns they need by name.
features = [column for column in df.columns.values if column != 'class']

# Full feature frame / label vector (used for cross-validation).
x_set = df[features]
target_set = df['class']

# 80/20 hold-out split with a fixed seed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_set, target_set, test_size=0.20, random_state=10)
print(x_train.head())

                                                 raw_txt  \
5610   Devil Hunter gained notoriety for the fact tha...   
3742   What can I say about Seven Pounds...well I wat...   
5692   This film was so predictable, that during the ...   
22213  In an attempt to cash in on the success of Uni...   
23165  Dark comedy? Gallows humor? How does one make ...   

                                               processed  \
5610   devil hunter gained notoriety for the fact tha...   
3742   what can i say about seven poundswell i watche...   
5692   this film wa so predictable that during the en...   
22213  in an attempt to cash in on the success of uni...   
23165  dark comedy gallows humor how doe one make a c...   

                                  processed_no_stopwords  length  words  \
5610   devil hunter gained notoriety fact dpp video n...  1379.0  278.0   
3742   say seven poundswell watched flight seattle to...   714.0  133.0   
5692   film wa predictable entire time youre hoping o

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin


class TxtPicker(BaseEstimator, TransformerMixin):
    """Stateless transformer that returns ``X[key]``.

    In this notebook it is the first pipeline step in front of a text
    vectorizer, where ``X`` is a DataFrame and ``key`` names a text column.
    """

    def __init__(self, key):
        # Stored under the same name as the constructor argument.
        self.key = key

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return the item of ``X`` stored under ``self.key``."""
        column = self.key
        return X[column]


class NumberSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that returns ``X[[key]]``.

    Indexing with a one-element list (rather than the bare key) is what
    distinguishes it from ``TxtPicker``; in this notebook it feeds a single
    numeric column to ``StandardScaler``.
    """

    def __init__(self, key):
        # Stored under the same name as the constructor argument.
        self.key = key

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the one-element list ``[self.key]``."""
        columns = [self.key]
        return X[columns]


In [27]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import GenericUnivariateSelect,mutual_info_classif

# Stop-word-free text -> tf-idf over unigrams and bigrams -> univariate
# selection scored by mutual information (percentile mode, param=50).
tfidf_steps = [
    ('selector', TxtPicker(key='processed_no_stopwords')),
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('mutual_info', GenericUnivariateSelect(score_func=mutual_info_classif,
                                            mode='percentile',
                                            param=50)),
]
tf_id_feature = Pipeline(steps=tfidf_steps)

# Cleaned text (stop words kept) -> hashed unigram/bigram features.
# The step is named 'count' although it wraps a HashingVectorizer.
hashing_steps = [
    ('selector', TxtPicker(key='processed')),
    ('count', HashingVectorizer(ngram_range=(1, 2))),
]
binary_count_feature = Pipeline(steps=hashing_steps)

def _scaled_column(column):
    """Build a two-step pipeline: select ``column``, then standardize it."""
    return Pipeline([
        ('selector', NumberSelector(key=column)),
        ('standard', StandardScaler())
    ])


# One independent select-and-scale pipeline per numeric feature column.
length = _scaled_column('length')
words_not_stopword = _scaled_column('words_not_stopword')
words_stopword = _scaled_column('words_stopword')
words = _scaled_column('words')
avg_length = _scaled_column('avg_word_length')
sentiment = _scaled_column('sentiment')
upper = _scaled_column('upper')

In [28]:
from sklearn.pipeline import FeatureUnion

# Tf-idf text features side by side with every scaled numeric feature.
tfid_branches = [
    ('text', tf_id_feature),
    ('length', length),
    ('words', words),
    ('avg_length', avg_length),
    ('sentiment', sentiment),
    ('upper', upper),
    ('words_not_stopword', words_not_stopword),
    ('words_stopword', words_stopword),
]
feats_tfid = FeatureUnion(transformer_list=tfid_branches)

# Hashed n-gram features only (single branch).
feats_count = FeatureUnion(transformer_list=[('text', binary_count_feature)])

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn import tree

from sklearn.base import clone

# Each pipeline gets its own unfitted copy of the feature union. Previously all
# four shared the single `feats_tfid` object, so fitting one pipeline silently
# refitted the feature step inside every other pipeline as well.
svc_pipeline = Pipeline([
    ('feats', clone(feats_tfid)),
    ('classifier', LinearSVC())])

# Features -> model-based feature selection driven by a LinearSVC ->
# logistic regression on the surviving features.
logistic_pipeline = Pipeline([
    ('feats', clone(feats_tfid)),
    ('selector', SelectFromModel(LinearSVC())),
    ('classifier', LogisticRegression())])

decision_tree_pipeline = Pipeline([
    ('feats', clone(feats_tfid)),
    ('classifier', tree.DecisionTreeClassifier())])

# NOTE(review): MultinomialNB expects non-negative feature values, while the
# StandardScaler branches of the feature union produce negative ones. This
# pipeline is never fitted in the notebook; confirm it works before using it.
naive_bayes_pipeline = Pipeline([
    ('feats', clone(feats_tfid)),
    ('classifier', MultinomialNB())])


def fit_predict(model):
    """Fit ``model`` on the training split and print its hold-out accuracy.

    Uses the notebook-level ``x_train``/``y_train`` for fitting and
    ``x_test``/``y_test`` for scoring. The model is fitted in place; nothing
    is returned.
    """
    model.fit(x_train, y_train)
    hits = model.predict(x_test) == y_test
    # Mean of the element-wise matches == fraction of correct predictions.
    print(np.mean(hits))


fit_predict(logistic_pipeline)




0.874


In [13]:
from sklearn.model_selection import cross_val_score
# 4-fold cross-validation of the SVC pipeline over the full labelled set.
scores = cross_val_score(estimator=svc_pipeline, X=x_set, y=target_set, cv=4)









In [14]:
# Report the mean fold score with a +/- band of two standard deviations,
# followed by the individual fold scores.
mean_score = scores.mean()
two_sigma = scores.std() * 2
print("Accuracy: %0.3f (+/- %0.2f)" % (mean_score, two_sigma))
print(scores)

Accuracy: 0.840 (+/- 0.02)
[0.84768 0.8216  0.84464 0.8464 ]


In [63]:
# Directory holding the unlabelled test reviews, and the list that collects
# one record per test file.
test_path = ".\\data\\test\\"
test_set = []

# Capturing group, so re.split keeps the digit runs in its result.
numbers = re.compile(r'(\d+)')


def numericalSort(value):
    """Sort key that orders embedded digit runs numerically.

    Splits ``value`` on runs of digits; because the pattern has a capturing
    group, the result alternates text, digits, text, ... with the digit runs
    at the odd positions. Those are converted to ``int`` so that, for example,
    ``"10"`` sorts after ``"2"``.

    Parameters
    ----------
    value : str
        Typically a file name.

    Returns
    -------
    list
        Alternating ``str`` and ``int`` pieces of ``value``.
    """
    pieces = numbers.split(value)
    return [int(piece) if position % 2 else piece
            for position, piece in enumerate(pieces)]

# Numeric ordering of the file names keeps row i aligned with the i-th test
# file (plain string sorting would put "10" before "2").
dir_list = sorted(os.listdir(test_path), key=numericalSort)

# Collect into a dedicated list so `test_set` is only ever bound to the final
# DataFrame, instead of being a list first and a DataFrame afterwards.
test_records = []
for file in dir_list:
    file_path = os.path.join(test_path, file)
    # `with` closes the handle even if read() raises.
    with open(file_path, 'r', encoding="utf8") as fh:
        test_records.append({'raw_txt': fh.read()})

test_set = pd.DataFrame(test_records)
print(test_set.head())
print(test_set.shape)


                                             raw_txt
0  Thinking that it could only get better was the...
1  For most people, RoboCop 3 is the film that re...
2  I'm pretty sure Poe would have considered this...
3  This is one of those made-for-TV B movies that...
4  Wallace & Gromit have been around for some tim...
(25000, 1)


In [65]:

# Add the same engineered feature columns that the training frame received.
test_set = processing(test_set)

# The original cell called `pipeline.predict`, but no `pipeline` is defined
# anywhere in this notebook, so it fails on a fresh kernel. `logistic_pipeline`
# is the only pipeline the notebook fits directly (via fit_predict), so it is
# used here.
# NOTE(review): confirm this is the model intended for the submission.
test_predictions = logistic_pipeline.predict(test_set)

# One predicted label per test review, in test-file order.
prediction_df = pd.DataFrame(test_predictions)

prediction_df.columns = ['Category']
print(test_set.head())
print(prediction_df.head())

  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)


                                             raw_txt  \
0  Thinking that it could only get better was the...   
1  For most people, RoboCop 3 is the film that re...   
2  I'm pretty sure Poe would have considered this...   
3  This is one of those made-for-TV B movies that...   
4  Wallace & Gromit have been around for some tim...   

                                           processed  \
0  thinking that it could only get better was the...   
1  for most people robocop 3 is the film that rea...   
2  im pretty sure poe would have considered this ...   
3  this is one of those madefortv b movies that i...   
4  wallace  gromit have been around for some time...   

                                   text_not_stopword  length  words  \
0  thinking could get better worst assumption eve...     541    106   
1  people robocop 3 film really big disgrace robo...    5125    998   
2  im pretty sure poe would considered travesty f...     336     58   
3  one madefortv b movies awful kind endea

In [66]:
# Write the predictions (index plus the 'Category' column, with a header row)
# to the working directory. to_csv is given a path, so the value bound to
# `export_csv` is its return value rather than the CSV text.
export_csv = prediction_df.to_csv(path_or_buf=r'.\test_results.csv', header=True)
