In [3]:
import os
import pandas as pd

test_path = ".\data\\test\\"
pos_path = ".\data\\train\\pos\\"
neg_path = ".\data\\train\\neg\\"

neg_review_list = []
pos_review_list = []
review_list = []
test_set = []

for file in os.listdir(neg_path):
    file_path = os.path.join(neg_path, file)
    fh = open(file_path, 'r', encoding="utf8")
    entry = {'class': 0, 'raw_txt': fh.read()}
    neg_review_list.append(entry)
    fh.close()
    
for file in os.listdir(pos_path):
    file_path = os.path.join(pos_path, file)
    fh = open(file_path, 'r', encoding="utf8")
    entry = {'class': 1, 'raw_txt': fh.read()}
    pos_review_list.append(entry)
    fh.close()

review_list = pd.DataFrame(neg_review_list + pos_review_list)

print("done")

done


In [23]:
import re
from nltk.corpus import stopwords
import numpy as np

stopWords = set(stopwords.words('english'))


def processing(df):
    # lowering and removing punctuation
    df['processed'] = df['raw_txt'].apply(lambda x: re.sub(r'[^\w\s]','', x.lower()))
    df['text_not_stopword'] = df['processed'].apply(lambda x: ' '.join([t for t in x.split(' ')if t not in stopWords]))
    # numerical feature engineering                                                                                                                              
    # total length of sentence
    df['length'] = df['processed'].apply(lambda x: len(x))
    # get number of words
    df['words'] = df['processed'].apply(lambda x: len(x.split(' ')))
    df['words_not_stopword'] = df['processed'].apply(lambda x: len([t for t in x.split(' ')if t not in stopWords]))
    # get the average word length
    df['avg_word_length'] = df['processed'].apply(
        lambda x: np.mean([len(t) for t in x.split(' ') if t not in stopWords]) if len(
            [len(t) for t in x.split(' ') if t not in stopWords]) > 0 else 0)
    # get the average word length
    df['commas'] = df['raw_txt'].apply(lambda x: x.count(','))

    return df


df = processing(review_list)
print(df.head())



   class                                            raw_txt  \
0      0  Story of a man who has unnatural feelings for ...   
1      0  Airport '77 starts as a brand new luxury 747 p...   
2      0  This film lacked something I couldn't put my f...   
3      0  Sorry everyone,,, I know this is supposed to b...   
4      0  When I was little my parents took me along to ...   

                                           processed  length  words  \
0  story of a man who has unnatural feelings for ...     644    112   
1  airport 77 starts as a brand new luxury 747 pl...    4324    801   
2  this film lacked something i couldnt put my fi...     776    141   
3  sorry everyone i know this is supposed to be a...     832    154   
4  when i was little my parents took me along to ...    2265    395   

   words_not_stopword  avg_word_length  commas  \
0                  63         6.365079       1   
1                 484         5.456612      16   
2                  64         6.375000      

In [32]:
from sklearn.model_selection import train_test_split

features = [f for f in df.columns.values if f not in ['class']]
x_train, x_test, y_train, y_test = train_test_split(df[features], df['class'],
                                                    test_size=0.20, random_state=10)
print(x_train.head())

                                                 raw_txt  \
5610   Devil Hunter gained notoriety for the fact tha...   
3742   What can I say about Seven Pounds...well I wat...   
5692   This film was so predictable, that during the ...   
22213  In an attempt to cash in on the success of Uni...   
23165  Dark comedy? Gallows humor? How does one make ...   

                                               processed  length  words  \
5610   devil hunter gained notoriety for the fact tha...    1405    280   
3742   what can i say about seven poundswell i watche...     727    133   
5692   this film was so predictable that during the e...    1153    213   
22213  in an attempt to cash in on the success of uni...    2751    515   
23165  dark comedy gallows humor how does one make a ...    1179    217   

       words_not_stopword  avg_word_length  commas  \
5610                  135         5.118519       9   
3742                   71         5.802817       0   
5692                   98 

In [38]:
from sklearn.base import BaseEstimator, TransformerMixin

class TxtPicker(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on text columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]

class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]

In [49]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
text = Pipeline([
                ('selector', TxtPicker(key='processed')),
                ('tfidf', TfidfVectorizer())
                ])


In [39]:
from sklearn.preprocessing import StandardScaler

length = Pipeline([
    ('selector', NumberSelector(key='length')),
    ('standard', StandardScaler())
])

words = Pipeline([
    ('selector', NumberSelector(key='words')),
    ('standard', StandardScaler())
])
words_not_stopword = Pipeline([
    ('selector', NumberSelector(key='words_not_stopword')),
    ('standard', StandardScaler())
])
avg_word_length = Pipeline([
    ('selector', NumberSelector(key='avg_word_length')),
    ('standard', StandardScaler())
])
commas = Pipeline([
    ('selector', NumberSelector(key='commas')),
    ('standard', StandardScaler()),
])


In [None]:
from sklearn.pipeline import FeatureUnion
feats = FeatureUnion([('text', text), 
                      ('length', length),
                      ('words', words),
                      ('words_not_stopword', words_not_stopword),
                      ('avg_word_length', avg_word_length),
                      ('commas', commas)])

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
pipeline = Pipeline([
    ('feats', feats),
    ('classifier', BernoulliNB()),
])

pipeline.fit(x_train, y_train)
preds = pipeline.predict(x_test)

np.mean(preds == y_test)

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


In [18]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, preds)

array([[2183,  269],
       [ 506, 2042]], dtype=int64)