# Importing the data set

In [1]:
import os
import pandas as pd


def _read_reviews(directory, label):
    """Read every file in ``directory`` into a list of review records.

    Parameters
    ----------
    directory : str
        Folder whose entries are each opened as a UTF-8 text file.
    label : int
        Value stored under the ``'class'`` key of every record.

    Returns
    -------
    list of dict
        One ``{'class': label, 'raw_txt': <file contents>}`` per file.
    """
    records = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order (and therefore the seeded train/test split) reproducible.
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        # The context manager closes the handle even if read() raises.
        with open(file_path, 'r', encoding="utf8") as fh:
            records.append({'class': label, 'raw_txt': fh.read()})
    return records


# Built with os.path.join so the notebook also runs on non-Windows systems.
# The trailing "" keeps the trailing separator the original literals had.
pos_path = os.path.join(".", "data", "train", "pos", "")
neg_path = os.path.join(".", "data", "train", "neg", "")

# class 0 = reviews read from neg_path, class 1 = reviews read from pos_path.
neg_review_list = _read_reviews(neg_path, 0)
pos_review_list = _read_reviews(pos_path, 1)
test_set = []

# Single frame holding all negative reviews followed by all positive ones.
review_list = pd.DataFrame(neg_review_list + pos_review_list)

print("Done")

Done


# Preprocessing


In [2]:
# Small balanced sample for quick experiments: the first ten negative
# records followed by the first ten positive records.
sample_review = pd.DataFrame([*neg_review_list[:10], *pos_review_list[:10]])

In [3]:
import re
from nltk.corpus import stopwords
import numpy as np
from textblob import Word, TextBlob

# Hand-picked stop-word list used by processing().
# Alternative (currently disabled): the full NLTK English list,
#   set(stopwords.words('english'))
stopWords = [
    'in',
    'of',
    'at',
    'a',
    'the',
]

def text_processing(txt):
    """Normalise one raw review and apply negation tagging.

    Steps, in order:
      1. lower-case the text;
      2. delete every character that is neither a word character nor
         whitespace (the characters are removed, not replaced, so text on
         either side of a punctuation mark is joined together);
      3. split on whitespace and pass the tokens to ``negationHandling``.

    Lemmatization is not applied.

    Parameters
    ----------
    txt : str
        Raw review text.

    Returns
    -------
    str
        The space-joined tokens returned by ``negationHandling``.
    """
    lowered = txt.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    tokens = without_punctuation.split()
    return negationHandling(tokens)

def negationHandling(word_list):
    """Fold each negation word into the word that follows it.

    Given a sequence of individual words in order, every ``negation, word``
    pair is replaced by the single token ``neg_word``; the negation word
    itself is dropped. A negation word at the very end of the sequence has
    nothing to attach to and is simply dropped. A word that has just been
    prefixed is never treated as a negation itself, so
    ``["not", "never", "good"]`` yields ``"neg_never good"``.

    The input sequence is not modified, and any iterable of strings (list,
    tuple, generator) is accepted.

    Parameters
    ----------
    word_list : iterable of str
        Tokens of one document, in reading order.

    Returns
    -------
    str
        The rewritten tokens joined by single spaces (``""`` for no tokens).
    """
    # Both the apostrophe and apostrophe-free spellings are listed so the
    # function works whether or not punctuation was stripped beforehand.
    negation_words = frozenset((
        "not", "hardly", "barely", "never", "neither", "scarcely",
        "doesn't", "doesnt", "isn't", "isnt", "wasn't", "wasnt",
        "shouldn't", "shouldnt", "wouldn't", "wouldnt", "couldn't",
        "couldnt", "won't", "wont", "can't", "cant", "don't", "dont"))

    new_list = []
    negate_next = False  # True when the previous token was a negation word
    for word in word_list:
        if negate_next:
            # The prefixed token is emitted as-is and does not itself
            # trigger negation of the following token.
            new_list.append("neg_" + word)
            negate_next = False
        elif word in negation_words:
            negate_next = True
        else:
            new_list.append(word)
    return ' '.join(new_list)


def processing(df):
    """Add the derived text columns to ``df`` and return it.

    The frame is modified in place; the returned object is the same frame
    that was passed in. Three columns are written:

    * ``processed`` - ``text_processing`` applied to ``raw_txt``;
    * ``processed_no_stopwords`` - ``processed`` with every token listed in
      the module-level ``stopWords`` removed;
    * ``sentiment`` - first element of ``TextBlob(text).sentiment`` computed
      on ``processed_no_stopwords`` (presumably the polarity score; confirm
      against the TextBlob documentation).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``raw_txt`` column of strings.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the three columns above added.
    """
    def drop_stopwords(text):
        # Tokens are separated by single spaces, so a plain split(' ') is used.
        kept = [token for token in text.split(' ') if token not in stopWords]
        return ' '.join(kept)

    def first_sentiment_value(text):
        return TextBlob(text).sentiment[0]

    df['processed'] = df['raw_txt'].apply(text_processing)
    df['processed_no_stopwords'] = df['processed'].apply(drop_stopwords)
    df['sentiment'] = df['processed_no_stopwords'].apply(first_sentiment_value)
    return df

# Run the preprocessing pipeline on the small sample and on the full training
# frame. processing() writes its columns into the frame it receives and
# returns that same object, so `sample` is `sample_review` and `df` is
# `review_list`, each now carrying the derived columns.
sample = processing(sample_review)
df = processing(review_list)
# Quick visual check of the derived columns on the sample.
print(sample.head())



   class                                            raw_txt  \
0      0  Story of a man who has unnatural feelings for ...   
1      0  Airport '77 starts as a brand new luxury 747 p...   
2      0  This film lacked something I couldn't put my f...   
3      0  Sorry everyone,,, I know this is supposed to b...   
4      0  When I was little my parents took me along to ...   

                                           processed  \
0  story of a man who has unnatural feelings for ...   
1  airport 77 starts as a brand new luxury 747 pl...   
2  this film lacked something i neg_put my finger...   
3  sorry everyone i know this is supposed to be a...   
4  when i was little my parents took me along to ...   

                              processed_no_stopwords  sentiment  
0  story man who has unnatural feelings for pig s...  -0.071759  
1  airport 77 starts as brand new luxury 747 plan...   0.036677  
2  this film lacked something i neg_put my finger...   0.079167  
3  sorry everyone i 

In [4]:
from sklearn.model_selection import train_test_split

# Every column of the processed frame except the label.
features = [column for column in df.columns.values if column != 'class']

# Hold out 20% of the reviews for testing. Only the stop-word-filtered text
# is used as model input; random_state is fixed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df['processed_no_stopwords'],
    df['class'],
    test_size=0.20,
    random_state=10,
)

# Unsplit feature frame and label series.
x_set = df[features]
target_set = df['class']
print(x_train.head())

5610     devil hunter gained notoriety for fact that it...
3742     what can i say about seven poundswell i watche...
5692     this film was so predictable that during entir...
22213    an attempt to cash on success universals horro...
23165    dark comedy gallows humor how does one make co...
Name: processed_no_stopwords, dtype: object


In [39]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

# Binary bag of n-grams: each feature records whether a unigram, bigram or
# trigram occurs in the review (binary=True), not how often.
cv = CountVectorizer(ngram_range=(1, 3), binary=True)

# fit_transform learns the vocabulary and builds the training matrix in a
# single pass; a separate fit() followed by transform() would tokenise the
# whole training corpus twice.
feature_matrix_train = cv.fit_transform(x_train)

# The test split is only transformed, so it uses the vocabulary learned from
# the training data.
feature_matrix_test = cv.transform(x_test)



In [40]:
# Sanity check: both matrices come from the same fitted vectorizer, so they
# should have the same number of columns (rows = reviews in each split).
print(feature_matrix_test.shape)
print(feature_matrix_train.shape)

(5000, 4597493)
(20000, 4597493)


In [41]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
# Model-based feature selection: a default LogisticRegression is fitted on
# the training matrix and SelectFromModel uses threshold=0.01 as its
# importance cutoff. fit() returns the selector itself, so it can be chained.
selector = SelectFromModel(LogisticRegression(), threshold=0.01).fit(
    feature_matrix_train, y_train
)

# Apply the same column mask to both splits.
x_train_final = selector.transform(feature_matrix_train)
X_test_final = selector.transform(feature_matrix_test)



In [42]:
# Shapes after feature selection; both splits were reduced by the same
# selector, so the column counts should match.
print(x_train_final.shape)
print(X_test_final.shape)

(20000, 2964125)
(5000, 2964125)


In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Sweep the regularisation parameter C and report accuracy for each value.
# NOTE(review): accuracy is measured on the held-out test split (y_test /
# X_test_final), so the C chosen from this sweep is tuned on test data;
# consider a separate validation split or cross-validation.
for c in (0.01, 0.05, 0.25, 0.5, 1):
    lr = LogisticRegression(C=c)
    lr.fit(x_train_final, y_train)
    test_accuracy = accuracy_score(y_test, lr.predict(X_test_final))
    print ("Accuracy for C=%s: %s" % (c, test_accuracy))



Accuracy for C=0.01: 0.8748


Accuracy for C=0.05: 0.8784


Accuracy for C=0.25: 0.8802


Accuracy for C=0.5: 0.8806


In [19]:
# Final classifier with C = 0.5.
# It is fitted on the full n-gram matrix (feature_matrix_train), not on the
# SelectFromModel-reduced x_train_final, so its coefficient vector has one
# entry per vocabulary item of the vectorizer `cv`.
final_model  = LogisticRegression(C = 0.5)
final_model.fit(feature_matrix_train,y_train)




LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [20]:
import heapq

# Map every n-gram in the vectorizer's vocabulary to its coefficient in the
# final model. CountVectorizer.get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() is its replacement (available
# from 1.0), so prefer it when present and fall back on older versions.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
feature_to_coef = dict(zip(feature_names, final_model.coef_[0]))

# heapq.nlargest / nsmallest return the same items, in the same order, as
# sorted(...)[:n] but without fully sorting the whole vocabulary twice.

# The 50 n-grams with the largest (most positive) coefficients.
for best_positive in heapq.nlargest(
        50, feature_to_coef.items(), key=lambda x: x[1]):
    print (best_positive)


# The 20 n-grams with the smallest (most negative) coefficients.
for best_negative in heapq.nsmallest(
        20, feature_to_coef.items(), key=lambda x: x[1]):
    print (best_negative)


('great', 4.073240179349115)
('best', 2.927497136612272)
('excellent', 2.8809247427612217)
('wonderful', 2.5171521886866857)
('love', 2.47525677090837)
('loved', 1.9723859747717905)
('amazing', 1.9475302105225027)
('perfect', 1.9413853602755384)
('beautiful', 1.8262653184537212)
('favorite', 1.8182325826943495)
('very', 1.7994653789246966)
('also', 1.776387843232295)
('is great', 1.7727341530160456)
('one best', 1.743629830869582)
('well', 1.738691940903889)
('recommend', 1.643130673180896)
('enjoyed', 1.623164150073664)
('highly', 1.5759168471582203)
('always', 1.548666207659943)
('both', 1.5385886660671149)
('fun', 1.5125325952193178)
('superb', 1.4512508214821251)
('world', 1.4286903239300932)
('very good', 1.423250601624859)
('brilliant', 1.4182725652162131)
('fantastic', 1.4122339130606394)
('life', 1.4005339429993096)
('enjoy', 1.3824731283394047)
('years', 1.3769914900574591)
('my favorite', 1.3734418665960617)
('job', 1.3731244007578425)
('shows', 1.368084079623699)
('will', 1.

('bad', -5.089114554865968)
('worst', -4.6897230776304255)
('awful', -3.0444489769784395)
('waste', -2.7713279190241384)
('nothing', -2.7093266381067793)
('neg_even', -2.660417101393672)
('no', -2.6550757116967105)
('boring', -2.6423353645628502)
('terrible', -2.4912222334885463)
('poor', -2.4077775055403143)
('stupid', -2.3524197624400105)
('worse', -2.3148956909258596)
('plot', -2.288048666245941)
('minutes', -2.211166564981929)
('script', -2.110830995517214)
('acting', -2.0916516874423268)
('horrible', -2.068569300713997)
('money', -2.0062570417317604)
('only', -1.9136187163259404)
('lame', -1.7248845188396813)
