# Importing the data set

In [3]:
import os
import pandas as pd


pos_path = ".\\data\\train\\pos\\"
neg_path = ".\\data\\train\\neg\\"

neg_review_list = []
pos_review_list = []
review_list = []
test_set = []

for file in os.listdir(neg_path):
    file_path = os.path.join(neg_path, file)
    fh = open(file_path, 'r', encoding="utf8")
    entry = {'class': 0, 'raw_txt': fh.read()}
    neg_review_list.append(entry)
    fh.close()
    
for file in os.listdir(pos_path):
    file_path = os.path.join(pos_path, file)
    fh = open(file_path, 'r', encoding="utf8")
    entry = {'class': 1, 'raw_txt': fh.read()}
    pos_review_list.append(entry)
    fh.close()

review_list = pd.DataFrame(neg_review_list + pos_review_list)

print("Done")

Done


Preprocessing


In [4]:
sample_review = pd.DataFrame(neg_review_list[:10]+pos_review_list[:10])

In [36]:
import re
from nltk.corpus import stopwords
import numpy as np
from textblob import Word, TextBlob

stopWords = set(stopwords.words('english'))
#stopWords=['in','of','at','a','the','i','he','she','it']

def text_processing(txt):
    processed = re.sub(r'[^\w\s]', '', txt.lower())
    processed = ' '.join([Word(word).lemmatize() for word in processed.split()])
    processed = negationHandling(processed.split())
    return processed

def negationHandling(word_list):
    """
    given an matrix of individual words in order, combine "negation,"word" into
    "neg_word"
    """
    negation_words = ("not", "hardly", "barely", "never", "neither", "scarcely",
                      "doesn't", "doesnt", "isn't", "isnt", "wasn't", "wasnt",
                      "shouldn't", "shouldnt", "wouldn't", "wouldnt", "couldn't",
                      "couldnt", "won't", "wont", "can't", "cant", "don't", "dont")
    for index in range(len(word_list) - 1):
        if word_list[index] in negation_words:
            word_list[index + 1] = "neg_" + word_list[index + 1]
    new_list = []
    for word in word_list:
        if word not in negation_words:
            new_list.append(word)
    return ' '.join(new_list)


def processing(df):
    # lowering, removing punctuation and lemmatization
    df['processed'] = df['raw_txt'].apply(lambda x: text_processing(x))

    # Removing stopwords    
    df['processed_no_stopwords'] = df['processed'].apply(
        lambda x: ' '.join([t for t in x.split(' ') if t not in stopWords]))
    # Sentiment
    df['sentiment'] = df['processed_no_stopwords'].apply(lambda x: TextBlob(x).sentiment[0])

    return df

sample = processing(sample_review)
df = processing(review_list)
print(sample.head())



   class                                            raw_txt  \
0      0  Story of a man who has unnatural feelings for ...   
1      0  Airport '77 starts as a brand new luxury 747 p...   
2      0  This film lacked something I couldn't put my f...   
3      0  Sorry everyone,,, I know this is supposed to b...   
4      0  When I was little my parents took me along to ...   

                                           processed  \
0  story of a man who ha unnatural feeling for a ...   
1  airport 77 start a a brand new luxury 747 plan...   
2  this film lacked something i neg_put my finger...   
3  sorry everyone i know this is supposed to be a...   
4  when i wa little my parent took me along to th...   

                              processed_no_stopwords  sentiment  
0  story man ha unnatural feeling pig start openi...  -0.067593  
1  airport 77 start brand new luxury 747 plane lo...   0.048856  
2  film lacked something neg_put finger first cha...   0.025000  
3  sorry everyone kn

In [37]:
from sklearn.model_selection import train_test_split

features = [f for f in df.columns.values if f not in ['class']]
x_train, x_test, y_train, y_test = train_test_split(df['processed_no_stopwords'], df['class'],
                                                    test_size=0.20, random_state=10)
x_set = df[features]
target_set = df['class']
print(x_train.head())

5610     devil hunter gained notoriety fact dpp video n...
3742     say seven poundswell watched flight seattle to...
5692     film wa predictable entire time youre hoping o...
22213    attempt cash success universal horror film maj...
23165    dark comedy gallows humor doe one make comedy ...
Name: processed_no_stopwords, dtype: object


In [38]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

cv = CountVectorizer(ngram_range=(1,3), binary=True,min_df=5)
cv.fit(x_train)
count_x_train = cv.transform(x_train)
count_x_test = cv.transform(x_test)

In [33]:
tf_idf = TfidfVectorizer(ngram_range=(1, 2), min_df=5)
tf_idf.fit(x_train)
tfidf_x_train = cv.transform(x_train)
tfidf_x_test = cv.transform(x_test)

In [41]:
print(count_x_test.shape)
print(count_x_train.shape)

(5000, 84263)
(20000, 84263)


In [29]:
from sklearn.feature_selection import SelectFromModel, chi2, SelectPercentile
from sklearn.linear_model import LogisticRegression
selector = SelectPercentile(score_func=chi2, percentile=40)
selector.fit(count_x_train, y_train)
X_test_final = selector.transform(count_x_train)
x_train_final = selector.transform(count_x_test)

In [30]:
print(x_train_final.shape)
print(X_test_final.shape)

(5000, 67770)
(20000, 67770)


In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(count_x_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_test, lr.predict(count_x_test))))



Accuracy for C=0.01: 0.8744


Accuracy for C=0.05: 0.8838


Accuracy for C=0.25: 0.885


Accuracy for C=0.5: 0.8846


Accuracy for C=1: 0.8844


In [15]:
final_model = LogisticRegression(C=0.5)
final_model.fit(x_train_final, y_train)



LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [17]:
feature_to_coef = {
    word: coef for word, coef in zip(
        cv.get_feature_names(), final_model.coef_[0]
    )
}
for best_positive in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1], 
    reverse=True)[:50]:
    print (best_positive)
    
    
for best_negative in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1])[:20]:
    print (best_negative)


('excellent', 1.0799982955505822)
('great', 1.0264378131141723)
('wonderful', 0.8880931271220058)
('best', 0.8046852331979775)
('perfect', 0.7871240869964533)
('amazing', 0.7701794690835482)
('superb', 0.6788919991980692)
('loved', 0.6513753244005914)
('fun', 0.6418148352931241)
('enjoyed', 0.641450469121652)
('710', 0.6286695612524313)
('enjoyable', 0.6142923518691386)
('favorite', 0.5656611062490394)
('recommend', 0.5542892305585336)
('today', 0.5540753142095256)
('recommended', 0.5530230085445236)
('highly', 0.5492491083930617)
('very good', 0.5440017536936727)
('love', 0.5406779035265981)
('bit', 0.5366834667889714)
('beautiful', 0.5362994616361331)
('job', 0.5352497286827795)
('rare', 0.5170700781542733)
('shows', 0.5149858402958708)
('worth', 0.5135318023714532)
('hilarious', 0.5119660566388041)
('fantastic', 0.5062235178471155)
('brilliant', 0.5044173234439167)
('gem', 0.49674348442427974)
('incredible', 0.4865036254801623)
('enjoy', 0.4859876692015734)
('definitely', 0.48197798

('worst', -1.795705974696713)
('bad', -1.2699805526887435)
('awful', -1.2119495233218747)
('boring', -1.1706602869022078)
('waste', -1.1092579631455077)
('poor', -0.9695941088061945)
('neg_even', -0.9014694061715264)
('terrible', -0.8680964410628166)
('stupid', -0.8410132488227579)
('dull', -0.8326462415424772)
('horrible', -0.7754025537786531)
('poorly', -0.7711305226411619)
('worse', -0.7298288375658661)
('nothing', -0.7289854163427919)
('unfortunately', -0.7019023684656917)
('disappointing', -0.6816400072045343)
('lame', -0.6738123528493131)
('410', -0.6696192725431648)
('neg_worth', -0.6596037980971755)
('disappointment', -0.6578297499379018)
