# Sentiment Analysis :

## Dataset : 
    - Text File 
    - Amazon Product Review - Label followed by free text 
    - Two Classes 
        1. __label__1  --> Negative Review
        2. __label__2  --> Positive Review
       
        
        

In [43]:
# Imports 
import os
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer


# sklearn imports 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Porter_Stemmer = PorterStemmer()
WordNet_Lemmatizer = WordNetLemmatizer()
cv = CountVectorizer(binary=True)
lr_clf = LogisticRegression()


In [33]:
# Data Files:
Train_Data_File = 'kili_train_data.txt' 
Test_Data_File = 'kili_train_data.txt' 


# Data Cleaning:
    - Data is free text from customer i.e Natural language , hence data needs to be cleaned for better performance

In [34]:
# Load Train and Test Data 

df_train = pd.read_fwf(Train_Data_File, header = None)
df_train = df_train.rename(columns={0:'labels', 1:'Features'}).drop(2, 1)


df_test = pd.read_fwf(Test_Data_File, header = None)
df_test = df_test.rename(columns={0:'labels', 1:'Features'}).drop(2, 1)


print(df_train.describe())

print(df_test.describe())

            labels                                           Features
count        10010                                              10010
unique           2                                              10010
top     __label__1  Weak remake: I found this recent remake of the...
freq          5103                                                  1
            labels                                           Features
count        10010                                              10010
unique           2                                              10010
top     __label__1  Weak remake: I found this recent remake of the...
freq          5103                                                  1


In [35]:
# Renaming Labels for Better understanding 
# __label__1 = negative
# __label__1 = positive

df_train['labels'] = df_train['labels'].map({'__label__1': 'negative', '__label__2': 'positive'})
df_test['labels'] = df_test['labels'].map({'__label__1': 'negative', '__label__2': 'positive'})

In [27]:
def clean_data(df):
    cleaned_features = []
    for index , line in enumerate(df['Features'].values):
    #     print(index, line)
        words = word_tokenize(line)
        # remove Puntuations 
        words = [w.lower() for w in words if w not in string.punctuation]
        words = [Porter_Stemmer.stem(w) for w in words if w not in stopwords.words('english') if w.isalpha() ]
        words = [WordNet_Lemmatizer.lemmatize(w) for w in words ]
        cleaned_features.append(" ".join(words))
    # Adding the cleaned_features to Dataframe
    df['cleaned_features'] = pd.Series(cleaned_features)
    df.head()
    return df

In [36]:
df_train = clean_data(df_train)
df_test = clean_data(df_test)


print(df_train.head())
print(df_test.head())

     labels                                           Features  \
0  positive  Stuning even for the non-gamer: This sound tra...   
1  positive  The best soundtrack ever to anything.: I'm rea...   
2  positive  Amazing!: This soundtrack is my favorite music...   
3  positive  Excellent Soundtrack: I truly like this soundt...   
4  positive  Remember, Pull Your Jaw Off The Floor After He...   

                                    cleaned_features  
0  stune even sound track beauti paint seneri min...  
1  best soundtrack ever anyth read lot review say...  
2  amaz soundtrack favorit music time hand intens...  
3  excel soundtrack truli like soundtrack enjoy v...  
4  rememb pull jaw floor hear play game know divi...  
     labels                                           Features  \
0  positive  Stuning even for the non-gamer: This sound tra...   
1  positive  The best soundtrack ever to anything.: I'm rea...   
2  positive  Amazing!: This soundtrack is my favorite music...   
3  positi

In [39]:
# Converting the cleaned_features into CountVectorizer , SparseMatrix 
x_train = cv.fit_transform(df_train.cleaned_features)
x_test = cv.fit_transform(df_test.cleaned_features)

y_train = df_train['labels'].values
y_test = df_test['labels'].values


In [40]:
lr_clf.fit(X = x_train, y=y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [41]:
y_pred = lr_clf.predict(x_test)

In [45]:
print(accuracy_score(y_true = y_test, y_pred = y_pred))

0.9836163836163836
