## Text Classification on the Amazon Fine Food Reviews Dataset with Google Word2Vec Word Embeddings in Gensim, Trained Using an LSTM in Keras

### IMPORTING THE MODULES

In [20]:
# Silence all warnings for the rest of the notebook.
# One 'ignore' filter is enough: filterwarnings() inserts each new filter at
# the front of the filter list, so an 'always' filter registered just before
# it would be overridden immediately and never take effect.
import warnings
warnings.filterwarnings('ignore')

# data visualization and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
# configure
# The directive on the next line is meant to set matplotlib to inline mode so
# graphs are displayed below the corresponding cell; it is commented out here.
# matplotlib inline
# Apply the 'fivethirtyeight' matplotlib style first, then seaborn's
# 'whitegrid' settings with color codes enabled.
style.use('fivethirtyeight')
sns.set(style='whitegrid', color_codes=True)

# nltk
import nltk

#preprocessing
from nltk.corpus import stopwords  #stopwords
from nltk import word_tokenize, sent_tokenize  # tokenizing
from nltk.stem import PorterStemmer, LancasterStemmer  # using the Porter Stemmer and Lancaster Stemmer and others
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer  # lammatizer from WordNet

# for part-of-speech tagging
from nltk import pos_tag

# for named entity recognition (NER)
from nltk import ne_chunk

# vectorizers for creating the document-term-matrix (DTM)
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

# BeautifulSoup library
from bs4 import BeautifulSoup

import re  # regex

#model_selection
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

#evaluation
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import classification_report
from mlxtend.plotting import plot_confusion_matrix


# preprocessing (scikit-learn)
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer  # 'Imputer' is deprecated from 'sklearn.preprocessing'

#classification.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB

# English stop-word list, held as a set for fast membership checks.
# `stopwords` is the corpus reader imported above from nltk.corpus.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

#keras
import keras
from keras.preprocessing.text import one_hot, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding, Input, LSTM  # cannot import name 'CuDNNLSTM' from 'keras.layers'

from keras.models import Model
from keras.preprocessing.text import text_to_word_sequence

#gensim w2v
#word2vec
from gensim.models import Word2Vec

### LOADING THE DATASET

In [36]:
# Load the reviews CSV into `rev_frame`, then work on `df`, an independent
# (deep) copy, so that `rev_frame` keeps the data exactly as loaded.
reviews_csv_path = r'./input/Reviews.csv'
rev_frame = pd.read_csv(reviews_csv_path)
df = rev_frame.copy(deep=True)

In [37]:
# Preview the first rows of the working copy.
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [38]:
# Summarise each column's dtype and non-null count, plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568454 non-null  int64 
 1   ProductId               568454 non-null  object
 2   UserId                  568454 non-null  object
 3   ProfileName             568438 non-null  object
 4   HelpfulnessNumerator    568454 non-null  int64 
 5   HelpfulnessDenominator  568454 non-null  int64 
 6   Score                   568454 non-null  int64 
 7   Time                    568454 non-null  int64 
 8   Summary                 568427 non-null  object
 9   Text                    568454 non-null  object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB


In [39]:
# Sum the numeric columns for every (UserId, ProductId) pair.
# A positional 'Score' argument to sum() is bound to the `numeric_only`
# parameter (a non-empty string is truthy); it does not select the Score
# column. Passing numeric_only=True states explicitly what the call does and
# yields the same table: all numeric columns summed, text columns dropped.
df.groupby(['UserId', 'ProductId']).sum(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Id,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time
UserId,ProductId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
#oc-R103C0QSV1DF5E,B006Q820X0,136323,1,2,5,1343088000
#oc-R109MU5OBBZ59U,B008I1XPKA,516062,0,1,5,1350086400
#oc-R10LFEMQEW6QGZ,B008I1XPKA,516079,0,1,5,1345939200
#oc-R10LT57ZGIB140,B0026LJ3EA,378693,0,0,3,1310601600
#oc-R10UA029WVWIUI,B006Q820X0,136545,0,0,1,1342483200
...,...,...,...,...,...,...
AZZV9PDNMCOZW,B003SNX4YA,422838,0,0,4,1329436800
AZZVNIMTTMJH6,B000FI4O90,190698,0,0,5,1268179200
AZZY649VYAHQS,B000N9VLJ2,222781,1,1,5,1309737600
AZZYCJOJLUDYR,B001SB22UG,131469,0,0,5,1337472000


In [40]:
# Print the earliest and latest values of the 'Time' column, one per line
# (the values appear to be Unix epoch seconds).
review_times = df['Time']
for extreme in (review_times.min(), review_times.max()):
    print(extreme)

939340800
1351209600


#### A brief description of the dataset, from the Overview tab on Kaggle:

Data includes:
- Reviews from Oct 1999 - Oct 2012
- 568,454 reviews
- 256,059 users
- 74,258 products
- 260 users with > 50 reviews

### DATA CLEANING AND PRE-PROCESSING

#### Since I am concerned here with **sentiment analysis**, I shall keep only the 'Text' and 'Score' columns.

In [41]:
# List the column names currently present in the working frame.
df.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [42]:
# Keep only the review text and its score. `.copy()` makes the result an
# independent two-column frame, so later column assignments or in-place edits
# act on this frame itself rather than on a selection taken from the full
# 10-column frame.
df = df[['Text', 'Score']].copy()

In [None]:
# Show which columns are present at this point.
# This cell previously copied 'Text' into a new 'review' column and then read
# df['rating']. That lookup raises KeyError because no 'rating' column exists
# yet, and the extra 'review' column would be duplicated once 'Text' is
# renamed to 'review' in the next cell. The cell had never been executed, so
# it now only inspects the frame and is safe under Restart & Run All.
df.columns.tolist()

In [48]:
# Rename the columns to task-oriented names ('review' text, 'rating' label).
# The renamed frame is reassigned instead of using inplace=True, so no frame
# is mutated in place. Labels that are not present are skipped by rename's
# default errors='ignore', which keeps the cell safe to re-run.
df = df.rename(columns={'Text': 'review', 'Score': 'rating'})

In [51]:
# Print the (rows, columns) shape of the frame, then preview its first rows.
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.head()

(568454, 2)


Unnamed: 0,review,rating
0,I have bought several of the Vitality canned d...,5
1,Product arrived labeled as Jumbo Salted Peanut...,1
2,This is a confection that has been around a fe...,4
3,If you are looking for the secret ingredient i...,2
4,Great taffy at a great price. There was a wid...,5


Let us now check whether either column has any null values.

In [52]:
# Count missing entries in each column: the 'rating' count is printed and the
# 'review' count is shown as the cell's result.
missing_ratings = df['rating'].isna().sum()
print(missing_ratings)
df['review'].isna().sum()  # no null values.

0


0

Note that there is no point in keeping rows with different scores or sentiments for the same review text, so I will keep only one instance and drop the remaining duplicates.