In [None]:
import os
from google.colab import files
import sqlite3
import pandas as pd
# !kaggle datasets download -d ajaysh/amazon-fine-food-reviews --unzip -p /content/data


In [None]:
%matplotlib inline
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import string
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer

In [None]:

# Connect to the reviews database file with sqlite3; `conn` is the handle used to query it.
conn = sqlite3.connect('/content/data/database.sqlite') # Colab path; update it when the file lives elsewhere

In [None]:
# Load every row of the Reviews table except those with Score == 3, so that
# only reviews on either side of the neutral score remain.
filtered_data = pd.read_sql_query('''SELECT * FROM Reviews WHERE Score !=3''', conn)

#Given reviews with score > 3 a positive rating, and reviews with a score<3 with a negative rating
def partition(x):
    """Map a numeric review score to a sentiment label.

    Scores strictly greater than 3 yield 'positive'; any other score yields
    'negative'.
    """
    return 'positive' if x > 3 else 'negative'

# Replace the numeric Score column with sentiment labels: scores above 3 become
# 'positive' and scores below 3 become 'negative' (rows with Score == 3 were
# already excluded by the SQL query).
actialScore = filtered_data['Score']
positiveNegative = actialScore.map(partition)
filtered_data['Score'] = positiveNegative
filtered_data.head()


In [None]:
# Data cleaning: deduplication.
# Rows that share the same user, profile name, timestamp and review text are
# treated as one review and collapsed to a single row.

display(filtered_data.shape)
# Sort by ProductId so that keep='first' retains, for every group of
# duplicates, the row with the smallest ProductId.
sorted_data = filtered_data.sort_values('ProductId')
# .copy() gives `final` its own data: the notebook later adds a column to
# `final`, which on a filtered slice makes pandas emit SettingWithCopyWarning.
final = sorted_data.drop_duplicates(
    subset=["UserId", "ProfileName", "Time", "Text"], keep='first'
).copy()
display(final.shape)

In [None]:
# Share of the score-filtered rows that survived deduplication, in percent
kept_rows = final['Id'].size
total_rows = filtered_data['Id'].size
print("The percentage of data remained is: ", kept_rows / total_rows * 100)


**Text Preprocessing: Stemming, stop-word removal and Lemmatization**

In [None]:
import nltk
# Fetch the NLTK data packages this notebook relies on.
nltk.download('punkt_tab')
nltk.download('stopwords') # read below through stopwords.words('english')

# WordNet corpus, required by NLTK's WordNet lemmatizer
nltk.download('wordnet')


In [None]:
# English stop words, held in a set for fast membership tests
stop  = set(stopwords.words('english'))

# Snowball stemmer for English, used to reduce each word to its stem
sno = nltk.stem.SnowballStemmer('english')

#Function to clean html elements from text
def cleanhtml(sentence):
    """Return `sentence` with every HTML-like tag replaced by a single space.

    A tag is the shortest run of characters that starts with '<' and ends with
    '>' without crossing a line break.
    """
    return re.sub(r'<.*?>', ' ', sentence)

#Function to clean punctuations from text
def cleanpunc(sentence):
    """Return `sentence` with selected punctuation replaced by spaces.

    Each of the characters ? ! ' " # | . , ( ) / is turned into one space.
    Every other character, including the backslash, is left untouched.
    """
    return re.sub('[?!\'"#|.,()/]', ' ', sentence)


**Bag of Words(BOW)**

In [None]:
# Bag of Words on the raw review text (the 'Text' column): learn the vocabulary
# and build the document-term count matrix in one step.
count_vect = CountVectorizer()
final_counts = count_vect.fit_transform(final['Text'].values)


In [None]:
# Find the first review that contains an HTML tag and print its position and text.
# The pattern needs the closing '>': without it the lazy '.*?' matches nothing,
# so any review with a bare '<' character would be reported as containing a tag.

i = 0
for sent in final['Text'].values:
    if re.search('<.*?>', sent):
        print(i)
        print(sent)
        break
    i += 1

In [None]:
# Show the stop-word set, then the Snowball stem of 'tasty' as a quick check of the stemmer
print(stop)
print('******************************************')
print(sno.stem('tasty'))

In [None]:
# Build the cleaned, stemmed form of every review and collect the stems by sentiment.
# Per review: strip HTML tags, strip punctuation, keep purely alphabetic words
# longer than two characters that are not stop words, lowercase and stem them.
final_string = []        # one cleaned review (space-joined stems) per row of `final`
all_positive_words = []  # stems that occur in reviews labelled 'positive'
all_negative_words = []  # stems that occur in reviews labelled 'negative'

# Read both columns once up front; rebuilding final['Score'].values for every
# single word is loop-invariant work.
review_texts = final['Text'].values
review_labels = final['Score'].values

for sent, label in zip(review_texts, review_labels):
    filtered_sentence = []
    for w in cleanhtml(sent).split():
        for cleaned_words in cleanpunc(w).split():
            if not cleaned_words.isalpha() or len(cleaned_words) <= 2:
                continue
            lowered = cleaned_words.lower()
            if lowered in stop:
                continue
            # Stems are kept as UTF-8 bytes, so the joined review is bytes too.
            # NOTE(review): this looks like a Python 2 leftover; switching to
            # str would change the type seen by every consumer of these lists.
            s = sno.stem(lowered).encode('utf-8')
            filtered_sentence.append(s)
            if label == 'positive':
                all_positive_words.append(s)
            elif label == 'negative':
                all_negative_words.append(s)
    final_string.append(b" ".join(filtered_sentence))


In [None]:
# Attach the cleaned reviews as a new column; final_string holds one entry per
# row of `final`, in the same order, because it was built by iterating final['Text'].
final['CleanedText'] = final_string
final.head(5)

In [None]:
# Persist the cleaned reviews to a new SQLite file.
# NOTE(review): this rebinds `conn`, so the handle to the source database that
# was opened earlier is no longer reachable through that name; no connection is
# closed in the cells visible here.
conn = sqlite3.connect('final.sqlite')
c = conn.cursor()
print(c)
# Have sqlite3 return TEXT values as str
conn.text_factory = str
# Write `final` as table 'Reviews', replacing the table if it already exists
final.to_sql('Reviews', conn, schema=None, if_exists='replace')

In [None]:
# Frequency tables of the stemmed words, one per sentiment, and the 20 most
# frequent entries of each.
freq_dist_positive = nltk.FreqDist(all_positive_words)
freq_dist_negative = nltk.FreqDist(all_negative_words)
print("Most Common Positive Words : ", freq_dist_positive.most_common(20))
print("Most Common Negative Words : ", freq_dist_negative.most_common(20))

In [None]:
# Bag of Words with unigrams and bigrams (ngram_range=(1,2)); no higher-order n-grams are built.
# This rebinds count_vect, replacing the unigram-only vectorizer fitted earlier.
count_vect = CountVectorizer(ngram_range=(1,2))
final_bigram_counts = count_vect.fit_transform(final['Text'].values)
# Shape is (number of reviews, number of distinct unigrams and bigrams)
print(final_bigram_counts.get_shape())


**TF-IDF**

In [None]:
# TF-IDF over unigrams and bigrams of the raw review text; final_tf_idf has one row per review
tf_idf_vect = TfidfVectorizer(ngram_range=(1,2))
final_tf_idf = tf_idf_vect.fit_transform(final['Text'].values)

In [None]:
# Feature names of the TF-IDF vectorizer, in column order, and how many there are
features = tf_idf_vect.get_feature_names_out() # get_feature_names_out() replaces the older get_feature_names()
len(features)

In [None]:
# Peek at ten consecutive feature names; the offset 100000 is an arbitrary sample position
features[100000:100010]


In [None]:
def top_tf_idf_feats(row, features, top_n = 25):
    '''Return the top_n highest TF-IDF entries of one document as a DataFrame.

    Parameters
    ----------
    row : 1-D array-like of float
        Dense TF-IDF scores of a single document, one value per feature.
    features : sequence of str
        Feature names; features[i] is the name belonging to row[i].
    top_n : int, default 25
        Maximum number of entries to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'feature' and 'tfidf', ordered by descending score. The frame
        is empty (but still has both columns) when row is empty or top_n is 0.
    '''
    # argsort is ascending, so reverse it before taking the first top_n positions
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feat = [(features[i], row[i]) for i in topn_ids]
    # Naming the columns in the constructor also works for an empty result,
    # whereas assigning df.columns afterwards raises on a frame with no columns.
    return pd.DataFrame(top_feat, columns=['feature', 'tfidf'])

# Densify the TF-IDF row at index 1 (the second review) and list its 25 strongest features
second_review_scores = final_tf_idf[1, :].toarray()[0]
top_tf_idf = top_tf_idf_feats(second_review_scores, features, 25)

print(top_tf_idf)


**Word2Vec**

In [None]:
# Download and unzip the pretrained Google News word vectors into /content/data (needs the Kaggle CLI and credentials)
!kaggle datasets download -d adarshsng/googlenewsvectors --unzip -p /content/data


In [None]:
#Using Google News Word2Vectors
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

# Load the pretrained Google News vectors from the binary word2vec file.
# unicode_errors='ignore' skips bytes that cannot be decoded instead of raising.
modelw2v = KeyedVectors.load_word2vec_format('/content/data/GoogleNews-vectors-negative300.bin', binary=True, unicode_errors='ignore')

In [None]:
# Look up the pretrained embedding of the word 'car'
modelw2v.get_vector('car')


In [None]:
# Similarity score between the pretrained vectors of 'man' and 'woman'
modelw2v.similarity('man', 'woman')

In [None]:
# Nearest neighbours of 'woman' in the pretrained embedding space
modelw2v.most_similar('woman')


In [None]:
# Per the original author's note, querying the stem 'tasti' (the stemmed form
# of 'tasty') finds no word: stems can be out-of-vocabulary for a pretrained
# model, so be careful when combining stemming with pretrained vectors.
# The unstemmed word is queried here instead.

modelw2v.most_similar('tasty')


In [None]:
#Now Let's create our own Word2Vec model using our own text corpus

import gensim

i = 0  # notebook-global counter reset; not used by the comprehension below
# Tokenise every review for Word2Vec training: strip HTML tags, split on
# whitespace, strip punctuation, and keep the lowercased purely alphabetic
# tokens. No stemming and no stop-word removal is applied here.
list_of_sent = [
    [
        token.lower()
        for chunk in cleanhtml(review).split()
        for token in cleanpunc(chunk).split()
        if token.isalpha()
    ]
    for review in final['Text'].values
]

In [None]:
# Compare the first raw review with its tokenised form
print(final['Text'].values[0])
print("***********************************************************************")
print(list_of_sent[0])

In [None]:
# Train Word2Vec on the review corpus: 50-dimensional vectors, words seen fewer
# than 5 times are dropped, 4 worker threads.
# NOTE(review): no seed is passed and several workers are used, so the learned
# vectors presumably differ between runs — confirm if reproducibility matters.
w2v_model = gensim.models.Word2Vec(list_of_sent, min_count=5, vector_size=50, workers=4)


In [None]:
# Vocabulary of the trained model (the words that have a vector) and its size
words = list(w2v_model.wv.index_to_key)
print(len(words))


In [None]:
# Nearest neighbours of 'tasty' according to the model trained on the reviews
w2v_model.wv.most_similar('tasty')

In [None]:
# Nearest neighbours of 'like' according to the model trained on the reviews
w2v_model.wv.most_similar('like')

In [None]:
# Look up the column position of the token 'like' in the current CountVectorizer vocabulary.
count_vect_feat = count_vect.get_feature_names_out() # get_feature_names_out() replaces the older get_feature_names()
# Convert to a list because a NumPy array has no .index() method
count_vect_feat = list(count_vect_feat)
index_of_like = count_vect_feat.index('like')
# Use the position that was just computed: a hard-coded index goes stale (or out
# of range) as soon as the vectorizer is refitted with a different vocabulary.
print(index_of_like, count_vect_feat[index_of_like])

In [None]:
# Average Word2Vec: represent each review by the mean of the vectors of its
# in-vocabulary words.
#
# The vectors are read from w2v_model.wv: indexing the Word2Vec model object
# itself is not supported in gensim 4, and when that failure is swallowed by a
# bare `except` no word is ever counted and every review ends up as 0/0 = NaN.
word_vectors = w2v_model.wv
vec_dim = word_vectors.vector_size  # taken from the model instead of a hard-coded 50

sent_vectors = []
for sent in list_of_sent:
    sent_vec = np.zeros(vec_dim)
    cnt_words = 0
    for word in sent:
        # Words dropped during training (min_count) have no vector; skip them
        if word not in word_vectors.key_to_index:
            continue
        sent_vec += word_vectors[word]
        cnt_words += 1
    # A review without any known word keeps the all-zero vector instead of dividing by zero
    if cnt_words:
        sent_vec /= cnt_words
    sent_vectors.append(sent_vec)
print(len(sent_vectors))
print(len(sent_vectors[0]))

In [None]:
# TF-IDF weighted Word2Vec: represent each review by the weighted mean of its
# word vectors, the weight of a word being its TF-IDF score in that review.
#
# Explicit membership tests replace a bare `except`, which would hide real
# errors (an undefined name, a missing method) and silently yield 0/0 = NaN.
word_vectors = w2v_model.wv          # gensim 4 keeps the trained vectors on .wv
vec_dim = word_vectors.vector_size
tfidf_feat = tf_idf_vect.get_feature_names_out()
# token -> column index of final_tf_idf; a dict lookup instead of a linear
# search through the feature-name array for every word
tfidf_vocab = tf_idf_vect.vocabulary_

tfidf_sent_vectors = []
# Row `row` of final_tf_idf and entry `row` of list_of_sent describe the same
# review: both were built by iterating final['Text'].values in order.
for row, sent in enumerate(list_of_sent):
    sent_vec = np.zeros(vec_dim)
    weight_sum = 0
    for word in sent:
        col = tfidf_vocab.get(word)
        # Skip words unknown to either the TF-IDF vocabulary or the Word2Vec model
        if col is None or word not in word_vectors.key_to_index:
            continue
        tfidf = final_tf_idf[row, col]
        sent_vec += word_vectors[word] * tfidf
        weight_sum += tfidf
    # A review without any usable word keeps the all-zero vector instead of dividing by zero
    if weight_sum:
        sent_vec /= weight_sum
    tfidf_sent_vectors.append(sent_vec)
