In [1]:
%matplotlib inline
import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer
import re
import zipfile
import os

# First, extract the database file from the zip archive
with zipfile.ZipFile('database.sqlite.zip', 'r') as zip_ref:
    zip_ref.extractall('.')  # Extract to current directory

# Now connect to the extracted database file
con = sqlite3.connect('database.sqlite')  # Connect to the extracted file, not the zip

# Filter only positive and negative review
# i.e., not taking 3 rating
filtered_data = pd.read_sql_query("""SELECT * FROM Reviews WHERE Score !=3""", con)

def partition(x):
    # Function to convert rating score into sentiment label
    # If score is less than 3 → 'negative', else → 'positive'
    if x < 3:
        return 'negative'
    return 'positive'  # Fixed typo: 'positve' -> 'positive'

# ✅ Print column names of dataset
print("Column names in dataset:", filtered_data.columns)

# Convert original numeric score column into sentiment labels
actualScore = filtered_data['Score']  # Extract numerical Score column

# Apply partition() to each score value to map numbers to 'positive'/'negative'
positiveNegative = actualScore.map(partition)

# Replace original Score column with new sentiment labels
filtered_data['Score'] = positiveNegative

# Print dataset size (rows, columns)
print(filtered_data.shape)

# Display first few rows to check changes
filtered_data.head()


Column names in dataset: Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')
(525814, 10)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...


<h2>#DATA CLEANING ;DEDULICATION

In [2]:
#DATA CLEANING ;DEDULICATION
#exaple of a id which is spamming the reiview there would be lot more id like this
display=pd.read_sql_query("""SELECT *FROM Reviews WHERE Score !=3 AND UserID="AR5J8UI46CURR"ORDER BY ProductID""",con)
display

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,78445,B000HDL1RQ,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
1,138317,B000HDOPYC,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
2,138277,B000HDOPYM,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
3,73791,B000HDOPZG,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
4,155049,B000PAQ75C,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...


In [3]:
display.shape

(5, 10)

In [4]:
#sorting data according to productid in ascending order
# Sorting the dataset by 'ProductId' column
# axis=0 → sort rows (default)
# ascending=True → smallest → largest
# inplace=False → do NOT modify original dataframe, return new sorted dataframe
# kind='quicksort' → sorting algorithm (default, fastest for many cases)
# na_position='last' → place missing values (NaN) at the end
sorted_data = filtered_data.sort_values(
    'ProductId',
    axis=0,
    ascending=True,
    inplace=False,
    kind='quicksort',
    na_position='last'
)

# Print shape of sorted dataset (rows, columns)
print(sorted_data.shape)
sorted_data.head()

(525814, 10)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
138706,150524,6641040,ACITT7DI6IDDL,shari zychinski,0,0,positive,939340800,EVERY book is educational,this witty little book makes my son laugh at l...
138688,150506,6641040,A2IW4PEEKO2R0U,Tracy,1,1,positive,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc..."
138689,150507,6641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,positive,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...
138690,150508,6641040,AZGXZ2UUK6X,"Catherine Hallberg ""(Kate)""",1,1,positive,1076025600,a good swingy rhythm for reading aloud,This is a great little book to read aloud- it ...
138691,150509,6641040,A3CMRKGE0P909G,Teresa,3,4,positive,1018396800,A great way to learn the months,This is a book of poetry about the months of t...


<h2>DEDUPLICATION OF ENTRIES

In [5]:
#DEDUPLICATION OF ENTRIES
final = sorted_data.drop_duplicates(
    subset=["UserId","ProfileName","Time","Text"],  # columns used to check duplicates
    keep='first',                                   # keep first occurrence
    inplace=False                                   # return new dataframe instead of modifying original
)

final.shape  # shape after removing duplicates
print(list(final.columns))


['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text']


In [6]:
#CHECKING TO SEE HOW MUCH % OF DATA STILL REMAINS
(final['Id'].size*1.0)/(filtered_data['Id'].size*1.0)*100

69.25890143662969

In [7]:
#To remove those whose helpfulness numerator is> helpfulness denominaor¶
display=pd.read_sql_query("""SELECT *FROM Reviews WHERE Score !=3 AND Id=44737 OR Id=64422 ORDER BY ProductId""",con)
print(display.shape)
display

(2, 10)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,64422,B000MIDROQ,A161DK06JJMCYF,"J. E. Stephens ""Jeanne""",3,1,5,1224892800,Bought This for My Son at College,My son loves spaghetti so I didn't hesitate or...
1,44737,B001EQ55RW,A2V0I904FH7ABY,Ram,3,2,4,1212883200,Pure cocoa taste with crunchy almonds inside,It was almost a 'love at first bite' - the per...


In [8]:
final = final[
    final.HelpfulnessNumerator <= final.HelpfulnessDenominator   # keep only rows where numerator is not greater than denominator
]

print(final.shape)  # print dataframe shape after removing invalid rows


(364171, 10)


In [9]:
print(list(final.columns))

['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text']


In [10]:
#HOW MANY POSITIVE AND NEGATIVE REVIEWS ARE PRESENT IN OUR DATASET?
final['Score'].value_counts()

Score
positive    307061
negative     57110
Name: count, dtype: int64

In [11]:
final.shape

(364171, 10)

In [12]:
print(list(final.columns))


['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text']


<h2>TEXT PRERPOCESSING :STEMMING ,STOPWORD REMOVAL AND LEMITIZATION
<h3>NOW WE HAVE FINISHED DEDUPLICATION OUR DATA REQUUIRES SOME PREPOCESSING BEFORE WE GO ON FURTHER WITH ANALYSIS AND 
<h3>MAKING THE PREDICTION MODEL

In [13]:
import re                                  # used for regex operations (cleaning text patterns like HTML tags)
import nltk                                # NLP library for text preprocessing
nltk.download('stopwords')                 # download stopwords list once (words like "the", "is", "and")
import string                              # handles punctuation-related operations

from nltk.corpus import stopwords          # import stopword dataset
from nltk.stem import PorterStemmer        # stemming tool (reduces words to root form, e.g., running→run)
from nltk.stem.wordnet import WordNetLemmatizer  # lemmatizer (smarter stemmer; grammar aware)

# Creating a set of English stopwords (fast lookup)
stop = set(stopwords.words('english'))
print(stop)                                # print list of stopwords for reference

# Initializing the Snowball stemmer (better than PorterStemmer in many cases)
sno = nltk.stem.SnowballStemmer('english')

# -------------------------------------------------------------------------
# FUNCTION: Remove HTML tags like <br>, <div>, <p>, etc.
# Example: "<br>great product</br>" → " great product "
# -------------------------------------------------------------------------
def cleanhtml(sentence):
    cleanr = re.compile('<.*?>')           # regex pattern to match anything inside < >
    cleantext = re.sub(cleanr, ' ', sentence)  # replace HTML tags with a space
    return cleantext                       # return cleaned sentence

# -------------------------------------------------------------------------
# FUNCTION: Remove punctuation marks
# First line removes special punctuation like ? ! ' " #
# Second line replaces .,()|/ etc. with spaces (to avoid word merging)
# Example: "hello,world!" → "hello world"
# -------------------------------------------------------------------------
def cleanpunc(sentence):
    cleaned = re.sub(r'[?|!|\'|"|#]', r'', sentence)  # remove special punctuations completely
    cleaned = re.sub(r'[.|,|)|(|\|/]', r' ', cleaned) # replace other punctuation with space
    return cleaned

print('***************************')   
print(stop)                                # print stopwords again to verify everything works


{'he', 'during', "weren't", 'ma', 'needn', 'y', 'mightn', 'why', 'the', 'no', "mustn't", 'doing', 'isn', 'now', 'doesn', 'while', "shouldn't", 'theirs', 'yourselves', 'about', 's', 'be', 'after', 'here', 'once', 'until', 't', "shan't", 'being', "you'll", 'yours', 'because', "it'll", 'we', 'have', "wasn't", 'their', 'further', "we'll", 'by', "hasn't", 'all', 'down', 'this', 'are', "doesn't", 'own', "we've", 'yourself', 'in', 'before', 'there', 'd', "wouldn't", "couldn't", 'm', 'whom', "won't", 'don', "haven't", "she'd", 'ain', 'ourselves', 'above', 'is', 'itself', 'with', 'wasn', "didn't", "isn't", 'nor', 'and', 'mustn', 'on', 'o', 'only', "we'd", 'from', 'll', 'very', 'our', 'between', "mightn't", 'myself', 'wouldn', 'off', "i've", 'or', 'she', 'were', 'when', 'as', 're', "i'm", 'weren', "you've", 'more', 'your', 'than', 'into', 'you', 'how', 'ours', 'an', 'them', "he's", 'to', "i'll", 'am', 'through', 'did', 'some', 'out', 'each', 'any', "they're", "i'd", 'but', 'has', 'didn', 'was', 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
sample_text = "<div><b>This product is <i>amazing</i>!</b> <br> I loved it.</div>"

print("Original text:")
print(sample_text)
print("\nAfter HTML cleaning:")
print(cleanhtml(sample_text))


Original text:
<div><b>This product is <i>amazing</i>!</b> <br> I loved it.</div>

After HTML cleaning:
  This product is  amazing !    I loved it. 


<h2>BAG OF WORDS(BOW)
<h4>FINAL IS HERE FILTERED DATA AFTER DEDULPIATON AND HELPFULNESS NUM/DENOM OPERTION
<h4>THSESE TWO WHERE DATA CLEANING


In [15]:
# -------------------- Bag of Words (BOW) Feature Extraction --------------------

import sklearn                                           # machine learning library
from sklearn.feature_extraction.text import CountVectorizer   # tool to convert text → numeric vectors

count_vect = CountVectorizer()                           # create BOW converter object (default settings)

# fit() = learn vocabulary from text
# transform() = convert each text review into numeric vector (word frequency)
# fit_transform() = fit + transform in one step
final_counts = count_vect.fit_transform(final['Text'].values)  

# final_counts → sparse matrix (rows = reviews, columns = unique words)
# Each cell stores frequency of word in that review

print(final_counts.shape)                                # check shape (num_rows, vocab_size)


(364171, 115281)


In [16]:
type(final_counts)

scipy.sparse._csr.csr_matrix

In [17]:
final_counts.get_shape()#here all column vector is unigram all unique words in unique col

(364171, 115281)

In [18]:
final.shape

(364171, 10)

In [31]:
final_counts.shape

(364171, 115281)

<h2>TEXT PREPOCESSING:STEMMING,STOP-REMOVAL,AND LEMMITIZATION

In [18]:
#FINDING SENTENCE USING HTML TAGS
i = 0   # index counter

# iterate through each review/text in the dataset
for sent in final['Text'].values:     

    # check if sentence still contains any HTML tags <...>
    if(len(re.findall('<.*?>', sent))):     
        print(i)        # print index of the review with HTML tag
        print(sent)     # print the sentence containing HTML
        break           # stop after finding first such sentence
    
    i += 1  # increase index counter


6
I set aside at least an hour each day to read to my son (3 y/o). At this point, I consider myself a connoisseur of children's books and this is one of the best. Santa Clause put this under the tree. Since then, we've read it perpetually and he loves it.<br /><br />First, this book taught him the months of the year.<br /><br />Second, it's a pleasure to read. Well suited to 1.5 y/o old to 4+.<br /><br />Very few children's books are worth owning. Most should be borrowed from the library. This book, however, deserves a permanent spot on your shelf. Sendak's best.


In [21]:
import re                                 # regex library for pattern matching (HTML removal)
import nltk                               # natural language toolkit
import string                             # handles punctuation-related operations

from nltk.corpus import stopwords         # import stopword list (common useless words)
from nltk.stem import PorterStemmer       # stemmer (not used here but imported)
from nltk.stem.wordnet import WordNetLemmatizer  # lemmatizer (not used here but imported)

# Create a set of English stopwords (faster lookup than list)
stop = set(stopwords.words('english'))     # e.g., "the", "is", "and", "are"
                                           # removing these helps model focus on meaningful words

# Initialize Snowball Stemmer for English (better than PorterStemmer generally)
sno = nltk.stem.SnowballStemmer('english')  # used for stemming words, e.g., tasty → tasti

# -----------------------------------------------------------------
# FUNCTION: Remove HTML tags such as <br>, <div>, <p> etc.
# Example: "<br>good product</br>" → " good product "
# -----------------------------------------------------------------
def cleanhtml(sentence):
    cleanr = re.compile('<.*?>')           # regex pattern to match anything inside < >
    cleantext = re.sub(cleanr, ' ', sentence)  # replace HTML tags with a space
    return cleantext                       # return cleaned sentence

# -----------------------------------------------------------------
# FUNCTION: Remove punctuation from text
# First step removes specific punctuation ? ! ' " #
# Second step replaces . , ( ) | / with space so words don’t stick together
# Example: "hello,world!" → "hello world"
# -----------------------------------------------------------------
def cleanpunc(sentence):
    cleaned = re.sub(r'[?|!|\'|"|#]', r'', sentence)  # remove special punctuations
    cleaned = re.sub(r'[.|,|)|(|\|/]', r' ', cleaned) # replace other punctuation with a space
    return cleaned

# Print stopwords to verify
print(stop)   
print("*************************")

# Checking stemmer on an example
print(sno.stem('tasty'))                   # output: "tasti" (stem of tasty)


{'mustn', 'who', "hadn't", 'does', 'this', 'about', 'm', 'for', 'down', "we've", "wasn't", 'all', 'aren', 'mightn', 't', "weren't", 'shan', 'will', "that'll", 'having', 'y', 'nor', 'most', 'our', 'a', 'such', 'hasn', "won't", "wouldn't", 'these', 'won', 'after', 'itself', 'ours', 'shouldn', "doesn't", 'through', "aren't", 'some', 'until', "needn't", "we'd", 'but', "i'm", "you're", "it'd", 'few', 'didn', 'above', 'to', 'don', 'now', "shouldn't", 'had', "haven't", 'do', 'did', 'once', 'the', 'd', 'has', "they'll", 'they', 'each', 'weren', 'haven', 'we', 'where', 'were', 'same', 'on', 'other', "we'll", "they've", 'only', 'he', "he'd", 'so', 'me', "you'd", 'wouldn', 've', "hasn't", "don't", 'being', 'below', 'be', 'is', 'and', "couldn't", "she'll", 'it', 'than', 'themselves', "you'll", 'between', 'myself', 'with', "it'll", "i'll", 'its', 'out', 'have', "it's", 'am', 'been', 'when', 'while', 'just', 'how', 're', 'those', "they'd", 'or', 'yours', 'an', 'hadn', 'my', 'your', 'her', 'further',

In [30]:
# -------------------- Pre-processing Loop --------------------
# This prepares cleaned/stemmed text and collects positive & negative words

i = 0                              # index to track review score
str1 = ' '                         # temporary string holder
final_string = []                  # list to store cleaned reviews
all_positive_words = []            # store words from positive reviews
all_negative_words = []            # store words from negative reviews
s = ''                             # temporary variable for each processed word

# iterate over each review in the dataset
for sent in final['Text'].values:
    
    filtered_sentence = []         # list for cleaned words in single review
    
    # remove HTML tags like <br>, <p>, etc.
    sent = cleanhtml(sent)
    
    # split sentence into tokens/words
    for w in sent.split():
        
        # remove punctuation & possibly split into further tokens
        for cleaned_words in cleanpunc(w).split():
            
            # keep only alphabetic words (ignore numbers, symbols) AND length > 2
            if cleaned_words.isalpha() and len(cleaned_words) > 2:
                
                # ignore stopwords ("the", "is", "and", ...)
                if cleaned_words.lower() not in stop:
                    
                    # stemming: "loving" -> "love" ; lowercasing ; encode text
                    s = sno.stem(cleaned_words.lower()).encode('utf8')
                    
                    # add processed word to review sentence
                    filtered_sentence.append(s)
                    
                    # if this review has positive sentiment, collect its words
                    if final['Score'].values[i] == 'positive':
                        all_positive_words.append(s)
                    
                    # if this review has negative sentiment, collect its words
                    if final['Score'].values[i] == 'negative':
                        all_negative_words.append(s)
                else:
                    continue      # skip if stopword
            else:
                continue          # skip if not alphabetic or too short
    
    # join processed words back to sentence (byte-string format)
    str1 = b" ".join(filtered_sentence)
    
    # append cleaned review to final list
    final_string.append(str1)
    
    # move to next review / label
    i += 1

In [33]:
# Add the cleaned processed text (stored in final_string) as a new column
# Each entry in 'final_string' corresponds to the cleaned version of review text
final['CleanedText'] = final_string
#What this does
#creates a new column in your final DataFrame named CleanedText
#Stores the cleaned/stemmed/processed review text in it

In [34]:
final.shape

(364171, 11)

In [35]:
print(list(final.columns))

['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text', 'CleanedText']


In [28]:
# View first 3 rows of the final DataFrame 
# (to visually confirm that CleanedText column has processed reviews)
final.head(3)

# --------------------------------------------------------------
# Store the cleaned & processed review table in SQLite database
# --------------------------------------------------------------

# Create/Connect to SQLite database file named 'final.sqlite'
conn = sqlite3.connect('final.sqlite')    

# Create a cursor object to execute SQL commands
c = conn.cursor()                         

# Ensure SQLite can store text data properly
conn.text_factory = str                   

# Save the DataFrame into SQLite DB
# 'Reviews' will be the table name
# if_exists='replace' → replace table if it already exists
final.to_sql(
    'Reviews',       # table name inside SQLite
    conn,            # database connection
    schema=None,     
    if_exists='replace'
)


364171

In [23]:
new_data = pd.read_sql_query("""SELECT * FROM Reviews WHERE Score !=3""", conn)
print(new_data.shape)
new_data.head()  # Fixed: Added a new line and proper variable name

(364171, 12)


Unnamed: 0,index,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
0,138706,150524,6641040,ACITT7DI6IDDL,shari zychinski,0,0,positive,939340800,EVERY book is educational,this witty little book makes my son laugh at l...,b'witti littl book make son laugh loud recit c...
1,138688,150506,6641040,A2IW4PEEKO2R0U,Tracy,1,1,positive,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc...",b'grew read sendak book watch realli rosi movi...
2,138689,150507,6641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,positive,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...,b'fun way children learn month year learn poem...
3,138690,150508,6641040,AZGXZ2UUK6X,"Catherine Hallberg ""(Kate)""",1,1,positive,1076025600,a good swingy rhythm for reading aloud,This is a great little book to read aloud- it ...,b'great littl book read nice rhythm well good ...
4,138691,150509,6641040,A3CMRKGE0P909G,Teresa,3,4,positive,1018396800,A great way to learn the months,This is a book of poetry about the months of t...,b'book poetri month year goe month cute littl ...


<h2>BIGRAM ,N-GRAM

In [38]:
# ---------------- Frequency Distribution of Words ----------------

# Create frequency dictionary for words in positive reviews
# Counts how many times each stemmed word appears
freq_dist_positive = nltk.FreqDist(all_positive_words)      

# Create frequency dictionary for words in negative reviews
freq_dist_negative = nltk.FreqDist(all_negative_words)

# Display 20 most common words in positive reviews
# Output will show [(word, frequency), ...]
print("Most Common Positive Words : ", freq_dist_positive.most_common(20))

# Display 20 most common words in negative reviews
print("Most Common Negative Words : ", freq_dist_negative.most_common(20))


Most Common Positive Words :  [(b'like', 139429), (b'tast', 129047), (b'good', 112766), (b'flavor', 109624), (b'love', 107357), (b'use', 103888), (b'great', 103870), (b'one', 96726), (b'product', 91033), (b'tri', 86791), (b'tea', 83888), (b'coffe', 78814), (b'make', 75107), (b'get', 72125), (b'food', 64802), (b'would', 55568), (b'time', 55264), (b'buy', 54198), (b'realli', 52715), (b'eat', 52004)]
Most Common Negative Words :  [(b'tast', 34585), (b'like', 32330), (b'product', 28218), (b'one', 20569), (b'flavor', 19575), (b'would', 17972), (b'tri', 17753), (b'use', 15302), (b'good', 15041), (b'coffe', 14716), (b'get', 13786), (b'buy', 13752), (b'order', 12871), (b'food', 12754), (b'dont', 11877), (b'tea', 11665), (b'even', 11085), (b'box', 10844), (b'amazon', 10073), (b'make', 9840)]


In [39]:
# bi-gram, tri-gram and n-gram

# Note: Do NOT remove stop-words like "not" before n-gram generation
# Because phrases like "not good", "not happy", "not recommended" are sentiment-important.

from sklearn.feature_extraction.text import CountVectorizer

# Create a CountVectorizer for:
# unigram (single words) + bigram (two-word combinations)
# Example unigrams: good, product, bad, quality
# Example bigrams: very good, not recommend, worth buying
count_vect = CountVectorizer(ngram_range=(1, 2))

# Fit the vectorizer on the cleaned text data and convert text to numerical feature vectors
# final['Text'] contains all cleaned review text
final_bigram_counts = count_vect.fit_transform(final['Text'].values)

# Check the size of the generated matrix (documents x features)
# Example output: (50000, 300000) → meaning:
# 50,000 reviews, and 300,000 unique unigrams+bigrams learned
final_bigram_counts.get_shape()



(364171, 2910192)

<h2> TF-IDF

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF Vectorizer
# ngram_range=(1,2) means: include unigrams (1-word) and bigrams (2-word phrases)
# This helps capture phrases like "not good", "very bad", etc.
tf_idf_vect = TfidfVectorizer(ngram_range=(1, 2))

# Fit the vectorizer on the text data and convert text to TF-IDF features
# final['Text'] → contains all the review sentences
# .fit_transform() learns vocabulary + computes TF-IDF scores
final_tf_idf = tf_idf_vect.fit_transform(final['Text'].values)

# Show shape of TF-IDF matrix
# Example: (50000, 320000)
# → 50000 rows (documents/reviews)
# → 320000 columns (unique unigrams + bigrams as features)
print("TF-IDF matrix shape:", final_tf_idf.shape)

# Extract all feature (word) names from the TF-IDF vectorizer
features = tf_idf_vect.get_feature_names_out()

# Print total number of features (vocabulary size)
# These features are words + 2-word phrases found in dataset
print("Total features (unigrams + bigrams):", len(features))


TF-IDF matrix shape: (364171, 2910192)
Total features (unigrams + bigrams): 2910192


In [42]:
# Print 10 feature names starting from index 100000
# Slicing format → features[start : end]
# This helps check random vocab words/bigrams created by TF-IDF

print(features[100000:100010])
print(features.shape)

['ales until' 'ales ve' 'ales would' 'ales you' 'alessandra'
 'alessandra ambrosia' 'alessi' 'alessi added' 'alessi also' 'alessi and']
(2910192,)


In [45]:
# Print TF-IDF values of the 4th review (row index = 3)
# final_tf_idf[3,:] → select row 3 (entire vector for that document)
# .toarray() → convert sparse matrix row into normal dense array
# [0] → extract the actual 1-D vector from nested array

print(final_tf_idf[3, :].toarray()[0])
#what it will do


# Length = number of TF-IDF features (ex: 320,000)
# Mostly 0 values → word not present in this review
# Non-zero values → TF-IDF weight for words actually present in the review

# 0.45 = weight of some word/bigram in this review
# 0.79 = weight of another word/bigram
# Zeros mean the review did not contain those words

[0. 0. 0. ... 0. 0. 0.]


In [46]:
# Source reference (good practice to credit useful code)
# https://buhrmann.github.io/tfidf-analysis.html 

def top_tfidf_feats(row, features, top_n=25):
    """
    Return the top `top_n` TF-IDF features in a document.
    
    Parameters:
    row      → TF-IDF vector for one document (as a 1D array)
    features → List of feature names (words + bigrams)
    top_n    → Number of top features to return (default = 25)

    Output:
    A DataFrame containing top words and their TF-IDF scores
    """
    
    # Get indices of top TF-IDF values
    # np.argsort(row) → sorts values, returning indices
    # [::-1]          → reverse to get highest values first
    # [:top_n]        → take only the first top_n indices
    topn_ids = np.argsort(row)[::-1][:top_n]
    
    # Create list of (word, tfidf_score) pairs for those top indices
    top_feats = [(features[i], row[i]) for i in topn_ids]
    
    # Convert to dataframe for better readability
    df = pd.DataFrame(top_feats, columns=['feature', 'tfidf'])
    
    return df

# Get top 25 TF-IDF features for the review at index 1
top_tfidf = top_tfidf_feats(final_tf_idf[1,:].toarray()[0], features, 25)# Get top 25 most important words/bigrams (highest TF-IDF scores)
                                                                         # for the review at index 1 (second review in dataset)

# Show the result
top_tfidf


Unnamed: 0,feature,tfidf
0,these sendak,0.173437
1,paperbacks seem,0.173437
2,rosie movie,0.173437
3,the paperbacks,0.173437
4,pages open,0.173437
5,sendak books,0.173437
6,cover version,0.173437
7,incorporates them,0.168074
8,paperbacks,0.168074
9,really rosie,0.168074


In [48]:
top_tfidf.shape	#Meaning
                            #25 rows	top 25 words/bigrams for that review
                          # 2 columns	feature (word) and tfidf (score)

(25, 2)

In [50]:
# ✅ Import necessary libraries
# Word2Vec & KeyedVectors are tools for handling word embeddings (word vectors)
!pip install gensim

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle   # Used later for saving/loading Python objects (like dict of word vectors)

# ✅ NOTE: We are using Google's pre-trained Word2Vec model
# - This model was trained on Google News dataset (~100 billion words)
# - File size ~1.9GB (.bin format) → after loading, uses ~9GB RAM in memory
# - Therefore, run this only if your system has at least 12GB RAM
# Download link (Google's public release):
# https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit

# ❗ Final expected file name: GoogleNews-vectors-negative300.bin
# "300" = each word is represented as a 300-dimensional numerical vector

# ✅ Load the pre-trained model
# binary=True → because it is a .bin file (binary format)
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

# ✅ After loading, you can use:
# model['king']  → gives 300-dimension vector for "king"
# model.most_similar('king') → finds similar words based on meaning
# model.similarity('king','queen') → computes similarity score



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting gensim
  Downloading gensim-4.4.0-cp313-cp313-win_amd64.whl.metadata (8.6 kB)
Collecting smart_open>=1.8.1 (from gensim)
  Downloading smart_open-7.4.2-py3-none-any.whl.metadata (24 kB)
Downloading gensim-4.4.0-cp313-cp313-win_amd64.whl (24.4 MB)
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.3/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.3/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.3/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.3/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.3/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.3/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.3/24.4 MB ? eta -:--:--
   ---------------------------------------

FileNotFoundError: [Errno 2] No such file or directory: 'GoogleNews-vectors-negative300.bin'