# This notebook is used for data preprocessing

#### Selecting 20,000 reviews from each category: 10,000 spam and 10,000 not spam

In [1]:
# import
import json
import random
import os

import nltk
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import re

import pandas as pd

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zhuzilu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [35]:
# Function to load and process a single JSON file
def process_json_file(file_path, outputfile, num_samples_per_class=10000, *,
                      seed=None, stemmer=None, stop_words=None):
    """
    Build a class-balanced, text-normalised sample from a JSON-lines review file.

    The input is read as one JSON object per line. Records whose 'class' is 1.0
    (labelled "spam" in this notebook) and 0.0 ("not spam") are sampled separately,
    combined and shuffled. Only the 'category', 'summary', 'reviewText' and 'class'
    fields are kept, and a 'reviewTextEdit' column is added holding the review text
    with non-letters replaced by spaces, lower-cased, stop words removed and the
    remaining words stemmed. The result is written as JSON lines.

    Args:
        file_path (str): Path to the input JSON-lines file.
        outputfile (str): Path of the JSON-lines file to write.
        num_samples_per_class (int): Maximum number of records drawn per class;
            a class with fewer records contributes all of them.
        seed (int | None): When given, sampling and shuffling use a private
            random generator seeded with this value, so the output is
            reproducible. When None, the global `random` module state is used.
        stemmer: Object with a `stem(word)` method. Defaults to a new
            `PorterStemmer()`.
        stop_words (iterable of str | None): Words to drop before stemming.
            Defaults to NLTK's English stop-word list.

    Returns:
        None

    Raises:
        KeyError: If none of the selected records provides one of the four
            required fields (including when no record matches either class).
    """
    # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
    with open(file_path, 'r') as f:
        data = [json.loads(line) for line in f if line.strip()]

    # Separate data into spam and not spam based on the 'class' label
    spam_data = [item for item in data if item.get('class') == 1.0]
    not_spam_data = [item for item in data if item.get('class') == 0.0]

    # The `random` module exposes the same sample/shuffle API as a Random instance,
    # so callers that seed the global generator keep their existing behaviour.
    rng = random.Random(seed) if seed is not None else random

    # Randomly sample the specified number of rows for each class
    sampled_spam = rng.sample(spam_data, min(num_samples_per_class, len(spam_data)))
    sampled_not_spam = rng.sample(not_spam_data, min(num_samples_per_class, len(not_spam_data)))

    # Combine and shuffle so the classes are interleaved in the output
    selected_data = sampled_spam + sampled_not_spam
    rng.shuffle(selected_data)

    if stemmer is None:
        stemmer = PorterStemmer()
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Built once: the stop-word list is loop-invariant and set lookup is O(1).
    stop_words = frozenset(stop_words)
    non_letters = re.compile('[^a-zA-Z]')

    df = pd.DataFrame(selected_data)
    # .copy() makes the column subset an independent frame before a column is added.
    df = df[['category', 'summary', 'reviewText', 'class']].copy()

    corpus = []
    for text in df['reviewText']:
        # Missing or null review text is treated as an empty review.
        if not isinstance(text, str):
            text = ''
        words = non_letters.sub(' ', text).lower().split()
        corpus.append(' '.join(stemmer.stem(word) for word in words if word not in stop_words))
    df['reviewTextEdit'] = corpus

    # Save the processed data to a new JSON-lines file
    df.to_json(outputfile, orient='records', lines=True)

    print(f"Processed file saved to {outputfile}")



In [36]:
# Sample up to 10,000 reviews per class from the Clothing file and write the processed JSON lines.
process_json_file('dataset_files/Clothing_Shoes_and_Jewelry.json', 'dataset_files/processeddata_clothing.json')

Processed file saved to dataset_files/processeddata_clothing.json


In [37]:
# Sample up to 10,000 reviews per class from the Home and Kitchen file and write the processed JSON lines.
process_json_file('dataset_files/Home_and_Kitchen.json', 'dataset_files/processeddata_home.json')

Processed file saved to dataset_files/processeddata_home.json


In [38]:
# Sample up to 10,000 reviews per class from the Sports and Outdoors file and write the processed JSON lines.
process_json_file('dataset_files/Sports_and_Outdoors.json', 'dataset_files/processeddata_sports.json')

Processed file saved to dataset_files/processeddata_sports.json


In [39]:
# Sample up to 10,000 reviews per class from the Toys and Games file and write the processed JSON lines.
process_json_file('dataset_files/Toys_and_Games.json', 'dataset_files/processeddata_toys.json')

Processed file saved to dataset_files/processeddata_toys.json


In [4]:
# Read the raw Cell Phones & Accessories reviews; the file is parsed as one JSON object per line.
data = []
with open('dataset_files/Cell_Phones_and_Accessories.json', 'r') as reviews_file:
    for raw_line in reviews_file:
        data.append(json.loads(raw_line.strip()))
# Show the first record to inspect the available fields.
data[0]

{'_id': {'$oid': '5a1321d5741a2384e802c552'},
 'reviewerID': 'A3HVRXV0LVJN7',
 'asin': '0110400550',
 'reviewerName': 'BiancaNicole',
 'helpful': [4, 4],
 'reviewText': 'Best phone case ever . Everywhere I go I get a ton of compliments on it. It was in perfect condition as well.',
 'overall': 5.0,
 'summary': 'A++++',
 'unixReviewTime': 1358035200,
 'reviewTime': '01 13, 2013',
 'category': 'Cell_Phones_and_Accessories',
 'class': 1.0}

In [15]:
# Collect the records whose 'class' label is 1.0 (called "spam" in this notebook).
spam_data = []
for item in data:
    if item.get('class') == 1.0:
        spam_data.append(item)

In [17]:
# Collect the records whose 'class' label is 0.0 (called "not spam" in this notebook).
not_spam_data = list(filter(lambda record: record.get('class') == 0.0, data))

# Draw up to 10,000 records per class without replacement.
# NOTE(review): no random seed is set in the visible cells, so each run selects different rows.
spam_quota = min(10000, len(spam_data))
not_spam_quota = min(10000, len(not_spam_data))
sampled_spam = random.sample(spam_data, spam_quota)
sampled_not_spam = random.sample(not_spam_data, not_spam_quota)

# Merge both classes, then shuffle in place so they are interleaved.
selected_data = [*sampled_spam, *sampled_not_spam]
random.shuffle(selected_data)

In [19]:
# Sanity check: number of sampled records (at most 10,000 per class, so at most 20,000).
len(selected_data)

20000

In [21]:
ps = PorterStemmer() # Porter stemmer instance used by the review-cleaning loops in the following cells

In [22]:
# Peek at the raw text of the first sampled review (before any cleaning).
selected_data[0].get('reviewText')

"Love it. Really worth the money .hasn't bubbled up or anything..... it's a great product. .. you should try it"

In [26]:
# Build the working frame from the sampled records, keeping only the four fields used downstream.
df = pd.DataFrame(selected_data)
df = df[['category','summary','reviewText','class']]

# Build the stop-word set once: it does not change between words, and the original
# re-read the NLTK word list for every single word.
stop_words = set(stopwords.words('english'))

# For each review: replace non-letters with spaces, lower-case, drop stop words, stem the rest.
# The token list is no longer named `list`, which rebound the builtin for every later cell.
corpus = []
for review_text in df['reviewText']:
    tokens = re.sub('[^a-zA-Z]', ' ', review_text).lower().split()
    corpus.append(' '.join(ps.stem(token) for token in tokens if token not in stop_words))
df['reviewTextEdit'] = corpus


In [29]:
# Rebuild the cleaned-text corpus from df['reviewText'] (same cleaning steps as the cell that creates `df`).
# The stop-word set is built once instead of once per word, and the token list is no
# longer named `list`, which rebound the builtin for every later cell.
stop_words = set(stopwords.words('english'))
corpus = []
for review_text in df['reviewText']:
    # Replace non-letters with spaces, lower-case, split into words.
    tokens = re.sub('[^a-zA-Z]', ' ', review_text).lower().split()
    # Drop stop words, stem what remains, and re-join into one string.
    corpus.append(' '.join(ps.stem(token) for token in tokens if token not in stop_words))


In [30]:
# Attach the cleaned text as a new column; `corpus` must have one entry per row of `df`.
df['reviewTextEdit'] = corpus

In [67]:
# Preview the first five rows of the processed frame.
# NOTE(review): the recorded output is `NameError: name 'df' is not defined` and this cell's
# execution count (67) is out of sequence — it was run on a kernel without `df`; re-run in order.
df.head(5)

NameError: name 'df' is not defined

In [33]:
# Write the processed Cell Phones sample as JSON lines (one record per line).
df.to_json('dataset_files/processeddata_cell.json', orient='records', lines=True)

# MSE

In [7]:
# Load the logistic-regression results file, parsed as one JSON object per line.
with open('dataset_files/final_logisticregression.json', 'r') as logistic_file:
    stripped_lines = (row.strip() for row in logistic_file)
    d = list(map(json.loads, stripped_lines))
# Spot-check one record by position.
d[75721]

{'category': 'Sports_and_Outdoors',
 'summary': 'Oris Rocks!!!',
 'reviewText': 'I love it. Great watch! The size is perfect. It is a beautiful and excellent quality watch. I am in love with Oris! Oris Rocks!!!',
 'class': 1.0,
 'reviewTextEdit': 'love great watch size perfect beauti excel qualiti watch love ori ori rock',
 'text_length': 74,
 'predicted_label': None,
 'predict_value': 0.0}

In [50]:
# Tabular view of the logistic-regression records for column-wise checks.
d_df = pd.DataFrame(d)

In [61]:
# Count missing 'predict_value' entries among the last 20,000 rows.
d_df['predict_value'].tail(20000).isnull().sum()


20000

In [66]:
# Which categories make up the last 20,000 rows (the rows checked for missing predictions).
d_df['category'].tail(20000).unique()

array(['Toys_and_Games'], dtype=object)

In [8]:
# Load the GPT-2 results file, parsed as one JSON object per line.
with open('dataset_files/final_GPT2.json', 'r') as gpt2_file:
    d2 = [json.loads(text) for text in map(str.strip, gpt2_file)]
# Spot-check the same position inspected in the other result files.
d2[75721]

{'category': 'Sports_and_Outdoors',
 'summary': 'Oris Rocks!!!',
 'reviewText': 'I love it. Great watch! The size is perfect. It is a beautiful and excellent quality watch. I am in love with Oris! Oris Rocks!!!',
 'class': 1.0,
 'reviewTextEdit': 'love great watch size perfect beauti excel qualiti watch love ori ori rock',
 'predict_value': 0}

In [58]:
# Tabular view of the GPT-2 records, then a per-row flag showing where 'predict_value' is missing.
d2_df = pd.DataFrame(d2)
print(d2_df['predict_value'].isnull())


0        False
1        False
2        False
3        False
4        False
         ...  
99995     True
99996     True
99997     True
99998     True
99999     True
Name: predict_value, Length: 100000, dtype: bool


In [36]:
# Load the BERT-LSTM results (CSV, unlike the JSON-lines files used for the other models).
d3 = pd.read_csv('dataset_files/final_bertlstm.csv')


In [56]:
# Count rows with no 'predicted_class' value across the whole BERT-LSTM frame.
d3['predicted_class'].isnull().sum()

20000

In [43]:
# Spot-check that row 75721 holds the same review text as position 75721 in the JSON result files.
d3['reviewText'][75721]

'I love it. Great watch! The size is perfect. It is a beautiful and excellent quality watch. I am in love with Oris! Oris Rocks!!!'

In [28]:
# Load the Naive Bayes results file, parsed as one JSON object per line.
d4 = []
with open('dataset_files/final_NB.json', 'r') as nb_file:
    for record_line in nb_file:
        d4.append(json.loads(record_line.strip()))
# Spot-check the same position inspected in the other result files.
d4[75721]

{'category': 'Sports_and_Outdoors',
 'summary': 'Oris Rocks!!!',
 'reviewText': 'I love it. Great watch! The size is perfect. It is a beautiful and excellent quality watch. I am in love with Oris! Oris Rocks!!!',
 'class': 1.0,
 'reviewTextEdit': 'love great watch size perfect beauti excel qualiti watch love ori ori rock',
 'predict_value_NB': 1.0}

In [30]:
# Start the combined table from the Naive Bayes records; the other models' predictions are added as columns.
finaldf = pd.DataFrame(d4)

In [31]:
# Add the logistic-regression predictions; the assignment aligns on the default row index.
# NOTE(review): assumes final_logisticregression.json lists the same reviews in the same order as final_NB.json — TODO confirm.
finaldf['predict_value_logistic'] = pd.DataFrame(d)['predict_value']


In [33]:
# Add the GPT-2 predictions; the assignment aligns on the default row index.
# NOTE(review): assumes final_GPT2.json lists the same reviews in the same order as final_NB.json — TODO confirm.
finaldf['predict_value_gpt2'] = pd.DataFrame(d2)['predict_value']


In [44]:
# Add the BERT-LSTM predictions; the assignment aligns on the row index.
# NOTE(review): assumes final_bertlstm.csv rows are in the same order as final_NB.json — TODO confirm.
finaldf['predict_value_bertlstm'] = d3['predicted_class']

In [45]:
# Preview the combined table: one row per review, one prediction column per model.
finaldf.head(5)

Unnamed: 0,category,summary,reviewText,class,reviewTextEdit,predict_value_NB,predict_value_logistic,predict_value_gpt2,predict_value_bertlstm
0,Cell_Phones_and_Accessories,yess,Love it. Really worth the money .hasn't bubble...,0.0,love realli worth money bubbl anyth great prod...,1.0,1.0,0.0,1.0
1,Cell_Phones_and_Accessories,Faulty SIM card reader,Let me start by saying that I am and will alww...,0.0,let start say alwway fan nokia symbian oper sy...,0.0,0.0,0.0,0.0
2,Cell_Phones_and_Accessories,sturdy and functional,love this holster. It kept my phone accessable...,1.0,love holster kept phone access protect easi at...,1.0,0.0,0.0,1.0
3,Cell_Phones_and_Accessories,The white case I've been looking for..,This is my third white case I've purchased. I ...,1.0,third white case purchas purchasedwireless cen...,0.0,0.0,1.0,1.0
4,Cell_Phones_and_Accessories,It's cool.,I just got it today so I will use it soon but ...,1.0,got today use soon littl disappoint expect lit...,0.0,1.0,1.0,0.0


In [46]:
# Persist the combined predictions as JSON lines (one record per line).
finaldf.to_json('dataset_files/finaldf.json', orient='records', lines=True)

In [2]:
# Load the test-set table that holds every model's prediction plus the bagging columns.
test_df = pd.read_csv('dataset_files/test_df.csv')

In [3]:
# Display the frame (pandas truncates the middle rows of a large frame in the rendered output).
test_df

Unnamed: 0.1,Unnamed: 0,index,category,summary,reviewText,class,reviewTextEdit,text_length,predict_value_gpt2,predict_value_logistic,predict_value_NB,predict_value_bertlstm,predict_bagging,predict_bagging_binary
0,0,43410,Home_and_Kitchen,Good but...,! Great little fondue set! It's so cute! My on...,1,great littl fondu set cute complaint new crack...,168,1,1,1,1,1.000000,1
1,10,4998,Cell_Phones_and_Accessories,Just don't do it!!!,"""Ditto"" to the comment ""there wasn't a choice ...",0,ditto comment choic zero product would gotten ...,455,0,1,0,0,0.040050,0
2,11,15044,Cell_Phones_and_Accessories,Not worth consideration,"""Eh, it's only a few bucks, worth a shot!""WRON...",0,eh buck worth shot wrong use batteri accur com...,99,0,1,0,0,0.040050,0
3,12,85819,Toys_and_Games,This is not a true 1/16 scale model. It is muc...,"""Extremely"" upset when I opened the box. It's ...",0,extrem upset open box even close expect scale ...,234,0,1,0,0,0.040050,0
4,13,65232,Sports_and_Outdoors,Ecstatic two-tone case owner,"""First, I am overjoyed to find a case for such...",1,first overjoy find case good price especi ad f...,395,1,0,1,1,0.959950,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,99982,33413,Clothing_Shoes_and_Jewelry,you get what you pay for,you get what you pay for. makes everything yel...,0,get pay make everyth yellow glare actual seem ...,87,0,1,0,0,0.040050,0
19996,99983,50329,Home_and_Kitchen,Corn holder,you get what you pay for...it was inexpensive ...,0,get pay inexpens felt way prevent corn roll ar...,50,1,0,0,0,0.331500,0
19997,99992,40681,Home_and_Kitchen,Pricing is crazy on this item,you've got to be nuts to think someone will pa...,0,got nut think someon pay towel materi gold,42,0,0,0,0,0.000000,0
19998,99993,55616,Home_and_Kitchen,awful,you;d think that a cookbook stand would hold m...,0,think cookbook stand would hold narrowest book...,180,0,0,0,0,0.000000,0


In [5]:
# Plain-text view of the cleaned review column.
print(test_df['reviewTextEdit'])

0        great littl fondu set cute complaint new crack...
1        ditto comment choic zero product would gotten ...
2        eh buck worth shot wrong use batteri accur com...
3        extrem upset open box even close expect scale ...
4        first overjoy find case good price especi ad f...
                               ...                        
19995    get pay make everyth yellow glare actual seem ...
19996    get pay inexpens felt way prevent corn roll ar...
19997           got nut think someon pay towel materi gold
19998    think cookbook stand would hold narrowest book...
19999    zumba brand bra differ style fit perfectli loo...
Name: reviewTextEdit, Length: 20000, dtype: object


In [10]:
# List the column names available for the filters in the following cells.
test_df.columns

Index(['Unnamed: 0', 'index', 'category', 'summary', 'reviewText', 'class',
       'reviewTextEdit', 'text_length', 'predict_value_gpt2',
       'predict_value_logistic', 'predict_value_NB', 'predict_value_bertlstm',
       'predict_bagging', 'predict_bagging_binary'],
      dtype='object')

In [17]:
# Rows where all four individual models predict 0 but the true 'class' is 1.
# NOTE(review): the recorded output below shows class 0 in every visible row, so it
# appears to come from an earlier version of this filter — re-run to refresh.
model_columns = [
    'predict_value_gpt2',
    'predict_value_logistic',
    'predict_value_NB',
    'predict_value_bertlstm',
]
every_model_predicts_0 = (test_df[model_columns] == 0).all(axis=1)
filtered_df = test_df[every_model_predicts_0 & (test_df['class'] == 1)]
filtered_df


Unnamed: 0.1,Unnamed: 0,index,category,summary,reviewText,class,reviewTextEdit,text_length,predict_value_gpt2,predict_value_logistic,predict_value_NB,predict_value_bertlstm,predict_bagging,predict_bagging_binary
8,38,70237,Sports_and_Outdoors,Math problems are not your forte,$3500 / 100 wheels = $35 per wheel...It shows ...,0,wheel per wheel show msrp per wheel umm someon...,68,0,0,0,0,0.0,0
9,40,56180,Home_and_Kitchen,Product is expensive for what it is,$40 for these shelves are too expensive. they ...,0,shelv expens shelv hold screw instruct poor do...,103,0,0,0,0,0.0,0
16,60,36435,Clothing_Shoes_and_Jewelry,poor construction,($59.99 on Easy Spirit website) wore these twi...,0,easi spirit websit wore twice sneaker came apa...,129,0,0,0,0,0.0,0
21,83,36293,Clothing_Shoes_and_Jewelry,I wish I had gotten something better,********UPDATE**********I would consider this ...,0,updat would consid bag inferior exact one repl...,1217,0,0,0,0,0.0,0
23,88,52595,Home_and_Kitchen,"Wanted to like this thing, but it smells terrible",***Instead of returning it yet Im ordering ano...,0,instead return yet im order anoth hepa filter ...,1326,0,0,0,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19987,99937,64639,Sports_and_Outdoors,wow,wow we have had this item for about a year and...,0,wow item year roof fell apart littl piec cost ...,63,0,0,0,0,0.0,0
19991,99953,24455,Clothing_Shoes_and_Jewelry,The Mountain Men's Rottweiler Face T-shirt xxx...,xxx-large not! xxx-long yes my son love rotti'...,0,xxx larg xxx long ye son love rotti bought lb ...,243,0,0,0,0,0.0,0
19992,99961,87190,Toys_and_Games,Works of AHHH Treasure chest Coin bank,yes there is paint to decorate and paint your ...,0,ye paint decor paint design enough paint paint...,189,0,0,0,0,0.0,0
19997,99992,40681,Home_and_Kitchen,Pricing is crazy on this item,you've got to be nuts to think someone will pa...,0,got nut think someon pay towel materi gold,42,0,0,0,0,0.0,0


In [68]:
# Rows where all four individual models predict 1 but the true 'class' is 0.
every_model_predicts_1 = test_df[
    ['predict_value_gpt2', 'predict_value_logistic', 'predict_value_NB', 'predict_value_bertlstm']
].eq(1).all(axis=1)
test_df[every_model_predicts_1 & test_df['class'].eq(0)]

Unnamed: 0.1,Unnamed: 0,index,category,summary,reviewText,class,reviewTextEdit,text_length,predict_value_gpt2,predict_value_logistic,predict_value_NB,predict_value_bertlstm,predict_bagging,predict_bagging_binary
89,451,4472,Cell_Phones_and_Accessories,Good,A bit too rubbery. Gets caught on fabric. Defi...,0,bit rubberi get caught fabric definit protect ...,78,1,1,1,1,1.0,1
106,551,86712,Toys_and_Games,Great in or outdoor toy!,A fun toy for a ride inside or around the bloc...,0,fun toy ride insid around block perfect clock ...,63,1,1,1,1,1.0,1
348,1788,35718,Clothing_Shoes_and_Jewelry,New Balance Men's MO889 Outdoor Multisport Hik...,"Aloha, normally I take a size 7.52E which fits...",0,aloha normal take size e fit perfectli seem co...,268,1,1,1,1,1.0,1
361,1853,24481,Clothing_Shoes_and_Jewelry,I want my child to grow up to be just like Mic...,Although Michael Jackson was an admitted child...,0,although michael jackson admit child molest di...,266,1,1,1,1,1.0,1
484,2406,34250,Clothing_Shoes_and_Jewelry,a good product,"Arrived sooner than expected, looks pretty str...",0,arriv sooner expect look pretti strong price p...,232,1,1,1,1,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19574,97966,831,Cell_Phones_and_Accessories,Works but just a bit flimsy,the stylus works and glides smoothly on my pho...,0,stylu work glide smoothli phone screen silicon...,139,1,1,1,1,1.0,1
19740,98789,70398,Sports_and_Outdoors,good kickstand,this kickstand was an easy installed and works...,0,kickstand easi instal work enough tension keep...,68,1,1,1,1,1.0,1
19802,99083,30854,Clothing_Shoes_and_Jewelry,UGLY WATCH,"this watch is even uglier in person, but it do...",0,watch even uglier person job number huge win f...,104,1,1,1,1,1.0,1
19819,99144,30867,Clothing_Shoes_and_Jewelry,mac,too small for me but a very nice purse the lea...,0,small nice purs leather littl stiff might brea...,52,1,1,1,1,1.0,1


In [25]:
# Ten random rows where the binarised bagging prediction is 0 but the true 'class' is 1.
# No random_state is passed, so the sample changes on every run.
bagging_0_but_class_1 = test_df['predict_bagging_binary'].eq(0) & test_df['class'].eq(1)
test_df.loc[bagging_0_but_class_1].sample(10)

Unnamed: 0.1,Unnamed: 0,index,category,summary,reviewText,class,reviewTextEdit,text_length,predict_value_gpt2,predict_value_logistic,predict_value_NB,predict_value_bertlstm,predict_bagging,predict_bagging_binary
2108,10344,83890,Toys_and_Games,Decent Quality and Easy to Assemble,"From reading some of the previous reviews, I w...",1,read previou review nervou qualiti item would ...,155,1,1,1,0,0.503184,0
13974,69825,90376,Toys_and_Games,Goku Model Kit,There are a lot of pieces but i actually enjoy...,1,lot piec actual enjoy assembl itit realli good...,84,1,1,1,0,0.503184,0
5128,25605,26211,Clothing_Shoes_and_Jewelry,great!,I have planter facitius. I was skeptical if t...,1,planter facitiu skeptic would work could size ...,68,1,1,0,0,0.37155,0
10118,50577,45446,Home_and_Kitchen,"""may contain cosmetic defects""",My Breville juice fountain arrived with the pl...,1,brevil juic fountain arriv plastic case top ma...,336,0,0,0,1,0.496816,0
17908,89518,11727,Cell_Phones_and_Accessories,The Headset To Beat,Well it is about darn time I find a headset th...,1,well darn time find headset actual high qualit...,833,0,1,0,0,0.04005,0
8030,40295,44593,Home_and_Kitchen,Great for jerky; not so great for fruits.,I'm finding this unit amazing for doing jerky....,1,find unit amaz jerki done within hour use jerk...,196,0,0,0,0,0.0,0
14481,72470,37026,Clothing_Shoes_and_Jewelry,Run small,These sneakers are great in every way accept t...,1,sneaker great everi way accept run small broug...,79,1,1,1,0,0.503184,0
10526,52674,43857,Home_and_Kitchen,Microwave Oven,My old microwave was a Goldstar which gave a g...,1,old microwav goldstar gave good year heavi dut...,914,0,0,0,0,0.0,0
17889,89429,37104,Clothing_Shoes_and_Jewelry,thin,"Wear very thin, focused on inexpensive, whethe...",1,wear thin focus inexpens whether travel rain p...,71,0,1,1,0,0.171683,0
9479,47349,16736,Cell_Phones_and_Accessories,Launch Date Delayed,"Just wanted to let everyone know, if you do a ...",1,want let everyon know simpl search product typ...,207,0,0,0,0,0.0,0


In [26]:
# Rows where GPT-2 and Naive Bayes both predict 0 while BERT-LSTM predicts 1.
# The logistic-regression prediction is deliberately left out of the condition.
only_bertlstm_predicts_1 = (
    test_df['predict_value_gpt2'].eq(0)
    & test_df['predict_value_NB'].eq(0)
    & test_df['predict_value_bertlstm'].eq(1)
)
test_df[only_bertlstm_predicts_1]

Unnamed: 0.1,Unnamed: 0,index,category,summary,reviewText,class,reviewTextEdit,text_length,predict_value_gpt2,predict_value_logistic,predict_value_NB,predict_value_bertlstm,predict_bagging,predict_bagging_binary
103,518,81772,Toys_and_Games,Sleek and appropriately smaller.,A fine specimen of the SX3 FIghter from Mass E...,1,fine specimen sx fighter mass effect would lik...,477,0,0,0,1,0.496816,0
263,1374,59370,Home_and_Kitchen,Nice Toaster,After purchasing a brand name I've never heard...,1,purchas brand name never heard sunbeam return ...,608,0,1,0,1,0.536866,0
268,1401,9022,Cell_Phones_and_Accessories,I prefer Zaggs,After reading an online review about these bei...,0,read onlin review better zagg invisishield dec...,460,0,0,0,1,0.496816,0
277,1460,75072,Sports_and_Outdoors,Worked great for us!,After reading the reviews we set the trap and ...,1,read review set trap put brick side prevent ro...,370,0,1,0,1,0.536866,0
294,1574,3537,Cell_Phones_and_Accessories,More like a Limited battery,"After using this battery for a week, i've noti...",0,use batteri week notic significantli faster po...,187,0,0,0,1,0.496816,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19283,96540,29786,Clothing_Shoes_and_Jewelry,love style,not having a zipper blows. buttons get stuck. ...,0,zipper blow button get stuck zipper would star...,61,0,0,0,1,0.496816,0
19384,97025,89488,Toys_and_Games,takes some getting use to,really hard to win! don't know why it doesn't ...,0,realli hard win know show game like direct say...,52,0,1,0,1,0.536866,0
19452,97357,56745,Home_and_Kitchen,Much smaller...,than I expected. I thought the height of it w...,0,expect thought height would bit taller paid at...,151,0,1,0,1,0.536866,0
19811,99115,64127,Sports_and_Outdoors,Exactly as pictured and expected,"tl;dr? 16 matched sets, all different colors, ...",1,tl dr match set differ color low defect bad in...,1123,0,1,0,1,0.536866,0


In [27]:
# imports
import json
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

import re

import nltk
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

# plt.style.use('Solarize_Light2')
# %matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

import joblib

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zhuzilu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [56]:
# Load the persisted vectorizer and classifier (presumably a fitted CountVectorizer and
# MultinomialNB, going by the file names — verify against the training notebook).
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code; only load trusted model files.
loaded_cv = joblib.load('model/cvoptuna.pkl')
loaded_classifier = joblib.load('model/MultinomialNBoptuna.pkl')

In [29]:
# Display the loaded classifier's repr to check its type and parameters.
loaded_classifier

In [36]:
# Print the first 101 vocabulary entries of the loaded vectorizer (same count as the
# original counter loop, which stopped after printing its 101st item).
# A named loop variable is used instead of `_`: assigning to `_` replaces IPython's
# "last output" variable in the notebook namespace.
for feature_name in loaded_cv.get_feature_names_out()[:101]:
    print(feature_name)

aa
aa batteri
aaa
ab
abil
abl
abl charg
abl find
abl fit
abl get
abl keep
abl make
abl play
abl put
abl return
abl see
abl take
abl use
abl wear
absolut
absolut ador
absolut beauti
absolut love
absolut perfect
absorb
abus
ac
accent
accept
access
accessori
accid
accident
accident drop
accommod
accomod
accompani
accomplish
accord
accordingli
account
accumul
accur
accuraci
ach
achiev
acquir
across
acryl
act
act like
action
action figur
activ
actual
actual bought
actual fit
actual get
actual like
actual look
actual made
actual make
actual product
actual size
actual use
actual work
ad
ad bonu
adapt
adaptor
add
add collect
add littl
add much
addict
addit
addit collect
address
adequ
adher
adhes
adida
adjust
adjust fit
adjust strap
admir
admit
admittedli
ador
adult
advanc
advantag
adventur
advert
advertis
advic
advis
aerob
aesthet
affect
afford


In [37]:
# Per-class log probability of each feature; the array has one row per class and one column per feature.
loaded_classifier.feature_log_prob_

array([[-10.32434127, -10.97277741, -11.65458231, ..., -10.93545964,
        -10.93545964, -11.09389298],
       [-10.16589815, -10.81600782, -10.85332558, ...,  -9.84207039,
        -10.81600782, -11.01830784]])

In [42]:
# Pair every vocabulary entry with its per-class log probability.
feature_names = loaded_cv.get_feature_names_out()
log_probabilities = loaded_classifier.feature_log_prob_

# One row per feature, one column per class (the array is transposed for that).
# NOTE(review): labelling row 0 "Negative" and row 1 "Positive" assumes
# loaded_classifier.classes_ is ordered [0, 1] — TODO confirm.
column_labels = ["Negative(log prob)", "Positive(log prob)"]
weights_df = pd.DataFrame(
    data=log_probabilities.transpose(),
    index=feature_names,
    columns=column_labels,
)

# Order by the positive-class value first, using the negative-class value to break ties; highest first.
sorted_weights = weights_df.sort_values(by=column_labels[::-1], ascending=False)

In [43]:
# Plain-text view of the sorted table (pandas elides the middle rows).
print(sorted_weights)

                  Negative(log prob)  Positive(log prob)
great                      -5.723095           -4.563638
use                        -4.741256           -4.638011
love                       -6.030511           -4.677379
one                        -4.640552           -4.715237
like                       -4.840865           -4.807807
...                              ...                 ...
dont wast money           -10.707201          -15.326867
piec garbag               -10.707201          -15.326867
realli want like          -10.707201          -15.326867
want money                -10.707201          -15.326867
terribl product           -10.767279          -15.326867

[10000 rows x 2 columns]


In [47]:
# Show the tail of the table sorted by "Positive(log prob)" descending.
print("Bottom Feature Weights of Features")
print(sorted_weights.tail(20))  # last 20 rows, i.e. the features with the lowest "Positive(log prob)"

Bottom Feature Weights of Features
                  Negative(log prob)  Positive(log prob)
way overpr                -10.678463          -13.860530
cheap piec                -10.707201          -13.860530
cost return               -10.767279          -13.860530
total wast money          -10.033186          -15.326867
zero star                 -10.194046          -15.326867
dont wast                 -10.304656          -15.326867
return window             -10.324341          -15.326867
zipper broke              -10.385836          -15.326867
huge disappoint           -10.429039          -15.326867
return replac             -10.474194          -15.326867
item return               -10.497559          -15.326867
return immedi             -10.545995          -15.326867
wast money time           -10.545995          -15.326867
never recommend           -10.650529          -15.326867
restock fee               -10.650529          -15.326867
dont wast money           -10.707201          -15.326

In [65]:
# Vectorize one example review with the loaded vectorizer and densify the result for the classifier.
# The text appears to be already cleaned (lower-cased, stemmed, stop words removed), like 'reviewTextEdit'.
x = loaded_cv.transform(['huge kershaw fan unhappi finish knife sever kershaw knive look fantast awar descript bead blast mean knife look unfinish kinda scratch unlik kershaw knive highli polish smooth finish knive knife shape size good meati seem get past condit look blade two cent']).toarray()

In [66]:
# Class probabilities for the example review; columns follow the order of loaded_classifier.classes_.
loaded_classifier.predict_proba(x)

array([[0.00310914, 0.99689086]])