# Importing the Libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Reading the Data

In [4]:
df=pd.read_csv('reduced_sentiment_data.csv')
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,239427,B0044BQX2U,A23VNI5AIYLBR9,Orchid,0,0,2,1319673600,Not the best in flavor,"My son who eats everything, actually refused t..."
1,400912,B0016PC4TS,A23GFTVIETX7DS,Debbie Lee Wesselmann,1,2,3,1343692800,Sometimes Works,I live in an area frequented by all kinds of w...
2,159290,B003XUJ5AK,A16XG8UWI5W7Z2,Fred Benson,4,4,3,1298073600,Zwieback Toast,The Zwieback toast had a good taste. I compar...
3,172085,B00142C0X8,A2K69F2CROGTB1,MommyHH,0,0,2,1337558400,I didn't like it,"I have been looking for a good, organic tea bu..."
4,67661,B007OXJK3Y,A2OLB6ETQYWSRY,BSan,4,5,1,1328400000,pretty gross,"was really looking forward to this, read the r..."


In [5]:
df['Score'].value_counts().sort_index()

Unnamed: 0_level_0,count
Score,Unnamed: 1_level_1
1,1000
2,1000
3,1000
4,1000
5,1000


In [6]:
#convert the 1,2 -> 1 and 3 -> 3 and 4,5 -> 5
score_mapping = {1: 1, 2: 1, 3: 3, 4: 5, 5: 5}
df['Score'] = df['Score'].map(score_mapping)

In [7]:
df.drop(columns=['ProductId','UserId','ProfileName','HelpfulnessNumerator','HelpfulnessDenominator','Time'],inplace=True)
df.head()

Unnamed: 0,Id,Score,Summary,Text
0,239427,1,Not the best in flavor,"My son who eats everything, actually refused t..."
1,400912,3,Sometimes Works,I live in an area frequented by all kinds of w...
2,159290,3,Zwieback Toast,The Zwieback toast had a good taste. I compar...
3,172085,1,I didn't like it,"I have been looking for a good, organic tea bu..."
4,67661,1,pretty gross,"was really looking forward to this, read the r..."


In [8]:
df['New_Text']=df['Text']+df['Summary']
df['New_Text'][0]

"My son who eats everything, actually refused to eat this baby food.  I got it as a 'spare' for when we're travelling.  Unfortunately it smells a bit like dog food and there isn't any thing worth saying about it.  Even I wouldn't eat it and I am a firm believer that if I can't eat it, neither should he.  Texture wise it is good as  stage-3 food.Not the best in flavor"

In [9]:
df.drop(columns=['Text','Summary'],inplace=True)
df.head()

Unnamed: 0,Id,Score,New_Text
0,239427,1,"My son who eats everything, actually refused t..."
1,400912,3,I live in an area frequented by all kinds of w...
2,159290,3,The Zwieback toast had a good taste. I compar...
3,172085,1,"I have been looking for a good, organic tea bu..."
4,67661,1,"was really looking forward to this, read the r..."


In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [11]:
import re

def preprocess_text(text):
    # Remove HTML tags using regex
    text = re.sub('<[^<]+?>', '', text)
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [w for w in tokens if not w in stop_words]
    # Return the filtered tokens as a string
    return ' '.join(filtered_tokens)

In [12]:
df['train'] = df['New_Text'].apply(preprocess_text)

In [13]:
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')
sia=SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [14]:
df['train'][2000]

"black cherry flavor tastes pretty good.although assistant taster care orange switch , like black cherry . people preconditioned orange flavor . supposed taste certain way , black cherry freedom individual.i discovered wonderful way use switch black cherry - cooking salmon ! mixed switch black cherry blender half cup brown sugar tablespoon honey . added shake chinese five spice , shake ground cloves , approximately 1 teaspoon dry mustard . mixed blender . sprayed baking pan butter flavored cooking spray . placed boneless salmon - one fourth fish - middle pan surrounded slices peeled pre-baked sweet potatoes . liquid blender went top . hot oven ( 450 degrees ) went pan . soon liquid bubbled , carefully turned salmon , basted glaze , let cook five minutes . still pan , cut two servings , topped pecan pieces returned hot oven another two minutes heat pecans . pretty plate ! poured glaze . yummy ! switch black cherry would nice ham glaze too.mary lou cheatham ( jane riley ) author flavored

In [15]:
sia.polarity_scores(df['train'][1560])

{'neg': 0.0, 'neu': 0.724, 'pos': 0.276, 'compound': 0.9696}

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectorizer.fit(df['train'])
X = vectorizer.transform(df['train'])

In [17]:
X.shape

(5000, 16003)

In [18]:
Y=df['Score']
#train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,Y,train_size=0.33,random_state=42)

In [19]:
X_train.shape

(1650, 16003)

In [20]:
y_train.shape

(1650,)

In [21]:
#random forest classifier
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
#accuracy score
from sklearn.metrics import accuracy_score
accuracy_score(y_test, rf_pred)

0.6474626865671642

In [22]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical


# Tokenization and Padding
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['New_Text'])
sequences = tokenizer.texts_to_sequences(df['train'])
padded = pad_sequences(sequences, padding='post', maxlen=100)


In [23]:
output = pd.get_dummies(df,columns=['Score'])
output

Unnamed: 0,Id,New_Text,train,Score_1,Score_3,Score_5
0,239427,"My son who eats everything, actually refused t...","son eats everything , actually refused eat bab...",True,False,False
1,400912,I live in an area frequented by all kinds of w...,live area frequented kinds wildlife -- chipmun...,False,True,False
2,159290,The Zwieback toast had a good taste. I compar...,zwieback toast good taste . compare old nabisc...,False,True,False
3,172085,"I have been looking for a good, organic tea bu...","looking good , organic tea like taste one . fe...",True,False,False
4,67661,"was really looking forward to this, read the r...","really looking forward , read reviews bought ....",True,False,False
...,...,...,...,...,...,...
4995,324385,Great texture and balance of sweet to butter i...,great texture balance sweet butter right . adu...,False,False,True
4996,191624,Be aware that these treats are made in China. ...,aware treats made china . received christmas g...,True,False,False
4997,487203,This all in one pack of popcorn is great! Cut ...,one pack popcorn great ! cut end dump popcorn ...,False,False,True
4998,469250,"I was a little unsure about this one, but it's...","little unsure one , 's another good flavor nuu...",False,False,True


In [24]:

#take 3 columns in y
y=output[['Score_1','Score_3','Score_5']]
X_train, X_test, y_train, y_test = train_test_split(padded, y, test_size=0.2, random_state=42)

In [25]:
# Build LSTM model
model = Sequential()
model.add(Embedding(5000, 128))
model.add(LSTM(64))
model.add(Dense(3, activation='softmax')) # Using softmax activation

# Compile and train
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=15, validation_data=(X_test, y_test))

Epoch 1/15
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.3993 - loss: 1.0640 - val_accuracy: 0.3930 - val_loss: 1.0480
Epoch 2/15
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.4379 - loss: 1.0512 - val_accuracy: 0.4310 - val_loss: 1.0416
Epoch 3/15
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.4405 - loss: 1.0307 - val_accuracy: 0.3940 - val_loss: 1.0775
Epoch 4/15
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.4621 - loss: 0.9883 - val_accuracy: 0.4090 - val_loss: 1.0655
Epoch 5/15
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.4769 - loss: 0.9439 - val_accuracy: 0.4280 - val_loss: 1.0887
Epoch 6/15
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.4951 - loss: 0.9212 - val_accuracy: 0.4020 - val_loss: 1.1064
Epoch 7/15
[1m125/125[0m 

<keras.src.callbacks.history.History at 0x7bf38c64c610>

In [26]:
from transformers import pipeline

In [29]:
classifier= pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [31]:
result = classifier(df['train'][0])
print(result)

[{'label': 'NEGATIVE', 'score': 0.9310936331748962}]


In [32]:
df['train'][0]

"son eats everything , actually refused eat baby food . got 'spare ' 're travelling . unfortunately smells bit like dog food n't thing worth saying . even would n't eat firm believer ca n't eat , neither . texture wise good stage-3 food.not best flavor"