In [31]:
import pandas as pd

df = pd.read_csv("/Users/Jithnuka/Documents/GitHub/Sentiment Analysis/Reviews.csv") 

print(df.shape)
df.head()

(568454, 10)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [32]:
df = df[['Text', 'Score']]  #Adjust column names
print(df.head())

                                                Text  Score
0  I have bought several of the Vitality canned d...      5
1  Product arrived labeled as Jumbo Salted Peanut...      1
2  This is a confection that has been around a fe...      4
3  If you are looking for the secret ingredient i...      2
4  Great taffy at a great price.  There was a wid...      5


In [33]:
def label_sentiment(score):
    if score <= 2:
        return 0  # Negative
    elif score >= 4:
        return 1  # Positive
    else:
        return None  # Neutral

df['Sentiment'] = df['Score'].apply(label_sentiment)
df.dropna(inplace=True)
df['Sentiment'] = df['Sentiment'].astype(int)

df.to_csv("processed_reviews.csv", index=False)
print(df.head())

                                                Text  Score  Sentiment
0  I have bought several of the Vitality canned d...      5          1
1  Product arrived labeled as Jumbo Salted Peanut...      1          0
2  This is a confection that has been around a fe...      4          1
3  If you are looking for the secret ingredient i...      2          0
4  Great taffy at a great price.  There was a wid...      5          1


In [34]:
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

df = pd.read_csv("Reviews.csv")  

def preprocess_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    tokens = [word for word in tokens if not word.isdigit()]

    return ' '.join(tokens) 

df['cleaned_text'] = df['Text'].apply(preprocess_text)
print(df[['Text', 'cleaned_text']].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Jithnuka/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/Jithnuka/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                                Text  \
0  I have bought several of the Vitality canned d...   
1  Product arrived labeled as Jumbo Salted Peanut...   
2  This is a confection that has been around a fe...   
3  If you are looking for the secret ingredient i...   
4  Great taffy at a great price.  There was a wid...   

                                        cleaned_text  
0  bought several vitality canned dog food produc...  
1  product arrived labeled jumbo salted peanutsth...  
2  confection around century light pillowy citrus...  
3  looking secret ingredient robitussin believe f...  
4  great taffy great price wide assortment yummy ...  


In [35]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder

df['Sentiment'] = df['Score'].apply(lambda x: 1 if x >= 4 else 0)  # 1 = Positive, 0 = Negative

vectorizer = TfidfVectorizer(max_features=5000)  
X = vectorizer.fit_transform(df['cleaned_text'])
y = df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')
print(df.head())

Accuracy: 0.8911
Precision: 0.9098
Recall: 0.9557
F1 Score: 0.9322
   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      5  1303862400   
1                     0                       0      1  1346976000   
2                     1                       1      4  1219017600   
3                     3                       3      2  1307923200   
4                     0                       0      5  1350777600   

                 Summary                                               Text  \
0  Good Qual

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

X = df['cleaned_text']  
y = df['Sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
y_pred = model.predict(X_test_tfidf)

print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.6359166512740674
Classification Report:
              precision    recall  f1-score   support

           0       0.25      0.33      0.28     24666
           1       0.80      0.72      0.76     89025

    accuracy                           0.64    113691
   macro avg       0.52      0.53      0.52    113691
weighted avg       0.68      0.64      0.65    113691



In [37]:
import joblib

joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(model, 'sentiment_analysis_model.pkl')

['sentiment_analysis_model.pkl']

In [39]:
loaded_model = joblib.load('sentiment_analysis_model.pkl')
tfidf_vectorizer = joblib.load('vectorizer.pkl')

new_review = ["The candy is just red, No flavor. Just plain and chewy. I would never buy them again"]
new_review_vect = tfidf_vectorizer.transform(new_review)
prediction = loaded_model.predict(new_review_vect)

print("Sentiment:", "Positive" if prediction == 1 else "Negative")

Sentiment: Positive


In [None]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(X_train_tfidf, y_train)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))  # Unigrams and bigrams
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)