# Lab Assignment 2

### Data Preprocessing

In [1]:
import pandas as pd

# Read the reviews CSV (expected in the notebook's working directory) into a DataFrame
df = pd.read_csv("Reviews.csv")

# Print the top five rows as a quick sanity check on the load
print(df.head(5))

   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      5  1303862400   
1                     0                       0      1  1346976000   
2                     1                       1      4  1219017600   
3                     3                       3      2  1307923200   
4                     0                       0      5  1350777600   

                 Summary                                               Text  
0  Good Quality Dog Food  I have bought several of the Vitality canned d...  
1 

### Bag-of-Words

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Build a CountVectorizer with its default settings
count_vectorizer = CountVectorizer()

# Learn the vocabulary from the review texts and encode every review
# as a row of the resulting document-term matrix
X_bow = count_vectorizer.fit_transform(df['Text'])

# Report the matrix dimensions as (n_reviews, n_vocabulary_terms)
print(f"Shape of Bag-of-Words matrix: {X_bow.shape}")


Shape of Bag-of-Words matrix: (568454, 120252)


### TF-IDF

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TfidfVectorizer with its default settings
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the review texts and encode
# every review as a row of the resulting TF-IDF matrix
X_tfidf = tfidf_vectorizer.fit_transform(df['Text'])

# Report the matrix dimensions as (n_reviews, n_vocabulary_terms)
print(f"Shape of TF-IDF matrix: {X_tfidf.shape}")


Shape of TF-IDF matrix: (568454, 120252)


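The review text was successfully transformed into numerical features using both the Bag-of-Words and TF-IDF representations. The two matrices have the same shape because, with default settings, both vectorizers tokenize the same reviews into the same vocabulary, so they contain the same number of features; they differ only in the values stored (raw term counts versus TF-IDF weights).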

### Lexicon-Based Approach

In [4]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download the VADER lexicon only when it is missing from the local NLTK data
# directories, so that re-running this cell does not contact the network.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')

# Initialize the VADER sentiment analyzer
sid = SentimentIntensityAnalyzer()

# Map a review text to a coarse sentiment label via VADER
def get_sentiment(text):
    """Label `text` as 'positive', 'negative' or 'neutral'.

    The text is scored with the module-level VADER analyzer `sid`, and the
    label is taken from the 'compound' entry of the returned scores:

    * compound >= 0.05   -> 'positive'
    * compound <= -0.05  -> 'negative'
    * anything in between -> 'neutral'
    """
    compound = sid.polarity_scores(text)['compound']
    if compound >= 0.05:
        return 'positive'
    if compound <= -0.05:
        return 'negative'
    return 'neutral'

# Score every review with VADER and store the label in a new column of df
df['LexiconSentiment'] = df['Text'].map(get_sentiment)

# Print the first rows of the review text next to its lexicon-based label
print(df.loc[:, ['Text', 'LexiconSentiment']].head())


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/hamody/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


                                                Text LexiconSentiment
0  I have bought several of the Vitality canned d...         positive
1  Product arrived labeled as Jumbo Salted Peanut...         negative
2  This is a confection that has been around a fe...         positive
3  If you are looking for the secret ingredient i...          neutral
4  Great taffy at a great price.  There was a wid...         positive


### Machine-Learning-Based Approach

### Naive Bayes Algorithm

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
# NOTE(review): X_tfidf comes from a vectorizer fitted on every review, so the
# vocabulary and IDF weights have already seen the held-out texts. Consider
# fitting the vectorizer on the training split only.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    df['Score'],
    test_size=0.2,
    random_state=42,
)

# Fit a multinomial Naive Bayes model that predicts the star rating (Score)
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Predict star ratings for the held-out reviews
y_pred = nb_classifier.predict(X_test)

# Report overall accuracy and the per-class precision/recall/F1 breakdown
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")


Accuracy: 0.6499722933213711
Classification Report:
              precision    recall  f1-score   support

           1       0.82      0.09      0.16     10326
           2       0.20      0.00      0.00      5855
           3       0.41      0.00      0.00      8485
           4       0.65      0.01      0.02     16123
           5       0.65      1.00      0.79     72902

    accuracy                           0.65    113691
   macro avg       0.55      0.22      0.19    113691
weighted avg       0.62      0.65      0.52    113691



### Logistic Regression

In [6]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression model on the same train split used for Naive Bayes;
# max_iter is set to 1000 iterations
lr_classifier = LogisticRegression(max_iter=1000)
lr_classifier.fit(X_train, y_train)

# Predict star ratings for the held-out reviews
y_pred_lr = lr_classifier.predict(X_test)

# Report overall accuracy and the per-class precision/recall/F1 breakdown
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f"Logistic Regression Accuracy: {accuracy_lr}")
print("Classification Report:", classification_report(y_test, y_pred_lr), sep="\n")

Logistic Regression Accuracy: 0.7591805859742635
Classification Report:
              precision    recall  f1-score   support

           1       0.69      0.72      0.70     10326
           2       0.54      0.27      0.36      5855
           3       0.52      0.36      0.43      8485
           4       0.55      0.29      0.38     16123
           5       0.81      0.95      0.88     72902

    accuracy                           0.76    113691
   macro avg       0.62      0.52      0.55    113691
weighted avg       0.73      0.76      0.73    113691



In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# Logistic Regression trained on a resampled (upsampled) training set.
# This cell reloads the data and rebuilds the features so it can run on its own.

# Load the dataset
df = pd.read_csv("Reviews.csv")

# Define the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Transform the text data into TF-IDF features (a sparse matrix)
# NOTE(review): the vectorizer is fitted on every review before the split, so
# its vocabulary and IDF weights have seen the test texts. Consider fitting on
# the training split only.
X_tfidf = tfidf_vectorizer.fit_transform(df['Text'])

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, df['Score'], test_size=0.2, random_state=42)

# Resample row positions instead of building a dense DataFrame of the features.
# With the recorded TF-IDF shape of (568454, 120252), calling .toarray() on the
# 80% training split would need hundreds of gigabytes of memory; indexing the
# sparse matrix by position keeps it sparse.
y_train_values = y_train.to_numpy()
row_positions = np.arange(X_train.shape[0])

# Separate minority (Score != 5) and majority (Score == 5) rows
minority_rows = row_positions[y_train_values != 5]
majority_rows = row_positions[y_train_values == 5]

# Upsample the minority rows, with replacement, to the size of the majority class.
# Note: this equalizes 5-star reviews against all other scores combined; scores
# 1-4 are drawn as one group and are not balanced against each other.
minority_rows_upsampled = resample(
    minority_rows,
    replace=True,
    n_samples=len(majority_rows),
    random_state=42,
)

# Combine majority rows with the upsampled minority rows (majority first)
balanced_rows = np.concatenate([majority_rows, minority_rows_upsampled])

# Select the balanced features and target variable by position
X_train_balanced = X_train[balanced_rows]
y_train_balanced = y_train.iloc[balanced_rows]

# Every sampled row must keep its own label
assert X_train_balanced.shape[0] == len(y_train_balanced) == 2 * len(majority_rows)

# Initialize and train the Logistic Regression classifier with balanced data
lr_classifier_balanced = LogisticRegression(max_iter=1000)
lr_classifier_balanced.fit(X_train_balanced, y_train_balanced)

# Predict the sentiment labels for the (untouched, non-resampled) test set
y_pred_lr_balanced = lr_classifier_balanced.predict(X_test)

# Evaluate the performance of the classifier with balanced data
accuracy_lr_balanced = accuracy_score(y_test, y_pred_lr_balanced)
print("Logistic Regression Accuracy with Resampling:", accuracy_lr_balanced)
print("Classification Report:")
print(classification_report(y_test, y_pred_lr_balanced))
