#### Import Libraries

In [1]:
# Analysis
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
import joblib
import pickle

import warnings
warnings.filterwarnings("ignore")

In [2]:
# import cleaned Data
data = pd.read_csv(r"C:\Users\Harrison\Downloads\sentiment_analysis_project\data\cleaned_data.csv")

data.head(4)

Unnamed: 0,Reviewer,Country,StarRatings,Subject,Review,Date_posted
0,Mike Santarcangelo,US,3,"Amazon may be a fine company, but...","Amazon is easy to work with, but they fulfill ...","Tuesday, June 4, 2024"
1,Arthur Riding,GB,5,"Great range of products, generally superior qu...",I have been using AliExpress for well over a y...,"Tuesday, June 4, 2024"
2,UK shopper Val,GB,5,Good!,"A good, customer-centric marketplace. But neve...","Monday, June 3, 2024"
3,Jim Corkery,CA,1,I purchased an Android Tesla style…,I purchased an Android Tesla style radio.\nAft...,"Sunday, June 2, 2024"


In [3]:
# Creating a 'StarRatings_Sentiment' column to classify the sentiment of reviews based on the StarRatings

# Define the classification function
def classify_rating(StarRatings):
    if StarRatings >= 3:
        return 'Positive'
    else:
        return 'Negative'

# Apply the classification
data['StarRatings_Sentiment'] = data['StarRatings'].apply(classify_rating)

data.head(2)

Unnamed: 0,Reviewer,Country,StarRatings,Subject,Review,Date_posted,StarRatings_Sentiment
0,Mike Santarcangelo,US,3,"Amazon may be a fine company, but...","Amazon is easy to work with, but they fulfill ...","Tuesday, June 4, 2024",Positive
1,Arthur Riding,GB,5,"Great range of products, generally superior qu...",I have been using AliExpress for well over a y...,"Tuesday, June 4, 2024",Positive


#### Data Preprocessing

In [4]:

import re
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Text cleaning functions
def remove_html_tags(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

def to_lowercase(text):
    return text.lower()

def remove_special_characters(text):
    return re.sub(r'[^a-zA-Z\s]', '', text)

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

def remove_whitespace(text):
    return ' '.join(text.split())

# Tokenization function
def tokenize_text(text):
    return word_tokenize(text)

# Remove stopwords
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words]

# Lemmatization
def lemmatize_words(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in tokens]

# Comprehensive preprocessing function
def preprocess_text(text):
    if not isinstance(text, str):
        return []
    text = remove_html_tags(text)
    text = to_lowercase(text)
    text = remove_special_characters(text)
    text = remove_numbers(text)
    text = remove_whitespace(text)
    tokens = tokenize_text(text)
    tokens = remove_stopwords(tokens)
    tokens = lemmatize_words(tokens)
    return tokens

# Apply preprocessing to the 'Review' column
data['cleaned_review'] = data['Review'].apply(preprocess_text)

data.head(3)

Unnamed: 0,Reviewer,Country,StarRatings,Subject,Review,Date_posted,StarRatings_Sentiment,cleaned_review
0,Mike Santarcangelo,US,3,"Amazon may be a fine company, but...","Amazon is easy to work with, but they fulfill ...","Tuesday, June 4, 2024",Positive,"[amazon, easy, work, fulfill, order, via, rd, ..."
1,Arthur Riding,GB,5,"Great range of products, generally superior qu...",I have been using AliExpress for well over a y...,"Tuesday, June 4, 2024",Positive,"[using, aliexpress, well, year, consistently, ..."
2,UK shopper Val,GB,5,Good!,"A good, customer-centric marketplace. But neve...","Monday, June 3, 2024",Positive,"[good, customercentric, marketplace, neverthel..."


#### Feature Engineering

-  TF-IDF: Transform text data into TF-IDF matrices.

In [5]:
# Combine tokens back into strings for each review
data['cleaned_review'] = data['cleaned_review'].apply(lambda tokens: ' '.join(tokens))

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
# #Instantiate TF-IDF
tfidf_vectorizer = TfidfVectorizer()
x_tfidf = tfidf_vectorizer.fit_transform(data['cleaned_review'])

In [7]:
# Using the most important feature for our model building
df = data[['StarRatings_Sentiment','cleaned_review']]
df.head(2)

Unnamed: 0,StarRatings_Sentiment,cleaned_review
0,Positive,amazon easy work fulfill order via rd party co...
1,Positive,using aliexpress well year consistently impres...


In [8]:
# Instantiating the dependent and independent variables

X = df["cleaned_review"] # independent variable
y = df["StarRatings_Sentiment"] # dependent variable


In [9]:
from sklearn.preprocessing import LabelEncoder

# Encoding the dependent variable (StarRatings_Sentiment)
df["StarRatings_Sentiment"] = LabelEncoder().fit_transform(df["StarRatings_Sentiment"])
df["StarRatings_Sentiment"].sample(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["StarRatings_Sentiment"] = LabelEncoder().fit_transform(df["StarRatings_Sentiment"])


7414     1
7215     1
11778    1
4241     0
785      1
Name: StarRatings_Sentiment, dtype: int32

In [10]:
from sklearn.model_selection import train_test_split
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X.to_numpy(), y, test_size=0.2, random_state=42)


- Vectorize the Training Data Using TF-IDF

In [11]:
# Choose our vectorizer (TF-IDF)
tfidf_vectorizer = TfidfVectorizer()

# Vectorize the text data using TF-IDF
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test) # Transform using learned vocabulary

#### Model Training
  - Support Vector Classifier Model

In [12]:
# Fit the SVC to train the data
from sklearn.svm import SVC

svc_model = SVC(kernel = 'linear')
svc_model.fit(X_train_tfidf, y_train)

In [13]:
# Make predictions on the test set
svc_y_pred = svc_model.predict(X_test_tfidf)

#### Model Evaluation

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix

# Evaluation metrics
accuracy = accuracy_score(y_test, svc_y_pred)
precision = precision_score(y_test, svc_y_pred, average='weighted')
f1 = f1_score(y_test, svc_y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"F1-Score: {f1:.4f}")

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, svc_y_pred))

Accuracy: 0.9179
Precision: 0.9183
F1-Score: 0.9180

Classification Report:
              precision    recall  f1-score   support

    Negative       0.94      0.92      0.93      1368
    Positive       0.90      0.92      0.91      1032

    accuracy                           0.92      2400
   macro avg       0.92      0.92      0.92      2400
weighted avg       0.92      0.92      0.92      2400

