## Dataset Loading

In [1]:
import os
import pandas as pd

# Path of the preprocessed reviews CSV, built from the current working directory.
review_dir = os.path.join(
    os.getcwd(),
    '../../data/processed_data/prep_reviews.csv',
)
df = pd.read_csv(review_dir)

# Print the (rows, columns) shape, then let the cell display the first rows.
print(df.shape)
df.head()

(1814534, 3)


Unnamed: 0,Score,Reviews,Feedback
0,5,great product price,Positive
1,3,waaay small use futur children,Neutral
2,5,stays vibrant many washes,Positive
3,5,son really likes pink ones nervous,Positive
4,3,waaay small use future child,Neutral


## Dataset Splitting

In [2]:
# Resampling Technique
# Optional step, currently disabled: uncommenting the line below would replace
# df with a random 60% sample of its rows, drawn with a fixed seed (42).
#df = df.sample(frac=.6, random_state=42)

In [3]:
# Map the Data
# pandas reads empty CSV fields back as NaN, so any review whose processed text
# is empty would arrive here as a float NaN, which TfidfVectorizer refuses to
# tokenize. Replacing missing reviews with '' keeps every row (and therefore
# the dataset size) while making the column safe to vectorize.
x = df['Reviews'].fillna('')  # Features (processed text)
y = df['Feedback']  # Target labels (Positive-Neutral-Negative)

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing, with a fixed seed for repeatability.
# stratify=y keeps the share of each label the same in the training and testing
# partitions, so the evaluation is not distorted if the classes are imbalanced.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

print("Number of Reviews:")
print(f"Training set  : {len(x_train)}")
print(f"Testing set   : {len(x_test)}")

Number of Reviews:
Training set  : 1451627
Testing set   : 362907


## Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features built from unigrams and bigrams.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Learn the vocabulary and IDF weights from the training split only, and
# vectorize it in the same pass. fit_transform gives the same result as
# fit() followed by transform() but tokenizes the training text once, not twice.
x_train_vectorized = vectorizer.fit_transform(x_train)

# The test split is only transformed, using what was learned from training data.
x_test_vectorized = vectorizer.transform(x_test)

## Model Training

In [None]:
from sklearn.svm import LinearSVC

# Seed the solver with the same value used for the train/test split so that
# re-running the notebook yields the same fitted model (the seed only matters
# when the dual solver is used; otherwise it has no effect).
classifier = LinearSVC(random_state=42)

# Train the Model
classifier.fit(x_train_vectorized, y_train)

# Make predictions on the test set
y_pred = classifier.predict(x_test_vectorized)

In [None]:
# Evaluate the model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Compare the predictions with the true test labels in three ways:
# overall accuracy, the confusion matrix, and the text classification report.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Each heading is followed by its value on the next line(s).
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:", conf_matrix, sep="\n")
print("Classification Report:", classification_rep, sep="\n")

## Save the Model

In [None]:
import joblib

# Output directory for the persisted artifacts, relative to the working directory.
save_path = os.path.join(os.getcwd(), '../../models/')

# joblib.dump does not create missing folders; make sure the target exists so
# the (expensive) trained model is not lost to a FileNotFoundError.
os.makedirs(save_path, exist_ok=True)

# Persist the classifier together with the fitted vectorizer: both are needed
# to turn new raw text into predictions later.
joblib.dump(classifier, os.path.join(save_path, "SVM_classifier.joblib"))
joblib.dump(vectorizer, os.path.join(save_path, "tfidf_vectorizer.joblib"))
