In [1]:
# Set the SCIKIT_LEARN_INTEL environment variable to "SKLEARN" for this kernel
# session; the author's stated intent is to enable hardware acceleration.
# NOTE(review): nothing in this notebook reads the variable -- confirm which
# library consumes it and that the setting actually has an effect.
%env SCIKIT_LEARN_INTEL=SKLEARN

env: SCIKIT_LEARN_INTEL=SKLEARN


## Dataset Loading

In [2]:
import os
import pandas as pd

# Path of the preprocessed review CSV, resolved against the notebook's working directory.
review_dir = os.path.join(os.getcwd(), '../../data/processed_data/prep_reviews.csv')

# Read the file, discard rows that contain any missing value, and keep only the
# first occurrence of each distinct review text.
load_df = (
    pd.read_csv(review_dir)
    .dropna()
    .drop_duplicates(subset='Reviews', keep='first')
)

print(load_df.shape)
load_df.head()

(1733527, 3)


Unnamed: 0,Score,Reviews,Feedback
0,5,great product price,Positive
1,3,waaay small use futur children,Neutral
2,5,stays vibrant many washes,Positive
3,5,son really likes pink ones nervous,Positive
4,3,waaay small use future child,Neutral


## Sampling a Subset of the Dataset

In [3]:
# Fraction of the cleaned dataset to keep for this experiment.
my_sample=.3

# Draw the subset with a fixed seed: without random_state every
# Restart & Run All picks different rows, so the train/test split and all
# reported metrics would change from run to run.
df = load_df.sample(frac=my_sample, random_state=42)

print(f"Dataset       : {load_df.shape[0]}")
print(f"Sampled       : {df.shape[0]}({my_sample*100}% of Dataset)\n")

Dataset       : 1733527
Sampled       : 520058(30.0% of Dataset)



## Dataset Splitting

In [4]:
# Pick the model inputs and targets out of the sampled frame:
# x is the processed review text, y the Positive/Neutral/Negative label.
x, y = df['Reviews'], df['Feedback']

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sampled reviews for testing; the fixed seed keeps the
# split repeatable for a given input.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of each partition next to the sampled total.
print(
    "Sampled Reviews:",
    f"Training set  : {len(x_train)}",
    f"Testing set   : {len(x_test)}",
    f"Total         : {df.shape[0]}",
    sep="\n",
)

Sampled Reviews:
Training set  : 416046
Testing set   : 104012
Total         : 520058


## Vectorization

In [6]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over unigrams and bigrams.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Learn the vocabulary and IDF weights from the training reviews and vectorize
# them in the same pass. A separate fit() followed by transform() on the same
# text tokenizes the whole training corpus twice for an identical result.
x_train_vectorized = vectorizer.fit_transform(x_train)

# The test reviews are only transformed, so nothing is learned from them.
x_test_vectorized = vectorizer.transform(x_test)

CPU times: user 1min, sys: 3.36 s, total: 1min 3s
Wall time: 1min 3s


## Model Training

In [7]:
%%time
from sklearn.svm import LinearSVC

# Train a linear support-vector classifier on the TF-IDF training matrix.
# fit() returns the estimator itself, so construction and training are chained.
classifier = LinearSVC().fit(x_train_vectorized, y_train)

# Predict labels for the held-out test matrix.
y_pred = classifier.predict(x_test_vectorized)



CPU times: user 25.9 s, sys: 191 ms, total: 26 s
Wall time: 26.2 s


In [8]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score the test-set predictions against the true labels.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print the metrics to the console
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_rep)

# Save the metrics
eval_path = os.path.join(os.getcwd(), '../../data/model_weights/0_evaluation_results.txt')

# open(..., 'w') does not create missing folders; make sure the target folder
# exists so a fresh checkout does not fail with FileNotFoundError only after
# vectorization and training have already run.
os.makedirs(os.path.dirname(eval_path), exist_ok=True)

with open(eval_path, 'w') as file:
    # Run configuration: estimator reprs plus dataset/sample/split sizes.
    file.write("Model Specifications\n")
    file.write(f"Classifier    : {classifier} \n")
    file.write(f"Vectorizer    : {vectorizer} \n")
    file.write(f"Dataset       : {load_df.shape[0]}\n")
    file.write(f"Sampled       : {df.shape[0]}({my_sample*100}% of Dataset)\n")
    file.write(f"Training set  : {len(x_train)} \n")
    file.write(f"Testing set   : {len(x_test)} \n\n")

    # Results: accuracy is stored as a percentage, unlike the console print above.
    file.write("Model Evaluation\n")
    file.write(f"Accuracy      : {accuracy*100} \n\n")
    file.write(f"Confusion Matrix:\n{conf_matrix} \n\n")
    file.write(f"Classification Report:\n{classification_rep} \n")

print("Evaluation results saved to:", eval_path)

Accuracy: 0.8735242087451448
Confusion Matrix:
 [[ 8332   960  2705]
 [ 2172  1886  5169]
 [ 1181   968 80639]]
Classification Report:
               precision    recall  f1-score   support

    Negative       0.71      0.69      0.70     11997
     Neutral       0.49      0.20      0.29      9227
    Positive       0.91      0.97      0.94     82788

    accuracy                           0.87    104012
   macro avg       0.71      0.62      0.64    104012
weighted avg       0.85      0.87      0.86    104012

Evaluation results saved to: /home/aerienn/Desktop/CS-THESIS/code/machine_learning/../../data/model_weights/0_evaluation_results.txt


## Save the Model

In [9]:
import joblib

# Folder that receives the serialized model artifacts.
save_path = os.path.join(os.getcwd(), '../../models/test_model/')

# joblib.dump does not create missing folders; create the target first so the
# trained model is not lost to a FileNotFoundError on a fresh checkout.
os.makedirs(save_path, exist_ok=True)

# Persist the classifier together with the fitted vectorizer: new text has to
# be transformed with the same vocabulary before it can be classified.
joblib.dump(classifier, os.path.join(save_path, "SVM_classifier.joblib"))
joblib.dump(vectorizer, os.path.join(save_path, "tfidf_vectorizer.joblib"))

['/home/aerienn/Desktop/CS-THESIS/code/machine_learning/../../models/test_model/tfidf_vectorizer.joblib']