In [1]:
# Scikit-learn Optimization for Intel Processors
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


## Dataset Loading

In [2]:
import os
import pandas as pd

review_dir = os.path.join(os.getcwd(), 'datasets/prep_reviews.csv')
load_df = pd.read_csv(review_dir)
load_df.dropna(inplace=True)     #Drops null data that csv added during save

print(load_df.shape)
load_df.head()

(1440161, 3)


Unnamed: 0,Score,Reviews,Feedback
0,5,handcream beautiful fragrance not stay protect...,Positive
1,5,wonderful hand lotion seriously dry skin stays...,Positive
2,5,best hand cream around silky thick soaks way l...,Positive
3,5,thanks,Positive
4,5,great hand lotion soaks right leaves skin supe...,Positive


In [3]:
feedback_counts = load_df['Feedback'].value_counts()
print(feedback_counts)

Feedback
Positive    1204772
Negative     123566
Neutral      111823
Name: count, dtype: int64


## Resampling Technique

In [4]:
my_sample=.20
df = load_df.sample(frac=my_sample, random_state=42)


print(f"Dataset       : {load_df.shape[0]}")
print(f"Sampled       : {df.shape[0]}({my_sample*100}% of Dataset)\n")

Dataset       : 1440161
Sampled       : 288032(20.0% of Dataset)



## Dataset Splitting

In [5]:
# Map the Data
x = df['Reviews']  # Features (processed text)
y = df['Feedback']  # Target labels (Positive-Neutral-Negative)

In [6]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

print("Sampled Reviews:")
print(f"Training set  : {len(x_train)}")
print(f"Testing set   : {len(x_test)}")
print(f"Total         : {df.shape[0]}")

Sampled Reviews:
Training set  : 230425
Testing set   : 57607
Total         : 288032


## Vectorization

In [7]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1, 2))
vectorizer.fit(x_train)

x_train_vectorized = vectorizer.transform(x_train)
x_test_vectorized = vectorizer.transform(x_test)

CPU times: total: 34.6 s
Wall time: 42.7 s


## Model Training

In [8]:
%%time
from sklearn.svm import LinearSVC 
classifier = LinearSVC()

# Train the Model
classifier.fit(x_train_vectorized, y_train) 

# Make predictions on the test set
y_pred = classifier.predict(x_test_vectorized) 



CPU times: total: 18.1 s
Wall time: 19.7 s


## Save the Model & Evaluation

In [9]:
import joblib

save_path = os.path.join(os.getcwd(), 'models/test_model/')
classifier_path = save_path+"SVM_classifier.joblib"
vectorizer_path = save_path+"tfidf_vectorizer.joblib"

#Save the Model
joblib.dump(classifier, classifier_path)
joblib.dump(vectorizer, vectorizer_path)

["D:\\ELI\\Education\\Programming\\RevU's\\machine_learning\\models/test_model/tfidf_vectorizer.joblib"]

In [10]:
# Get the file size
vectorizer_size = os.path.getsize(classifier_path) / 1024  # Convert to MB
classifier_size = os.path.getsize(vectorizer_path) / 1024  # Convert to MB

In [11]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Evaluate the metrics
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print the metrics to the console
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_rep)

# Save the metrics
eval_path = os.path.join(os.getcwd(), 'model_evaluations/0_evaluation_results.txt')
with open(eval_path, 'w') as file:
    file.write("Model Specifications\n")
    file.write(f"Classifier    : {classifier} \n")
    file.write(f"Vectorizer    : {vectorizer} \n")
    file.write(f"Dataset       : {load_df.shape[0]}\n")
    file.write(f"Sampled       : {df.shape[0]}({my_sample*100}% of Dataset)\n")
    file.write(f"Training set  : {len(x_train)} \n")
    file.write(f"Testing set   : {len(x_test)} \n\n")
    file.write(f"File Sizes\n")
    file.write(f"Vectorizer   : {vectorizer_size:.2f}KB \n")  
    file.write(f"Model        : {classifier_size:.2f}KB \n\n") 
  
    
    file.write("Model Evaluation\n")
    file.write(f"Accuracy      : {accuracy*100} \n\n")
    file.write(f"Confusion Matrix:\n{conf_matrix} \n\n")
    file.write(f"Classification Report:\n{classification_rep} \n")

print("Evaluation results saved to:", eval_path)

Accuracy: 0.8810040446473519
Confusion Matrix:
 [[ 2939   309  1737]
 [  772   663  3035]
 [  564   438 47150]]
Classification Report:
               precision    recall  f1-score   support

    Negative       0.69      0.59      0.63      4985
     Neutral       0.47      0.15      0.23      4470
    Positive       0.91      0.98      0.94     48152

    accuracy                           0.88     57607
   macro avg       0.69      0.57      0.60     57607
weighted avg       0.86      0.88      0.86     57607

Evaluation results saved to: D:\ELI\Education\Programming\RevU's\machine_learning\model_evaluations/0_evaluation_results.txt
