# Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, auc
from sklearn.preprocessing import label_binarize
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import SMOTE
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import warnings
warnings.filterwarnings('ignore')

# Load the Dataset

## Upload train_data,test_data , test_data_hidden & train_data_tfidf

In [2]:
train_data = pd.read_csv(r'C:\Users\zubair_khan\Desktop\Data_Science\Projects\Capstone-Project-2-digi-chrome\data\cleaned_train_data.csv')
test_data = pd.read_csv(r'C:\Users\zubair_khan\Desktop\Data_Science\Projects\Capstone-Project-2-digi-chrome\data\cleaned_test_data.csv')
test_data_hidden = pd.read_csv(r'C:\Users\zubair_khan\Desktop\Data_Science\Projects\Capstone-Project-2-digi-chrome\data\cleaned_test_data_hidden.csv')

In [3]:
train_data.head(2)

Unnamed: 0,Name of the product,Product Brand,categories,primaryCategories,reviews.date,reviews.text,reviews.title,sentiment
0,"All-New Fire HD 8 Tablet, 8"" HD Display, Wi-Fi...",Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...",Electronics,2016-12-26T00:00:00.000Z,purchased black fridaypros great price even sa...,powerful tablet,Positive
1,Amazon - Echo Plus w/ Built-In Hub - Silver,Amazon,"Amazon Echo,Smart Home,Networking,Home & Tools...","Electronics,Hardware",2018-01-17T00:00:00.000Z,purchased amazon echo plus dot plus four fire ...,amazon echo plus awesome,Positive


In [4]:
test_data.head(2)

Unnamed: 0,Name of the product,Product Brand,categories,primaryCategories,reviews.date,reviews.text,reviews.title
0,"Fire Tablet, 7 Display, Wi-Fi, 16 GB - Include...",Amazon,"Fire Tablets,Computers/Tablets & Networking,Ta...",Electronics,2016-05-23T00:00:00.000Z,amazon kindle fire free used that want online ...,very handy device
1,Amazon Echo Show Alexa-enabled Bluetooth Speak...,Amazon,"Computers,Amazon Echo,Virtual Assistant Speake...","Electronics,Hardware",2018-01-02T00:00:00.000Z,echo show great addition amazon family work ju...,another winner from amazon


In [5]:
test_data_hidden.head(2)

Unnamed: 0,Name of the product,Product Brand,categories,primaryCategories,reviews.date,reviews.text,reviews.title,sentiment
0,"Fire Tablet, 7 Display, Wi-Fi, 16 GB - Include...",Amazon,"Fire Tablets,Computers/Tablets & Networking,Ta...",Electronics,2016-05-23T00:00:00.000Z,amazon kindle fire free used that want online ...,very handy device,Positive
1,Amazon Echo Show Alexa-enabled Bluetooth Speak...,Amazon,"Computers,Amazon Echo,Virtual Assistant Speake...","Electronics,Hardware",2018-01-02T00:00:00.000Z,echo show great addition amazon family work ju...,another winner from amazon,Positive


In [6]:
train_data.columns, test_data.columns, test_data_hidden.columns

(Index(['Name of the product', 'Product Brand', 'categories',
        'primaryCategories', 'reviews.date', 'reviews.text', 'reviews.title',
        'sentiment'],
       dtype='object'),
 Index(['Name of the product', 'Product Brand', 'categories',
        'primaryCategories', 'reviews.date', 'reviews.text', 'reviews.title'],
       dtype='object'),
 Index(['Name of the product', 'Product Brand', 'categories',
        'primaryCategories', 'reviews.date', 'reviews.text', 'reviews.title',
        'sentiment'],
       dtype='object'))

In [7]:
train_data.duplicated().sum() , test_data.duplicated().sum() , test_data_hidden.duplicated().sum()

(0, 0, 0)

In [9]:
# Handle missing values (if any)
train_data.fillna("", inplace=True)
test_data.fillna("", inplace=True)
test_data_hidden.fillna("", inplace=True)

In [10]:
train_data.isnull().sum() , test_data.isnull().sum() , test_data_hidden.isnull().sum()

(Name of the product    0
 Product Brand          0
 categories             0
 primaryCategories      0
 reviews.date           0
 reviews.text           0
 reviews.title          0
 sentiment              0
 dtype: int64,
 Name of the product    0
 Product Brand          0
 categories             0
 primaryCategories      0
 reviews.date           0
 reviews.text           0
 reviews.title          0
 dtype: int64,
 Name of the product    0
 Product Brand          0
 categories             0
 primaryCategories      0
 reviews.date           0
 reviews.text           0
 reviews.title          0
 sentiment              0
 dtype: int64)

### Task-2.1- Multi-class SVM and Neural Nets:
##### Data Preprocessing

In [11]:
# Combine 'reviews.title' and 'reviews.text' into one feature
train_data['text'] = train_data['reviews.title'] + " " + train_data['reviews.text']
test_data['text'] = test_data['reviews.title'] + " " + test_data['reviews.text']
test_data_hidden['text'] = test_data_hidden['reviews.title'] + " " + test_data_hidden['reviews.text']

In [13]:
# Encode target labels
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
train_data['sentiment_encoded'] = label_encoder.fit_transform(train_data['sentiment'])

In [14]:
# Vectorize text using TF-IDF
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')  
X_train_tfidf = tfidf.fit_transform(train_data['text'])
X_test_tfidf = tfidf.transform(test_data['text'])
X_test_hidden_tfidf = tfidf.transform(test_data_hidden['text'])

In [15]:
# Split train data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_train_tfidf, train_data['sentiment_encoded'], test_size=0.2, random_state=42
)

##### Multi-class Support Vector Machines (SVM)

In [16]:
# Train the SVM model
svm_model = SVC(kernel='linear', C=1.0, decision_function_shape='ovr', random_state=42)
svm_model.fit(X_train, y_train)

In [17]:
# Evaluate on validation data
y_val_pred_svm = svm_model.predict(X_val)

In [19]:
# Metrics
from sklearn.metrics import classification_report, accuracy_score
print("SVM Classification Report:")
print(classification_report(y_val, y_val_pred_svm, target_names=label_encoder.classes_))
print("Validation Accuracy (SVM):", accuracy_score(y_val, y_val_pred_svm))

SVM Classification Report:
              precision    recall  f1-score   support

    Negative       1.00      0.25      0.40        20
     Neutral       0.50      0.03      0.06        29
    Positive       0.94      1.00      0.97       737

    accuracy                           0.94       786
   macro avg       0.81      0.43      0.48       786
weighted avg       0.93      0.94      0.92       786

Validation Accuracy (SVM): 0.9440203562340967


##### Neural Networks

In [21]:
# One-hot encode target labels
from tensorflow.keras.utils import to_categorical
y_train_onehot = to_categorical(y_train, num_classes=len(label_encoder.classes_))
y_val_onehot = to_categorical(y_val, num_classes=len(label_encoder.classes_))

In [22]:
# Build the neural network
nn_model = Sequential([
    Dense(512, input_shape=(X_train.shape[1],), activation='relu'),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(len(label_encoder.classes_), activation='softmax')
])

In [23]:
# Compile the model
nn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [25]:
nn_model.summary()

In [24]:
# Train the model
history = nn_model.fit(X_train, y_train_onehot, validation_data=(X_val, y_val_onehot), epochs=10, batch_size=64, verbose=1)

Epoch 1/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 61ms/step - accuracy: 0.8487 - loss: 0.6436 - val_accuracy: 0.9377 - val_loss: 0.2387
Epoch 2/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 40ms/step - accuracy: 0.9365 - loss: 0.2120 - val_accuracy: 0.9377 - val_loss: 0.2186
Epoch 3/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 40ms/step - accuracy: 0.9335 - loss: 0.1581 - val_accuracy: 0.9377 - val_loss: 0.2167
Epoch 4/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 46ms/step - accuracy: 0.9275 - loss: 0.1339 - val_accuracy: 0.9377 - val_loss: 0.2195
Epoch 5/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 40ms/step - accuracy: 0.9461 - loss: 0.0923 - val_accuracy: 0.9338 - val_loss: 0.2334
Epoch 6/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 46ms/step - accuracy: 0.9715 - loss: 0.0687 - val_accuracy: 0.9326 - val_loss: 0.2632
Epoch 7/10
[1m50/50[0m [32m━━━━

In [26]:
# Evaluate on validation data
val_loss, val_accuracy = nn_model.evaluate(X_val, y_val_onehot)
print(f"Validation Accuracy (Neural Network): {val_accuracy:.4f}")

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9316 - loss: 0.4277
Validation Accuracy (Neural Network): 0.9275


##### Predictions on Test Data

In [28]:
# SVM Predictions
y_test_pred_svm = svm_model.predict(X_test_tfidf)
y_test_hidden_pred_svm = svm_model.predict(X_test_hidden_tfidf)
y_test_hidden_pred_svm

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

In [29]:
# Neural Network Predictions
y_test_pred_nn = nn_model.predict(X_test_tfidf)
y_test_hidden_pred_nn = nn_model.predict(X_test_hidden_tfidf)
y_test_hidden_pred_nn

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


array([[1.6227698e-07, 4.3109424e-07, 9.9999940e-01],
       [8.2141553e-13, 3.9796784e-11, 1.0000000e+00],
       [3.0163760e-10, 1.2255619e-08, 1.0000000e+00],
       ...,
       [4.6609838e-08, 1.6595968e-06, 9.9999833e-01],
       [2.6479589e-02, 1.6225846e-01, 8.1126201e-01],
       [1.7357265e-11, 9.0634131e-09, 1.0000000e+00]], dtype=float32)

In [30]:
# Decode predictions
y_test_pred_svm_labels = label_encoder.inverse_transform(y_test_pred_svm)
y_test_hidden_pred_svm_labels = label_encoder.inverse_transform(y_test_hidden_pred_svm)

In [32]:
y_test_pred_nn_labels = label_encoder.inverse_transform(np.argmax(y_test_pred_nn, axis=1))
y_test_pred_nn_labels

array(['Positive', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Positive', 'Neutral', 'Positive',
       'Positive', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Positive', 'Positive', 'Positiv

In [33]:
y_test_hidden_pred_nn_labels = label_encoder.inverse_transform(np.argmax(y_test_hidden_pred_nn, axis=1))
y_test_hidden_pred_nn_labels

array(['Positive', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Positive', 'Neutral', 'Positive',
       'Positive', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Positive', 'Positive', 'Positiv

##### Comparing SVM and Neural Network

In [35]:
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

In [36]:
# SVM Metrics
y_val_pred_svm = svm_model.predict(X_val)
svm_accuracy = accuracy_score(y_val, y_val_pred_svm)
svm_f1 = f1_score(y_val, y_val_pred_svm, average='weighted')
svm_precision = precision_score(y_val, y_val_pred_svm, average='weighted')
svm_recall = recall_score(y_val, y_val_pred_svm, average='weighted')

print("SVM Metrics:")
print(f"Accuracy: {svm_accuracy:.4f}")
print(f"F1-Score: {svm_f1:.4f}")
print(f"Precision: {svm_precision:.4f}")
print(f"Recall: {svm_recall:.4f}")
print("\nSVM Classification Report:")
print(classification_report(y_val, y_val_pred_svm, target_names=label_encoder.classes_))

SVM Metrics:
Accuracy: 0.9440
F1-Score: 0.9230
Precision: 0.9298
Recall: 0.9440

SVM Classification Report:
              precision    recall  f1-score   support

    Negative       1.00      0.25      0.40        20
     Neutral       0.50      0.03      0.06        29
    Positive       0.94      1.00      0.97       737

    accuracy                           0.94       786
   macro avg       0.81      0.43      0.48       786
weighted avg       0.93      0.94      0.92       786



In [37]:
# Neural Network Metrics
y_val_pred_nn = nn_model.predict(X_val)
y_val_pred_nn_classes = np.argmax(y_val_pred_nn, axis=1)  
nn_accuracy = accuracy_score(y_val, y_val_pred_nn_classes)
nn_f1 = f1_score(y_val, y_val_pred_nn_classes, average='weighted')
nn_precision = precision_score(y_val, y_val_pred_nn_classes, average='weighted')
nn_recall = recall_score(y_val, y_val_pred_nn_classes, average='weighted')

print("\nNeural Network Metrics:")
print(f"Accuracy: {nn_accuracy:.4f}")
print(f"F1-Score: {nn_f1:.4f}")
print(f"Precision: {nn_precision:.4f}")
print(f"Recall: {nn_recall:.4f}")
print("\nNeural Network Classification Report:")
print(classification_report(y_val, y_val_pred_nn_classes, target_names=label_encoder.classes_))

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step

Neural Network Metrics:
Accuracy: 0.9275
F1-Score: 0.9262
Precision: 0.9262
Recall: 0.9275

Neural Network Classification Report:
              precision    recall  f1-score   support

    Negative       0.62      0.40      0.48        20
     Neutral       0.26      0.28      0.27        29
    Positive       0.96      0.97      0.96       737

    accuracy                           0.93       786
   macro avg       0.61      0.55      0.57       786
weighted avg       0.93      0.93      0.93       786



#### Explanation
* Accuracy: Measures the proportion of correct predictions out of the total predictions.
* F1-score: Harmonic mean of precision and recall, especially useful for imbalanced classes.
* Precision: Proportion of correctly predicted positive observations to the total predicted positives.
* Recall: Proportion of correctly predicted positive observations to all observations in the actual class.

In [None]:
### Tastk-2.2- Ensemble Techniques: