In [11]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
data = pd.read_csv('./data/JAL_tripadvisor_reviews.csv')

# Display the first few rows of the dataset
print(data.head())

                                               title  \
0  Pleasant Flights For Senior Passengers From Ja...   
1            Dunno why people recommend this airline   
2                          BEST 5 HOURS FLIGHT EVER!   
3                     Nice Flight...Terrible Website   
4  Great Legroom, Best Flight Experience, On Sche...   

                                                text lang published_date  \
0  My elderly father, my aunt (  my late mother’s...   en     2024-11-05   
1  Dunno how people rate this airline as good, ea...   en     2024-10-21   
2  I flew Japan Airlines From Bangkok to Osaka, a...   en     2024-10-20   
3  In a way JAL reminds me of my entire Japan exp...   en     2024-10-19   
4  Great, maybe BEST flight experience. Always on...   en     2024-10-18   

  travel_date  helpful_votes  rating  
0  2024-05-31              0       4  
1  2024-10-31              0       1  
2  2024-10-31              2       5  
3  2023-12-31              1       5  
4  2024-08-

In [12]:
# get only the data we will work with
clean_data = data[data["lang"] == "en"][["title", "text", "rating"]]

clean_data.reset_index(drop=True, inplace=True)

# Display the first few rows of the cleaned dataset
print(clean_data.head())

# Split the data into training and test sets
X = clean_data[['title', 'text']]
y = clean_data['rating']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Download necessary NLTK data
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Function to clean text
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and numbers
    text = text.lower()  # Convert to lowercase
    return text

# Function to preprocess text
def preprocess_text(text):
    text = clean_text(text)
    tokens = word_tokenize(text)  # Tokenization
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # Stopword removal
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatization
    return ' '.join(tokens)

# Apply preprocessing to the title and text columns
X_train_cleaned = x_train['title'] + ' ' + x_train['text']
X_train_cleaned = X_train_cleaned.apply(preprocess_text)

X_test_cleaned = x_test['title'] + ' ' + x_test['text']
X_test_cleaned = X_test_cleaned.apply(preprocess_text)

# Vectorize the text using TF-IDF
vectorizer = TfidfVectorizer(max_features=1000)
x_train_tfidf = vectorizer.fit_transform(X_train_cleaned)
x_test_tfidf = vectorizer.transform(X_test_cleaned)

# Display the shape of the vectorized data
print(f'Training data shape: {x_train_tfidf.shape}')
print(f'Test data shape: {x_test_tfidf.shape}')




                                               title  \
0  Pleasant Flights For Senior Passengers From Ja...   
1            Dunno why people recommend this airline   
2                          BEST 5 HOURS FLIGHT EVER!   
3                     Nice Flight...Terrible Website   
4  Great Legroom, Best Flight Experience, On Sche...   

                                                text  rating  
0  My elderly father, my aunt (  my late mother’s...       4  
1  Dunno how people rate this airline as good, ea...       1  
2  I flew Japan Airlines From Bangkok to Osaka, a...       5  
3  In a way JAL reminds me of my entire Japan exp...       5  
4  Great, maybe BEST flight experience. Always on...       4  


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Training data shape: (2541, 1000)
Test data shape: (636, 1000)


In [13]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from cvxopt import matrix, solvers

# Function to solve SVM dual optimization problem
def solve_svm(X, y, C=1.0):
    n_samples, n_features = X.shape
    y = y.astype(float).reshape(-1, 1)

    # Compute the Gram matrix
    K = np.dot(X, X.T)
    P = matrix(np.outer(y, y) * K)  # Quadratic term
    q = matrix(-np.ones((n_samples, 1)))  # Linear term
    G = matrix(np.vstack((-np.eye(n_samples), np.eye(n_samples))))  # Constraints
    h = matrix(np.vstack((np.zeros((n_samples, 1)), C * np.ones((n_samples, 1)))))
    A = matrix(y.T)  # Equality constraint
    b = matrix(np.zeros(1))

    # Solve the quadratic program
    solvers.options['show_progress'] = False
    solution = solvers.qp(P, q, G, h, A, b)

    # Extract Lagrange multipliers
    alphas = np.ravel(solution['x'])
    return alphas

# Function to compute weights and bias
def compute_weights_and_bias(X, y, alphas, threshold=1e-4):
    support_vector_indices = np.where(alphas > threshold)[0]
    support_alphas = alphas[support_vector_indices]
    support_vectors = X[support_vector_indices]
    support_labels = y[support_vector_indices]

    # Calculate weights
    w = np.sum(support_alphas[:, None] * support_labels[:, None] * support_vectors, axis=0)

    # Calculate bias
    b = np.mean(support_labels - np.dot(support_vectors, w))
    return w, b, support_vectors

# Function for SVM prediction
def svm_predict(X, w, b):
    return np.sign(np.dot(X, w) + b)

# Convert y_train and y_test to binary (positive/negative sentiment)
y_train_binary = np.where(y_train >= 4, 1, -1)
y_test_binary = np.where(y_test >= 4, 1, -1)

# Train SVM from scratch
alphas = solve_svm(x_train_tfidf.toarray(), y_train_binary, C=1.0)
w, b, support_vectors = compute_weights_and_bias(x_train_tfidf.toarray(), y_train_binary, alphas)

# Predict on test set
y_pred = svm_predict(x_test_tfidf.toarray(), w, b)

# Convert predictions to binary class labels (1 or -1)
y_pred_binary = (y_pred > 0).astype(int)

# Compute evaluation metrics
accuracy = accuracy_score(y_test_binary, y_pred_binary)
precision = precision_score(y_test_binary, y_pred_binary, average='macro')
recall = recall_score(y_test_binary, y_pred_binary, average='macro')
f1 = f1_score(y_test_binary, y_pred_binary, average='macro')

# Display results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Accuracy: 0.7343
Precision: 0.3064
Recall: 0.3170
F1 Score: 0.3116


In [14]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV


# 1. SVM
svm_model = SVC(kernel='linear', C=1.0)
svm_model.fit(x_train_tfidf, y_train_binary)
svm_y_pred = svm_model.predict(x_test_tfidf)
svm_acc = accuracy_score(y_test_binary, svm_y_pred)
svm_prec = precision_score(y_test_binary, svm_y_pred, average='binary')
svm_rec = recall_score(y_test_binary, svm_y_pred, average='binary')
svm_f1 = f1_score(y_test_binary, svm_y_pred, average='binary')

# 2. Logistic Regression
lr_model = LogisticRegression(max_iter=1000, solver='liblinear', C=1.0)
lr_model.fit(x_train_tfidf, y_train_binary)
lr_y_pred = lr_model.predict(x_test_tfidf)
lr_acc = accuracy_score(y_test_binary, lr_y_pred)
lr_prec = precision_score(y_test_binary, lr_y_pred, average='binary')
lr_rec = recall_score(y_test_binary, lr_y_pred, average='binary')
lr_f1 = f1_score(y_test_binary, lr_y_pred, average='binary')

# 3. Random Forest
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf_model.fit(x_train_tfidf, y_train_binary)
rf_y_pred = rf_model.predict(x_test_tfidf)
rf_acc = accuracy_score(y_test_binary, rf_y_pred)
rf_prec = precision_score(y_test_binary, rf_y_pred, average='binary')
rf_rec = recall_score(y_test_binary, rf_y_pred, average='binary')
rf_f1 = f1_score(y_test_binary, rf_y_pred, average='binary')

# Print results
print("Model Evaluation Metrics:")
print(f"SVM -> Accuracy: {svm_acc:.4f}, Precision: {svm_prec:.4f}, Recall: {svm_rec:.4f}, F1 Score: {svm_f1:.4f}")
print(f"Logistic Regression -> Accuracy: {lr_acc:.4f}, Precision: {lr_prec:.4f}, Recall: {lr_rec:.4f}, F1 Score: {lr_f1:.4f}")
print(f"Random Forest -> Accuracy: {rf_acc:.4f}, Precision: {rf_prec:.4f}, Recall: {rf_rec:.4f}, F1 Score: {rf_f1:.4f}")

# Hyperparameter Tuning Example (for SVM using GridSearchCV)
param_grid_svm = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
grid_search_svm = GridSearchCV(SVC(), param_grid_svm, cv=5, scoring='f1')
grid_search_svm.fit(x_train_tfidf, y_train_binary)

print("Best Parameters for SVM:", grid_search_svm.best_params_)
print("Best F1 Score for SVM:", grid_search_svm.best_score_)

# Hyperparameter Tuning Example (for Logistic Regression using GridSearchCV)
param_grid_lr = {'C': [0.1, 1, 10], 'solver': ['liblinear', 'lbfgs']}
grid_search_lr = GridSearchCV(LogisticRegression(max_iter=1000), param_grid_lr, cv=5, scoring='f1')
grid_search_lr.fit(x_train_tfidf, y_train_binary)

print("Best Parameters for Logistic Regression:", grid_search_lr.best_params_)
print("Best F1 Score for Logistic Regression:", grid_search_lr.best_score_)

# Hyperparameter Tuning Example (for Random Forest using GridSearchCV)
param_grid_rf = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=5, scoring='f1')
grid_search_rf.fit(x_train_tfidf, y_train_binary)

print("Best Parameters for Random Forest:", grid_search_rf.best_params_)
print("Best F1 Score for Random Forest:", grid_search_rf.best_score_)


Model Evaluation Metrics:
SVM -> Accuracy: 0.9104, Precision: 0.9125, Recall: 0.9776, F1 Score: 0.9440
Logistic Regression -> Accuracy: 0.8805, Precision: 0.8712, Recall: 0.9919, F1 Score: 0.9276
Random Forest -> Accuracy: 0.8223, Precision: 0.8140, Recall: 0.9980, F1 Score: 0.8966
Best Parameters for SVM: {'C': 10, 'kernel': 'rbf'}
Best F1 Score for SVM: 0.9485249662771841
Best Parameters for Logistic Regression: {'C': 10, 'solver': 'liblinear'}
Best F1 Score for Logistic Regression: 0.9458933439588556
Best Parameters for Random Forest: {'max_depth': None, 'n_estimators': 200}
Best F1 Score for Random Forest: 0.9360000134458852


# Model Comparison Report

## Implemented SVM:
- Accuracy: 0.7343
- Precision: 0.3064
- Recall: 0.3170
- F1 Score: 0.3116

## Built-in SVM:
- Accuracy: 0.9104
- Precision: 0.9125
- Recall: 0.9776
- F1 Score: 0.9440
- Best F1 Score: 0.9485

## Logistic Regression:
- Accuracy: 0.8805
- Precision: 0.8712
- Recall: 0.9919
- F1 Score: 0.9276
- Best F1 Score: 0.9459

## Random Forest:
- Accuracy: 0.8223
- Precision: 0.8140
- Recall: 0.9980
- F1 Score: 0.8966
- Best F1 Score: 0.9360

## Best Model:
The best model based on F1 Score is: **Built-in SVM**
