In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score
import xgboost as xgb

In [2]:
# Read the train and test splits from CSV files in the working directory.
train_df, test_df = map(pd.read_csv, ("train.csv", "test.csv"))

In [3]:
# Preprocess: merge the three free-text review fields into a single "text"
# column that the vectorizer consumes.
REVIEW_TEXT_COLUMNS = ["benefits_review", "side_effects_review", "comments_review"]


def _join_review_fields(df: pd.DataFrame) -> pd.Series:
    """Return the review columns of ``df`` joined by single spaces.

    Missing values are treated as empty strings. With plain ``+``
    concatenation a NaN in any one field turns the whole result into NaN,
    which discards the other two fields and leaves a non-string document
    in the "text" column.
    """
    fields = [df[col].fillna("").astype(str) for col in REVIEW_TEXT_COLUMNS]
    return fields[0].str.cat(fields[1:], sep=" ")


train_df["text"] = _join_review_fields(train_df)
test_df["text"] = _join_review_fields(test_df)

In [4]:
# Binary target: 1 when the rating is 5 or higher, 0 otherwise.
# The comparison is done column-wise instead of row by row.
train_df["sentiment"] = (train_df["rating"] >= 5).astype("int64")
test_df["sentiment"] = (test_df["rating"] >= 5).astype("int64")

In [5]:
# Hold out 20% of the training data for validation; the seed is fixed so the
# split is the same on every run.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df["text"],
    train_df["sentiment"],
    random_state=42,
    test_size=0.2,
)

In [6]:
# TF-IDF features: English stop words removed, vocabulary capped at 10,000 terms.
vectorizer = TfidfVectorizer(max_features=10000, stop_words="english")

# The vocabulary and IDF weights are learned from the training split only;
# the validation and test texts are projected with the already-fitted vectorizer.
X_train = vectorizer.fit_transform(train_texts)
X_val, X_test = (
    vectorizer.transform(documents) for documents in (val_texts, test_df["text"])
)

In [7]:
# Candidate classifiers as (display name, unfitted estimator) pairs.
models = [
    (
        "Logistic Regression",
        LogisticRegression(max_iter=1000),
    ),
    (
        "Support Vector Machines",
        LinearSVC(max_iter=10000),
    ),
    (
        "XGBoost",
        xgb.XGBClassifier(objective="binary:logistic"),
    ),
]

In [8]:
# Fit each candidate on the training features and report its accuracy and F1
# on the validation split.
for name, model in models:
    model.fit(X_train, train_labels)
    val_preds = model.predict(X_val)
    val_accuracy = accuracy_score(val_labels, val_preds)
    val_f1 = f1_score(val_labels, val_preds)
    print(f"{name} - Accuracy: {val_accuracy} | F1 Score: {val_f1}")

Logistic Regression - Accuracy: 0.8161290322580645 | F1 Score: 0.8956043956043956
Support Vector Machines - Accuracy: 0.8338709677419355 | F1 Score: 0.8997078870496592
XGBoost - Accuracy: 0.8096774193548387 | F1 Score: 0.8867562380038387


In [9]:
from sklearn.model_selection import GridSearchCV

In [10]:
# Tune the linear SVM: search over the regularisation strength C and the loss
# function, scoring each combination by F1 with 5-fold cross-validation.
param_grid = dict(
    C=[0.1, 1, 10],
    loss=["hinge", "squared_hinge"],
)

svm_grid_search = GridSearchCV(
    estimator=LinearSVC(max_iter=10000),
    param_grid=param_grid,
    scoring="f1",
    cv=5,
    n_jobs=-1,  # use every available core
)
svm_grid_search.fit(X_train, train_labels)

print("Best hyperparameters for SVM:", svm_grid_search.best_params_)

Best hyperparameters for SVM: {'C': 1, 'loss': 'hinge'}


In [11]:
# Take the best SVM straight from the grid search.
# GridSearchCV is left at its default refit=True, so after fit() it already
# holds an estimator built with best_params_ and fitted on the same
# X_train / train_labels it was given. Constructing and fitting a second,
# identical LinearSVC would only repeat that work.
best_svm = svm_grid_search.best_estimator_
best_svm

LinearSVC(C=1, loss='hinge', max_iter=10000)

In [12]:
# Score the tuned SVM on the test set.
test_preds = best_svm.predict(X_test)
test_accuracy = accuracy_score(test_df["sentiment"], test_preds)
test_f1 = f1_score(test_df["sentiment"], test_preds)
print(f"Best SVM Model - Accuracy: {test_accuracy} | F1 Score: {test_f1}")

Best SVM Model - Accuracy: 0.8233590733590733 | F1 Score: 0.8942807625649912


In [13]:
# Persist the tuned SVM and the fitted vectorizer as pickle files. Both are
# needed together to score new text.
import pickle

artifacts = {
    "best_svm_model.pkl": best_svm,
    "vectorizer.pkl": vectorizer,
}

for filename, artifact in artifacts.items():
    with open(filename, "wb") as f:
        pickle.dump(artifact, f)