In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import copy
import math
from collections import defaultdict, Counter
from sklearn.metrics import precision_recall_fscore_support


In [2]:
!pip install transformers
!pip install pyspellchecker
!pip install transformers[torch]

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m66.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m71.9 MB/s[0m eta [36m0:00:00[0m
Colle

In [3]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB,CategoricalNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import sklearn
from transformers import TrainingArguments, Trainer
import torch
from torch.utils.data import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 1.2.2.


In [4]:
import warnings

# ignore future warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Reading and Pre-processing


In [6]:
import pandas as pd

def preprocess(fileName1, fileName2):
    ## read the csv files
    data1 = pd.read_csv(fileName1)
    data2 = pd.read_csv(fileName2)

    # Drop the first column of data2
    data2 = data2.drop(data2.columns[0], axis=1)

    # Check if the number of rows in both datasets are the same
    if len(data1) != len(data2):
        raise ValueError("The number of rows in the two datasets do not match!")

    # Convert -1 to 0 in the 'rating' column of data1
    data1['rating'] = data1['rating'].replace(-1, 0)

    # Concatenate data2 (word embeddings) with 'rating' and 'dr-id-adjusted' columns from data1
    merged_data = pd.concat([data1[['dr-id-adjusted', 'rating']], data2], axis=1)

    # Splitting the dataset into features and target
    # Assuming all other columns except 'rating' in merged_data are features
    features = merged_data.drop(columns=['rating'])
    target = merged_data['rating']

    return merged_data, features, target


# Word embeddings 384

In [None]:
## read the data of word embedding
dataset_train, features_train,target_train = preprocess("/content/drive/MyDrive/dataset/TRAIN.csv","/content/drive/MyDrive/dataset/384EMBEDDINGS_TRAIN.csv")
dataset_val, features_val, target_val = preprocess("/content/drive/MyDrive/dataset/VALIDATION.csv","/content/drive/MyDrive/dataset/384EMBEDDINGS_VALIDATION.csv")

#### Baseline methods


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.dummy import DummyClassifier

def baselines(dataset_train, dataset_val):

    ZeroR_Acc_1 = []
    WRand_Acc_1 = []

    ## your code here
    report = []
    train_x, train_y = dataset_train.drop(['rating'], axis=1), dataset_train['rating']
    val_x, val_y = dataset_val.drop(['rating'], axis=1), dataset_val['rating']

    # Train and test Zero-R
    zero_r = DummyClassifier(strategy='most_frequent')
    zero_r.fit(train_x, train_y)
    zero_r_predictions = zero_r.predict(val_x)
    zero_r_accuracy = accuracy_score(val_y, zero_r_predictions)
    zero_r_precision, zero_r_recall, zero_r_f1, _ = precision_recall_fscore_support(val_y, zero_r_predictions, average='binary')
    report_zero = classification_report(val_y, zero_r_predictions, zero_division=0)
    report.append(report_zero)
    ZeroR_Acc_1.append(zero_r_accuracy)

    # Train and test Weighted Random
    weighted_random = DummyClassifier(strategy='stratified')
    weighted_random.fit(train_x, train_y)
    weighted_random_predictions = weighted_random.predict(val_x)
    weighted_random_accuracy = accuracy_score(val_y, weighted_random_predictions)
    weighted_random_precision, weighted_random_recall, weighted_random_f1, _ = precision_recall_fscore_support(val_y, weighted_random_predictions, average='binary')
    report_w = classification_report(val_y, weighted_random_predictions, zero_division=0)
    report.append(report_w)
    WRand_Acc_1.append(weighted_random_accuracy)

    print("Accuracy of ZeroR:", np.mean(ZeroR_Acc_1).round(2))
    print("Precision of ZeroR:", zero_r_precision.round(2))
    print("Recall of ZeroR:", zero_r_recall.round(2))
    print("F1 Score of ZeroR:", zero_r_f1.round(2))

    print("Accuracy of Weighted Random:", np.mean(WRand_Acc_1).round(2))
    print("Precision of Weighted Random:", weighted_random_precision.round(2))
    print("Recall of Weighted Random:", weighted_random_recall.round(2))
    print("F1 Score of Weighted Random:", weighted_random_f1.round(2))

baselines(dataset_train, dataset_val)


Accuracy of ZeroR: 0.73
Precision of ZeroR: 0.73
Recall of ZeroR: 1.0
F1 Score of ZeroR: 0.85
Accuracy of Weighted Random: 0.61
Precision of Weighted Random: 0.74
Recall of Weighted Random: 0.73
F1 Score of Weighted Random: 0.73


logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

train_x, train_y = dataset_train.drop(['rating', 'dr-id-adjusted'], axis=1), dataset_train['rating']
val_x, val_y = dataset_val.drop(['rating', 'dr-id-adjusted'], axis=1), dataset_val['rating']

logistic_regression_model = LogisticRegression(solver='newton-cg', max_iter=1000)

logistic_regression_model.fit(train_x, train_y)

y_pred = logistic_regression_model.predict(val_x)

accuracy = accuracy_score(val_y, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(val_y, y_pred, average='binary')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.9196363636363636
Precision: 0.9385365853658536
Recall: 0.9529470034670628
F1 Score: 0.9456869009584663


test on the test dataset with logistics regression

In [None]:
test_data = pd.read_csv("D:/unimelb-3rd/IML/ASS3/dataset/384EMBEDDINGS_TEST.csv")
#test_data = pd.read_csv("D:/unimelb-3rd/IML/ASS3/dataset/TFIDF_TEST.csv")

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,374,375,376,377,378,379,380,381,382,383
0,48503,-0.022147,-0.022475,0.086003,0.007822,0.012784,-0.000159,0.011789,0.021219,-0.058783,...,-0.065846,0.021954,0.063496,-0.023876,-0.080697,0.025039,0.03598,-0.004304,-0.091479,0.032933
1,48504,0.017011,0.08469,0.031479,-0.018542,-0.136334,-0.040547,0.024581,-0.001718,0.031045,...,-0.022533,0.003009,0.063849,-0.029857,-0.009398,0.083773,0.048096,-0.021208,0.018363,-0.004735
2,48505,-0.019075,0.014318,-0.0547,-0.013129,-0.138464,-0.054037,-0.006527,0.07305,-0.024574,...,-0.012777,0.025411,-0.040082,-0.02808,-0.058692,0.009434,0.103693,-0.076458,-0.092033,-0.020267
3,48506,-0.014013,-0.022374,-0.02047,-0.037221,-0.161702,-0.060834,0.020752,0.023565,-0.039837,...,-0.052155,0.076453,-0.034619,-0.002245,-0.035704,0.010705,0.029748,-0.034872,-0.07028,0.032426
4,48507,-0.021505,0.092344,0.000482,-0.024316,-0.098049,-0.08382,0.015389,-0.033406,-0.018172,...,0.035989,0.000115,0.041575,-0.082211,-0.083002,0.126218,0.010888,-0.027627,-0.060814,0.019299


In [None]:
test_x = test_data.drop(test_data.columns[0], axis=1)
predictions = logistic_regression_model.predict(test_x)
threshold = 0.5
predicted_labels = (predictions > threshold).astype(int)
predicted_labels[predicted_labels == 0] = -1
predicted_labels = predicted_labels.flatten()
#predicted_labels
result_df = pd.DataFrame({'id': np.arange(5514), 'rating': predicted_labels})
result_df.to_csv('logistic.csv', index=False, header=['id', 'rating'])

Random Forest Model

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# 定义参数网格
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# 创建随机森林模型
rf_model = RandomForestClassifier(random_state=42)

# 创建GridSearchCV对象
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, verbose=2)

# 执行网格搜索
grid_search.fit(train_x, train_y)

# 获取最佳参数
best_params = grid_search.best_params_

# 使用最佳模型进行预测
best_model = grid_search.best_estimator_
y_pred = best_model.predict(val_x)

# 计算准确率
accuracy = accuracy_score(val_y, y_pred)
print("Best Parameters:", best_params)
print("Accuracy:", accuracy)


Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END max_depth=10, min_samples_split=2, n_estimators=100; total time= 1.0min
[CV] END max_depth=10, min_samples_split=2, n_estimators=100; total time= 1.0min
[CV] END max_depth=10, min_samples_split=2, n_estimators=100; total time= 1.1min
[CV] END max_depth=10, min_samples_split=2, n_estimators=100; total time= 1.0min
[CV] END max_depth=10, min_samples_split=2, n_estimators=100; total time= 1.1min
[CV] END max_depth=10, min_samples_split=2, n_estimators=200; total time= 2.1min
[CV] END max_depth=10, min_samples_split=2, n_estimators=200; total time= 2.1min
[CV] END max_depth=10, min_samples_split=2, n_estimators=200; total time= 2.3min
[CV] END max_depth=10, min_samples_split=2, n_estimators=200; total time= 2.4min
[CV] END max_depth=10, min_samples_split=2, n_estimators=200; total time= 2.8min
[CV] END max_depth=10, min_samples_split=2, n_estimators=300; total time= 4.3min
[CV] END max_depth=10, min_samples_split=2, n_e

[CV] END max_depth=30, min_samples_split=2, n_estimators=300; total time= 6.1min
[CV] END max_depth=30, min_samples_split=2, n_estimators=300; total time= 6.1min
[CV] END max_depth=30, min_samples_split=2, n_estimators=300; total time= 6.1min
[CV] END max_depth=30, min_samples_split=2, n_estimators=300; total time= 6.0min
[CV] END max_depth=30, min_samples_split=5, n_estimators=100; total time= 2.0min
[CV] END max_depth=30, min_samples_split=5, n_estimators=100; total time= 2.0min
[CV] END max_depth=30, min_samples_split=5, n_estimators=100; total time= 2.0min
[CV] END max_depth=30, min_samples_split=5, n_estimators=100; total time= 2.0min
[CV] END max_depth=30, min_samples_split=5, n_estimators=100; total time= 2.0min
[CV] END max_depth=30, min_samples_split=5, n_estimators=200; total time= 4.0min
[CV] END max_depth=30, min_samples_split=5, n_estimators=200; total time= 4.0min
[CV] END max_depth=30, min_samples_split=5, n_estimators=200; total time= 4.0min
[CV] END max_depth=30, min_s

Simple neuro network (MLP)

In [None]:
from keras.optimizers import Adam
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Construct the network
model = keras.Sequential()
model.add(layers.Input(shape=(384,))) # Input layer
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))  # Output layer

# Set the learning rate
custom_optimizer = Adam(learning_rate=0.001)

# Compile the model
model.compile(optimizer=custom_optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Training
model.fit(train_x, train_y, epochs=10, batch_size=32, validation_data=(val_x, val_y))

# Evaluate
y_pred = (model.predict(val_x) > 0.5).astype(int)
accuracy = accuracy_score(val_y, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(val_y, y_pred, average='binary')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.9196363636363636
Precision: 0.9692066805845512
Recall: 0.9197622585438335
F1 Score: 0.9438373570520966


Now try training a model with SVM to see the effect

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 1. Initialize the SVM model with a linear kernel (You can choose different kernel functions)
svm_model = SVC(kernel="linear")

# 2. Train the SVM model
svm_model.fit(train_x, train_y)

# 3. Make predictions on the validation set
val_predictions = svm_model.predict(val_x)

# 4. Calculate accuracy, precision, recall, and F1 score
accuracy = accuracy_score(val_y, val_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(val_y, val_predictions, average='binary')

# 6. Print evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Accuracy: 0.9223636363636364
Precision: 0.9443760767905488
Recall: 0.950222882615156
F1 Score: 0.947290457968152


# Now try RandomForest with TFIDF

In [17]:
## read the data of TFIDF
dataset_train, features_train,target_train = preprocess("/content/drive/MyDrive/dataset/TRAIN.csv","/content/drive/MyDrive/dataset/TFIDF_TRAIN.csv")
dataset_val, features_val, target_val = preprocess("/content/drive/MyDrive/dataset/VALIDATION.csv","/content/drive/MyDrive/dataset/TFIDF_VALIDATION.csv")

In [None]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def baselines(dataset_train, dataset_val):
    ZeroR_Acc_1 = []
    ZeroR_Precision = []
    ZeroR_Recall = []
    ZeroR_F1 = []

    WRand_Acc_1 = []
    WRand_Precision = []
    WRand_Recall = []
    WRand_F1 = []

    train_x, train_y = dataset_train.drop(['rating'], axis=1), dataset_train['rating']
    val_x, val_y = dataset_val.drop(['rating'], axis=1), dataset_val['rating']

    # Train and test Zero-R
    zero_r = DummyClassifier(strategy='most_frequent')
    zero_r.fit(train_x, train_y)
    zero_r_predictions = zero_r.predict(val_x)
    zero_r_accuracy = accuracy_score(val_y, zero_r_predictions)
    zero_r_precision, zero_r_recall, zero_r_f1, _ = precision_recall_fscore_support(val_y, zero_r_predictions, average='binary')
    ZeroR_Acc_1.append(zero_r_accuracy)
    ZeroR_Precision.append(zero_r_precision)
    ZeroR_Recall.append(zero_r_recall)
    ZeroR_F1.append(zero_r_f1)

    # Train and test Weighted Random
    weighted_random = DummyClassifier(strategy='stratified')
    weighted_random.fit(train_x, train_y)
    weighted_random_predictions = weighted_random.predict(val_x)
    weighted_random_accuracy = accuracy_score(val_y, weighted_random_predictions)
    weighted_random_precision, weighted_random_recall, weighted_random_f1, _ = precision_recall_fscore_support(val_y, weighted_random_predictions, average='binary')
    WRand_Acc_1.append(weighted_random_accuracy)
    WRand_Precision.append(weighted_random_precision)
    WRand_Recall.append(weighted_random_recall)
    WRand_F1.append(weighted_random_f1)

    # Print evaluation metrics
    print("Zero-R Metrics:")
    print("Accuracy:", np.mean(ZeroR_Acc_1).round(2))
    print("Precision:", np.mean(ZeroR_Precision).round(2))
    print("Recall:", np.mean(ZeroR_Recall).round(2))
    print("F1 Score:", np.mean(ZeroR_F1).round(2))

    print("Weighted Random Metrics:")
    print("Accuracy:", np.mean(WRand_Acc_1).round(2))
    print("Precision:", np.mean(WRand_Precision).round(2))
    print("Recall:", np.mean(WRand_Recall).round(2))
    print("F1 Score:", np.mean(WRand_F1).round(2))

baselines(dataset_train, dataset_val)

Zero-R Metrics:
Accuracy: 0.73
Precision: 0.73
Recall: 1.0
F1 Score: 0.85
Weighted Random Metrics:
Accuracy: 0.61
Precision: 0.74
Recall: 0.72
F1 Score: 0.73


Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def evaluate_random_forest(dataset_train, dataset_val):
    train_x, train_y = dataset_train.drop(['rating'], axis=1), dataset_train['rating']
    val_x, val_y = dataset_val.drop(['rating'], axis=1), dataset_val['rating']

    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(train_x, train_y)

    y_pred = rf_model.predict(val_x)

    accuracy = accuracy_score(val_y, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(val_y, y_pred, average='binary')

    print("Random Forest Metrics:")
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)

evaluate_random_forest(dataset_train, dataset_val)


Random Forest Metrics:
Accuracy: 0.8903636363636364
Precision: 0.9325107025938051
Recall: 0.9170381376919267
F1 Score: 0.9247097015857161


Simple neuro network

In [None]:
from tensorflow import keras
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Prepare the data
train_x, train_y = dataset_train.drop(['rating', 'dr-id-adjusted'], axis=1), dataset_train['rating']
val_x, val_y = dataset_val.drop(['rating', 'dr-id-adjusted'], axis=1), dataset_val['rating']

# Construct the network
model = keras.Sequential()
model.add(keras.layers.Input(shape=(500,)))  # Input layer
model.add(keras.layers.Dense(128, activation='relu'))  # Hidden layer 1
model.add(keras.layers.Dense(1, activation='sigmoid'))  # Output layer

# Set the learning rate
custom_optimizer = keras.optimizers.Adam(learning_rate=0.001)

# Compile the model
model.compile(optimizer=custom_optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Training
model.fit(train_x, train_y, epochs=10, batch_size=32, validation_data=(val_x, val_y))

# Evaluation
y_pred = model.predict(val_x)
y_pred_binary = [1 if pred > 0.5 else 0 for pred in y_pred]

accuracy = accuracy_score(val_y, y_pred_binary)
precision, recall, f1, _ = precision_recall_fscore_support(val_y, y_pred_binary, average='binary')

print("Neural Network Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Neural Network Metrics:
Accuracy: 0.9065454545454545
Precision: 0.9424912104470116
Recall: 0.9294205052005944
F1 Score: 0.9359102244389028


Logistics regression with TFIDF

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

train_x, train_y = dataset_train.drop(['rating', 'dr-id-adjusted'], axis=1), dataset_train['rating']
val_x, val_y = dataset_val.drop(['rating', 'dr-id-adjusted'], axis=1), dataset_val['rating']

logistic_regression_model = LogisticRegression(solver='newton-cg', max_iter=1000)

logistic_regression_model.fit(train_x, train_y)

y_pred = logistic_regression_model.predict(val_x)

accuracy = accuracy_score(val_y, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(val_y, y_pred, average='binary')

print("Logistic Regression Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Logistic Regression Metrics:
Accuracy: 0.9063636363636364
Precision: 0.9431446540880503
Recall: 0.928429915799901
F1 Score: 0.9357294396605516


ablation + TFIDF + logistics

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.feature_selection import RFE
import numpy as np

# Train a baseline model
baseline_model = LogisticRegression()
baseline_model.fit(train_x, train_y)
baseline_predictions = baseline_model.predict(val_x)

# Calculate baseline metrics
baseline_accuracy = accuracy_score(val_y, baseline_predictions)
baseline_precision, baseline_recall, baseline_f1, _ = precision_recall_fscore_support(val_y, baseline_predictions, average='binary')

print("Baseline Metrics:")
print("Accuracy:", baseline_accuracy)
print("Precision:", baseline_precision)
print("Recall:", baseline_recall)
print("F1 Score:", baseline_f1)

# Create an RFE model with logistic regression
rfe = RFE(LogisticRegression(), n_features_to_select=50)  # Choose the top k features

# Fit the RFE model on the training data
rfe.fit(train_x, train_y)

# Get the selected features
selected_features = np.array(train_x.columns)[rfe.support_]

# Retrain the model with the selected features
final_X_train = train_x[selected_features]
final_X_val = val_x[selected_features]

final_model = LogisticRegression()
final_model.fit(final_X_train, train_y)
final_predictions = final_model.predict(final_X_val)

# Calculate final metrics
final_accuracy = accuracy_score(val_y, final_predictions)
final_precision, final_recall, final_f1, _ = precision_recall_fscore_support(val_y, final_predictions, average='binary')

print("Final Metrics with Selected Features:")
print("Accuracy:", final_accuracy)
print("Precision:", final_precision)
print("Recall:", final_recall)
print("F1 Score:", final_f1)

Baseline Metrics:
Accuracy: 0.9063636363636364
Precision: 0.9431446540880503
Recall: 0.928429915799901
F1 Score: 0.9357294396605516
Final Metrics with Selected Features:
Accuracy: 0.87
Precision: 0.9194647816208028
Recall: 0.9019316493313522
F1 Score: 0.9106138267283411


# Raw text

 training with raw text using TinyBERT

In [7]:
raw_train_data = pd.read_csv("/content/drive/MyDrive/dataset/TRAIN.csv")
raw_train_data = raw_train_data.drop(["Unnamed: 0","dr-id-adjusted","dr_id_gender"], axis=1)
raw_val_data = pd.read_csv("/content/drive/MyDrive/dataset/VALIDATION.csv")
raw_val_data = raw_val_data.drop(["Unnamed: 0","dr-id-adjusted","dr_id_gender"], axis=1)
raw_train_data['rating'] = raw_train_data['rating'].replace(-1, 0)
raw_val_data['rating'] = raw_val_data['rating'].replace(-1, 0)
clean_train_data =  raw_train_data.copy()
clean_val_data =  raw_val_data.copy()

In [8]:
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item


In [8]:
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from spellchecker import SpellChecker

def clean_and_preprocess_text(raw_text_column):
    # Initialize the spell checker
    spell = SpellChecker()

    # Initialize the list of stopwords
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

    # Initialize the stemmer
    stemmer = PorterStemmer()

    # Initialize the lemmatizer
    nltk.download('wordnet')
    lemmatizer = WordNetLemmatizer()

    # Define the text cleaning and preprocessing function
    def preprocess_text(text):
        if text is None or pd.isnull(text):  # Check for missing values
            return ""  # Return an empty string if the text is empty or missing
        elif text.strip() == "":  # Check if the text contains only spaces
            return ""  # Return an empty string if it contains only spaces

        # 1. Remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))

        # 2. Split the text into a list of words
        words = text.split()

        # 3. Text normalization (convert to lowercase)
        text = text.lower()

        # 4. Remove stopwords
        words = text.split()
        filtered_words = [word for word in words if word not in stop_words]
        text = ' '.join(filtered_words)

        # 5. Stemming
        words = text.split()
        stemmed_words = [stemmer.stem(word) for word in words]
        text = ' '.join(stemmed_words)

        # 6. Lemmatization
        words = text.split()
        lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
        text = ' '.join(lemmatized_words)

        return text

    # Apply the preprocessing function to each text in the DataFrame column
    preprocessed_text_column = raw_text_column.apply(preprocess_text)

    return preprocessed_text_column

In [9]:
clean_train_data['review-text-cleaned'] = clean_and_preprocess_text(raw_train_data['review-text-cleaned'])
clean_val_data['review-text-cleaned'] = clean_and_preprocess_text(raw_val_data['review-text-cleaned'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


experiment with epoch = 3

In [9]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
import torch

model_name = "prajjwal1/bert-tiny"  # TinyBERT model name
num_labels = 2  # Binary classification task with two classes

# Initialize the model and tokenizer, and move them to the GPU
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Convert text to token IDs using the tokenizer
train_encodings = tokenizer(raw_train_data['review-text-cleaned'].tolist(), truncation=True, max_length=512, padding=True, return_tensors="pt")
val_encodings = tokenizer(raw_val_data['review-text-cleaned'].tolist(), truncation=True, max_length=512, padding=True, return_tensors="pt")

# Create PyTorch datasets
train_dataset = CustomDataset(train_encodings, raw_train_data['rating'])
val_dataset = CustomDataset(val_encodings, raw_val_data['rating'])

num_epochs = 3
batch_size = 32
training_args = TrainingArguments(
    output_dir="./tinybert_classification",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    evaluation_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

Downloading (…)lve/main/config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Step,Training Loss,Validation Loss
500,0.3592,0.212254
1000,0.2112,0.196725
1500,0.1946,0.187966
2000,0.17,0.181188
2500,0.1726,0.17537
3000,0.1522,0.183764
3500,0.1504,0.181218
4000,0.1555,0.176375


TrainOutput(global_step=4032, training_loss=0.19561147122156053, metrics={'train_runtime': 182.658, 'train_samples_per_second': 706.287, 'train_steps_per_second': 22.074, 'total_flos': 163904262543360.0, 'train_loss': 0.19561147122156053, 'epoch': 3.0})

In [10]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-12, e

experiment with epoch = 4

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
import torch

model_name = "prajjwal1/bert-tiny"  # TinyBERT model name
num_labels = 2  # Binary classification task with two classes

# Initialize the model and tokenizer, and move them to the GPU
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Convert text to token IDs using the tokenizer
train_encodings = tokenizer(raw_train_data['review-text-cleaned'].tolist(), truncation=True, max_length=512, padding=True, return_tensors="pt")
val_encodings = tokenizer(raw_val_data['review-text-cleaned'].tolist(), truncation=True, max_length=512, padding=True, return_tensors="pt")

# Create PyTorch datasets
train_dataset = CustomDataset(train_encodings, raw_train_data['rating'])
val_dataset = CustomDataset(val_encodings, raw_val_data['rating'])

num_epochs = 4
batch_size = 32
training_args = TrainingArguments(
    output_dir="./tinybert_classification",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    evaluation_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

Downloading (…)lve/main/config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Step,Training Loss,Validation Loss
500,0.361,0.210442
1000,0.2088,0.197414
1500,0.1919,0.183498
2000,0.1679,0.181989
2500,0.17,0.173292
3000,0.1478,0.195236
3500,0.1454,0.182896
4000,0.1496,0.172012
4500,0.1379,0.183313
5000,0.1391,0.180834


TrainOutput(global_step=5376, training_loss=0.1793267166330701, metrics={'train_runtime': 236.9498, 'train_samples_per_second': 725.943, 'train_steps_per_second': 22.688, 'total_flos': 218539016724480.0, 'train_loss': 0.1793267166330701, 'epoch': 4.0})

In [None]:
trainer.save_model("./tinybert_classification")

In [None]:
raw_test_data = pd.read_csv("/content/drive/MyDrive/dataset/TEST_NO_LABELS.csv")
raw_test_data = raw_test_data.drop(["Unnamed: 0","dr-id-adjusted","dr_id_gender"], axis=1)
#raw_test_data['review-text-cleaned'] =  clean_and_preprocess_text(raw_test_data['review-text-cleaned'])

In [None]:
# Encode the test data using the tokenizer
test_encodings = tokenizer(raw_test_data['review-text-cleaned'].tolist(), truncation=True, padding=True, return_tensors="pt")

# Create a dummy label list with the same length as the test dataset (although these labels won't be used, they are needed as placeholders)
dummy_labels = [0] * len(raw_test_data)

# Create a test dataset (dummy labels will not be used)
test_dataset = CustomDataset(test_encodings, dummy_labels)

# Make predictions using the trained model on the test dataset
predictions = trainer.predict(test_dataset)

# Extract the predicted labels for each sample
predicted_labels = predictions.predictions.argmax(axis=1)

# Replace 0 labels with -1 (if necessary)
predicted_labels[predicted_labels == 0] = -1

# Flatten the predicted labels
predicted_labels = predicted_labels.flatten()

# Create a DataFrame with 'id' and 'rating' columns and save it as a CSV file
result_df = pd.DataFrame({'id': np.arange(5514), 'rating': predicted_labels})
result_df.to_csv('bert_raw_4.csv', index=False, header=['id', 'rating'])


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Evaluate the fine-tuned model on different metrics

In [None]:
from transformers import Trainer

def compute_metrics(p):
    # Get labels from predictions
    preds = p.predictions.argmax(-1)

    # Calculate accuracy
    accuracy = accuracy_score(p.label_ids, preds)

    # Calculate precision, recall, and F1 score
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='binary')

    return {
        'eval_accuracy': accuracy,
        'eval_precision': precision,
        'eval_recall': recall,
        'eval_f1': f1,
    }

trainer = Trainer(
    model=model,
    compute_metrics=compute_metrics,  # Function for calculating metrics
)

# Perform evaluation
results = trainer.evaluate(val_dataset)

# Print the values of different metrics
print("Accuracy:", results["eval_accuracy"])
print("Precision:", results["eval_precision"])
print("Recall:", results["eval_recall"])
print("F1:", results["eval_f1"])


Accuracy: 0.9321818181818182
Precision: 0.9477156120205228
Recall: 0.9606240713224369
F1: 0.9541261837412371


# sentiment dictionaries

sentiment dicitonaries without stemming and lemmatizing the raw text

In [15]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# Get the reviews from the DataFrame
reviews = raw_train_data['review-text-cleaned']

# Initialize NLTK's opinion lexicon
nltk.download('opinion_lexicon')
from nltk.corpus import opinion_lexicon

# Remove duplicate words from the opinion lexicon
unique_words = list(set(opinion_lexicon.words()))

# Create a vectorizer for sentiment word frequency
vectorizer = CountVectorizer(vocabulary=unique_words, binary=True)

# Transform text into sentiment feature vectors
sentiment_features_train = vectorizer.transform(reviews)

x_train = sentiment_features_train

# Prepare labels
y_train = raw_train_data['rating']

# Create and train a logistic regression model
model = LogisticRegression(solver='newton-cg', max_iter=1000)
model.fit(x_train, y_train)

x_val = raw_val_data['review-text-cleaned']
sentiment_features_val = vectorizer.transform(x_val)
y_val = raw_val_data['rating']

# Predict on the validation set
y_pred = model.predict(sentiment_features_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print("Accuracy:", accuracy)

# Calculate F1 score
f1 = f1_score(y_val, y_pred)
print("F1 Score:", f1)


[nltk_data] Downloading package opinion_lexicon to /root/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


Accuracy: 0.8816363636363637
F1 Score: 0.9220639291272598


sentiment dicitonaries after stemming and lemmatizing the raw text

In [16]:
#clean_train_data['review-text-cleaned'] = clean_and_preprocess_text(raw_train_data['review-text-cleaned'])
#clean_val_data['review-text-cleaned'] = clean_and_preprocess_text(raw_val_data['review-text-cleaned'])
reviews = clean_train_data['review-text-cleaned']

# Initialize NLTK's opinion lexicon
nltk.download('opinion_lexicon')
from nltk.corpus import opinion_lexicon

# Remove duplicate words from the opinion lexicon
unique_words = list(set(opinion_lexicon.words()))

# Create a vectorizer for sentiment word frequency
vectorizer = CountVectorizer(vocabulary=unique_words, binary=True)

# Transform text into sentiment feature vectors
sentiment_features_train = vectorizer.transform(reviews)

x_train = sentiment_features_train

# Prepare labels
y_train = clean_train_data['rating']

# Create and train a logistic regression model
model = LogisticRegression(solver='newton-cg', max_iter=1000)
model.fit(x_train, y_train)

x_val = clean_val_data['review-text-cleaned']
sentiment_features_val = vectorizer.transform(x_val)
y_val = clean_val_data['rating']

# Predict and evaluate on the validation set
y_pred = model.predict(sentiment_features_val)
accuracy = accuracy_score(y_val, y_pred)
print("Accuracy:", accuracy)

# Calculate F1 score
f1 = f1_score(y_val, y_pred)
print("F1 Score:", f1)

[nltk_data] Downloading package opinion_lexicon to /root/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


Accuracy: 0.8443636363636363
F1 Score: 0.8994360902255639
