In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import copy
import math
from collections import defaultdict, Counter

In [2]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB,CategoricalNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import sklearn
print('The scikit-learn version is {}.'.format(sklearn.__version__))



The scikit-learn version is 1.2.2.


In [3]:
import warnings

# ignore future warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Reading and Pre-processing


In [39]:
import pandas as pd

def preprocess(fileName1, fileName2):
    ## read the csv files
    data1 = pd.read_csv(fileName1)
    data2 = pd.read_csv(fileName2)

    # Drop the first column of data2
    data2 = data2.drop(data2.columns[0], axis=1)

    # Check if the number of rows in both datasets are the same
    if len(data1) != len(data2):
        raise ValueError("The number of rows in the two datasets do not match!")

    # Convert -1 to 0 in the 'rating' column of data1
    data1['rating'] = data1['rating'].replace(-1, 0)

    # Concatenate data2 (word embeddings) with 'rating' and 'dr-id-adjusted' columns from data1
    merged_data = pd.concat([data1[['dr-id-adjusted', 'rating']], data2], axis=1)

    # Splitting the dataset into features and target
    # Assuming all other columns except 'rating' in merged_data are features
    features = merged_data.drop(columns=['rating'])
    target = merged_data['rating']
    
    return merged_data, features, target


In [40]:
## read the data of word embedding
dataset_train, features_train,target_train = preprocess("D:/unimelb-3rd/IML/ASS3/dataset/TRAIN.csv","D:/unimelb-3rd/IML/ASS3/dataset/384EMBEDDINGS_TRAIN.csv")
dataset_val, features_val, target_val = preprocess("D:/unimelb-3rd/IML/ASS3/dataset/VALIDATION.csv","D:/unimelb-3rd/IML/ASS3/dataset/384EMBEDDINGS_VALIDATION.csv")
print(len(dataset_train))
print(len(dataset_val))
dataset_val

43003
5500


Unnamed: 0,dr-id-adjusted,rating,0,1,2,3,4,5,6,7,...,374,375,376,377,378,379,380,381,382,383
0,33620,1,-0.022207,0.064185,0.023957,0.021580,-0.063474,-0.060289,0.057354,0.066238,...,-0.033674,-0.055816,0.013047,-0.035806,-0.016576,0.040326,0.005111,-0.029757,-0.063486,0.000462
1,33620,0,-0.047829,0.039449,0.025721,0.024461,0.013234,-0.007365,-0.025881,-0.007678,...,0.088275,-0.104800,-0.039734,-0.038932,-0.067038,-0.025953,-0.077584,0.018969,-0.091612,-0.016109
2,33626,1,-0.015018,-0.004742,-0.015077,0.026958,-0.061960,0.000557,0.038323,0.099361,...,0.025605,0.056154,0.024323,-0.017221,-0.075064,0.007564,-0.072174,-0.020699,-0.057820,0.039419
3,33626,1,-0.014154,-0.015275,0.032033,0.045189,-0.076433,-0.001758,-0.019410,0.068679,...,-0.009228,0.029588,-0.022354,-0.013990,-0.082329,0.040468,-0.015963,-0.068362,-0.050644,0.096260
4,33628,1,-0.069949,-0.012617,0.035879,-0.041826,-0.076924,-0.057929,0.026214,-0.029924,...,-0.021068,-0.038854,0.042229,-0.022856,0.026084,0.134875,0.022401,-0.051483,-0.014984,0.006698
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5495,38063,1,0.006699,0.033123,0.003509,0.063899,0.020196,-0.076222,-0.052318,0.042699,...,0.000500,0.067840,0.030330,-0.008184,-0.027531,-0.051565,-0.057745,-0.026203,-0.160159,-0.000017
5496,38064,1,-0.027958,-0.021277,0.022908,0.005755,-0.139231,-0.047026,-0.070944,0.033015,...,0.024188,0.012772,0.103551,-0.042511,-0.064135,0.052720,-0.005511,0.015123,-0.008484,0.012005
5497,38065,1,-0.052219,-0.011069,0.007917,0.008442,-0.076079,-0.029523,-0.030472,0.104013,...,0.018229,0.026702,-0.071201,-0.036976,-0.092275,0.027811,0.011771,0.012291,0.025588,0.010331
5498,38065,1,-0.000394,-0.083509,0.002389,0.039134,-0.119880,-0.060715,-0.020452,0.030014,...,0.017917,0.024247,-0.003603,-0.046223,-0.025451,0.105774,-0.067439,-0.058233,0.051917,0.006985


#### Baseline methods


In [16]:
def baselines(dataset_train,dataset_val):

    ZeroR_Acc_1 = []
    WRand_Acc_1 = []

    ## your code here
    report=[]
    train_x, train_y = dataset_train.drop(['rating'],axis = 1), dataset_train['rating']
    val_x, val_y = dataset_val.drop(['rating'], axis=1), dataset_val['rating']

      # Train and test Zero-R
    zero_r = DummyClassifier(strategy='most_frequent')
    zero_r.fit(train_x, train_y)
    zero_r_predictions = zero_r.predict(val_x)
    zero_r_accuracy = sum(1 for pred, true in zip(zero_r_predictions, val_y) if pred == true) / len(val_y)
    report_zero = classification_report(val_y,zero_r_predictions,zero_division=0)
    report.append(report_zero)
    ZeroR_Acc_1.append(zero_r_accuracy)

      # Train and test Weighted Random
    weighted_random = DummyClassifier(strategy='stratified')
    weighted_random.fit(train_x, train_y)
    weighted_random_predictions = weighted_random.predict(val_x)
    report_w = classification_report(val_y, weighted_random_predictions,zero_division=0)
    report.append(report_w)
    weighted_random_accuracy = sum(1 for pred, true in zip(weighted_random_predictions, val_y) if pred == true) / len(val_y)
    WRand_Acc_1.append(weighted_random_accuracy)


    print("Accuracy of ZeroR:", np.mean(ZeroR_Acc_1).round(2))
    print("Accuracy of Weighted Random:", np.mean(WRand_Acc_1).round(2))


baselines(dataset_train,dataset_val)


Accuracy of ZeroR: 0.73
Accuracy of Weighted Random: 0.6


Random Forest Model

In [53]:
train_x, train_y = dataset_train.drop(['rating','dr-id-adjusted'],axis = 1), dataset_train['rating']
val_x, val_y = dataset_val.drop(['rating','dr-id-adjusted'], axis=1), dataset_val['rating']

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

rf_model.fit(train_x, train_y)

y_pred = rf_model.predict(val_x)

accuracy = accuracy_score(val_y, y_pred)

print("Accuracy:", accuracy)


Accuracy: 0.8927272727272727


In [18]:
train_x.head()

Unnamed: 0,dr-id-adjusted,0,1,2,3,4,5,6,7,8,...,374,375,376,377,378,379,380,381,382,383
0,1,0.016249,0.041551,-0.000219,-0.004635,-0.119714,-0.061171,0.056912,0.003796,0.025208,...,-0.018835,-0.011457,0.025458,-0.013316,-0.108422,-0.030389,0.041146,-0.044184,0.016079,0.05153
1,1,0.01766,-0.05878,-0.039708,0.029005,-0.093917,-0.017276,-0.017363,0.068082,0.001246,...,-0.025679,0.0567,-0.007257,-0.039331,-0.045309,0.048402,-0.044417,-0.034528,0.026516,0.07266
2,1,0.05424,0.024244,0.003913,0.010369,-0.145461,-0.031338,0.10276,0.023458,-0.037534,...,0.041787,-0.001077,0.071329,0.016939,-0.084727,-0.009989,0.073522,-0.007146,0.023193,0.070061
3,1,0.056422,-0.033869,-0.01758,-0.023686,-0.076519,-0.050073,0.01125,0.05372,-0.02507,...,-0.02996,0.029225,0.008371,-0.033793,-0.091898,-0.004014,-0.027794,-0.035759,-0.043717,0.041542
4,1,0.03649,-0.019932,-0.011038,0.02358,-0.046569,0.0071,-0.019931,0.03266,-0.007583,...,-0.025044,0.061533,-0.014741,-0.065765,-0.053074,-0.003246,-0.061715,-0.018613,-0.003515,0.08235


Simple neuro network

In [52]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam

#prepare the data
train_x, train_y = dataset_train.drop(['rating','dr-id-adjusted'],axis = 1), dataset_train['rating']
val_x, val_y = dataset_val.drop(['rating','dr-id-adjusted'], axis=1), dataset_val['rating']

#construct the network
model = keras.Sequential()
model.add(layers.Input(shape=(384,))) #input layer
model.add(layers.Dense(128, activation='relu'))  # hidden layer 1
#model.add(layers.Dense(64, activation='relu'))   
#model.add(layers.Dense(32, activation='relu'))
#model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))  # output layer


# set the learning rate
custom_optimizer = Adam(learning_rate=0.001)  

#compile the model
model.compile(optimizer=custom_optimizer, loss='binary_crossentropy', metrics=['accuracy'])


# training
model.fit(train_x, train_y, epochs=15, batch_size=32, validation_data=(val_x, val_y))

# evaluate
accuracy = model.evaluate(val_x, val_y)[1]
print("Accuracy:", accuracy)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Accuracy: 0.9234545230865479


Now try RandomForest with TFIDF

In [54]:
## read the data of TFIDF
dataset_train, features_train,target_train = preprocess("D:/unimelb-3rd/IML/ASS3/dataset/TRAIN.csv","D:/unimelb-3rd/IML/ASS3/dataset/TFIDF_TRAIN.csv")
dataset_val, features_val, target_val = preprocess("D:/unimelb-3rd/IML/ASS3/dataset/VALIDATION.csv","D:/unimelb-3rd/IML/ASS3/dataset/TFIDF_VALIDATION.csv")
print(len(dataset_train))
print(len(dataset_val))
dataset_val

43003
5500


Unnamed: 0,dr-id-adjusted,rating,10,100,15,20,30,63,able,absolutely,...,worse,worst,worth,wouldn,wrong,year,years,yes,young,yrs
0,33620,1,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.164930,0.0,0.0,0.0
1,33620,0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
2,33626,1,0.161179,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.106070,0.0,0.0,0.0
3,33626,1,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.191319,0.0,0.0,0.0
4,33628,1,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5495,38063,1,0.000000,0.0,0.0,0.0,0.0,0.225299,0.223975,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
5496,38064,1,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
5497,38065,1,0.302670,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.199183,0.0,0.0,0.0
5498,38065,1,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


In [10]:
def baselines(dataset_train,dataset_val):

    ZeroR_Acc_1 = []
    WRand_Acc_1 = []

    ## your code here
    report=[]
    train_x, train_y = dataset_train.drop(['rating'],axis = 1), dataset_train['rating']
    val_x, val_y = dataset_val.drop(['rating'], axis=1), dataset_val['rating']

      # Train and test Zero-R
    zero_r = DummyClassifier(strategy='most_frequent')
    zero_r.fit(train_x, train_y)
    zero_r_predictions = zero_r.predict(val_x)
    zero_r_accuracy = sum(1 for pred, true in zip(zero_r_predictions, val_y) if pred == true) / len(val_y)
    report_zero = classification_report(val_y,zero_r_predictions,zero_division=0)
    report.append(report_zero)
    ZeroR_Acc_1.append(zero_r_accuracy)

      # Train and test Weighted Random
    weighted_random = DummyClassifier(strategy='stratified')
    weighted_random.fit(train_x, train_y)
    weighted_random_predictions = weighted_random.predict(val_x)
    report_w = classification_report(val_y, weighted_random_predictions,zero_division=0)
    report.append(report_w)
    weighted_random_accuracy = sum(1 for pred, true in zip(weighted_random_predictions, val_y) if pred == true) / len(val_y)
    WRand_Acc_1.append(weighted_random_accuracy)


    print("Accuracy of ZeroR:", np.mean(ZeroR_Acc_1).round(2))
    print("Accuracy of Weighted Random:", np.mean(WRand_Acc_1).round(2))


baselines(dataset_train,dataset_val)

Accuracy of ZeroR: 0.73
Accuracy of Weighted Random: 0.6


Random Forest Model

In [56]:
train_x, train_y = dataset_train.drop(['rating'],axis = 1), dataset_train['rating']
val_x, val_y = dataset_val.drop(['rating'], axis=1), dataset_val['rating']

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

rf_model.fit(train_x, train_y)

y_pred = rf_model.predict(val_x)

accuracy = accuracy_score(val_y, y_pred)

print("Accuracy:", accuracy)

Accuracy: 0.8903636363636364


In [57]:
train_x.head()

Unnamed: 0,dr-id-adjusted,10,100,15,20,30,63,able,absolutely,actually,...,worse,worst,worth,wouldn,wrong,year,years,yes,young,yrs
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.361055,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.208039,0.0,0.0,0.0
2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.327285,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.231648,0.0,0.0,0.0


Simple neuro network

In [68]:

#prepare the data
train_x, train_y = dataset_train.drop(['rating','dr-id-adjusted'],axis = 1), dataset_train['rating']
val_x, val_y = dataset_val.drop(['rating','dr-id-adjusted'], axis=1), dataset_val['rating']

#construct the network
model = keras.Sequential()
model.add(layers.Input(shape=(500,))) #input layer
model.add(layers.Dense(128, activation='relu'))  # hidden layer 1
#model.add(layers.Dense(64, activation='relu'))   
#model.add(layers.Dense(32, activation='relu'))
#model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))  # output layer


# set the learning rate
custom_optimizer = Adam(learning_rate=0.001)  

#compile the model
model.compile(optimizer=custom_optimizer, loss='binary_crossentropy', metrics=['accuracy'])


# training
model.fit(train_x, train_y, epochs=10, batch_size=32, validation_data=(val_x, val_y))

# evaluate
accuracy = model.evaluate(val_x, val_y)[1]
print("Accuracy:", accuracy)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.904909074306488
