In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import copy
import math
from collections import defaultdict, Counter

In [7]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB,CategoricalNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import sklearn
# Print the installed scikit-learn version alongside the notebook output.
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 1.2.2.


In [4]:
import warnings

# Suppress every FutureWarning raised after this cell runs. The filter is
# installed process-wide, so it also hides FutureWarnings from imported libraries.
warnings.simplefilter(action='ignore', category=FutureWarning)

## Reading and Pre-processing


In [5]:
import pandas as pd

def preprocess(fileName1, fileName2):
    """Load a metadata CSV and a feature CSV and join them row by row.

    Parameters
    ----------
    fileName1 : str or path-like
        CSV that must contain the columns 'dr-id-adjusted' and 'rating'.
    fileName2 : str or path-like
        CSV of feature columns. Its first column is discarded
        (presumably a saved row index -- confirm against the files).

    Returns
    -------
    merged_data : pandas.DataFrame
        'dr-id-adjusted', 'rating', then every remaining column of fileName2.
    features : pandas.DataFrame
        ``merged_data`` without the 'rating' column (so 'dr-id-adjusted'
        is still part of the features).
    target : pandas.Series
        The 'rating' column of ``merged_data``.

    Raises
    ------
    ValueError
        If the two files do not have the same number of rows.
    """
    meta_frame = pd.read_csv(fileName1)
    vector_frame = pd.read_csv(fileName2)

    # Discard the leading column of the feature file.
    leading_column = vector_frame.columns[0]
    vector_frame = vector_frame.drop(columns=[leading_column])

    # The join below is purely positional, so the row counts have to agree.
    if meta_frame.shape[0] != vector_frame.shape[0]:
        raise ValueError("The number of rows in the two datasets do not match!")

    # Side-by-side join: id + label from the metadata, then the feature columns.
    kept_columns = meta_frame[['dr-id-adjusted', 'rating']]
    merged_data = pd.concat([kept_columns, vector_frame], axis=1)

    # Everything except the label is handed back as a feature.
    target = merged_data['rating']
    features = merged_data.drop('rating', axis=1)

    return merged_data, features, target


In [6]:
## Load the word-embedding features (384EMBEDDINGS_* files) for both splits.
# NOTE(review): the paths are absolute and machine-specific; the notebook
# only runs where this exact directory exists.
dataset_train, features_train, target_train = preprocess(
    "D:/unimelb-3rd/IML/ASS3/dataset/TRAIN.csv",
    "D:/unimelb-3rd/IML/ASS3/dataset/384EMBEDDINGS_TRAIN.csv",
)
dataset_val, features_val, target_val = preprocess(
    "D:/unimelb-3rd/IML/ASS3/dataset/VALIDATION.csv",
    "D:/unimelb-3rd/IML/ASS3/dataset/384EMBEDDINGS_VALIDATION.csv",
)

# Row counts of the training and validation frames.
print(dataset_train.shape[0])
print(dataset_val.shape[0])

# Rich display of the validation frame as the cell output.
dataset_val

43003
5500


Unnamed: 0,dr-id-adjusted,rating,0,1,2,3,4,5,6,7,...,374,375,376,377,378,379,380,381,382,383
0,33620,1,-0.022207,0.064185,0.023957,0.021580,-0.063474,-0.060289,0.057354,0.066238,...,-0.033674,-0.055816,0.013047,-0.035806,-0.016576,0.040326,0.005111,-0.029757,-0.063486,0.000462
1,33620,-1,-0.047829,0.039449,0.025721,0.024461,0.013234,-0.007365,-0.025881,-0.007678,...,0.088275,-0.104800,-0.039734,-0.038932,-0.067038,-0.025953,-0.077584,0.018969,-0.091612,-0.016109
2,33626,1,-0.015018,-0.004742,-0.015077,0.026958,-0.061960,0.000557,0.038323,0.099361,...,0.025605,0.056154,0.024323,-0.017221,-0.075064,0.007564,-0.072174,-0.020699,-0.057820,0.039419
3,33626,1,-0.014154,-0.015275,0.032033,0.045189,-0.076433,-0.001758,-0.019410,0.068679,...,-0.009228,0.029588,-0.022354,-0.013990,-0.082329,0.040468,-0.015963,-0.068362,-0.050644,0.096260
4,33628,1,-0.069949,-0.012617,0.035879,-0.041826,-0.076924,-0.057929,0.026214,-0.029924,...,-0.021068,-0.038854,0.042229,-0.022856,0.026084,0.134875,0.022401,-0.051483,-0.014984,0.006698
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5495,38063,1,0.006699,0.033123,0.003509,0.063899,0.020196,-0.076222,-0.052318,0.042699,...,0.000500,0.067840,0.030330,-0.008184,-0.027531,-0.051565,-0.057745,-0.026203,-0.160159,-0.000017
5496,38064,1,-0.027958,-0.021277,0.022908,0.005755,-0.139231,-0.047026,-0.070944,0.033015,...,0.024188,0.012772,0.103551,-0.042511,-0.064135,0.052720,-0.005511,0.015123,-0.008484,0.012005
5497,38065,1,-0.052219,-0.011069,0.007917,0.008442,-0.076079,-0.029523,-0.030472,0.104013,...,0.018229,0.026702,-0.071201,-0.036976,-0.092275,0.027811,0.011771,0.012291,0.025588,0.010331
5498,38065,1,-0.000394,-0.083509,0.002389,0.039134,-0.119880,-0.060715,-0.020452,0.030014,...,0.017917,0.024247,-0.003603,-0.046223,-0.025451,0.105774,-0.067439,-0.058233,0.051917,0.006985


#### Baseline methods


In [7]:
## You can define your helper functions for One-R or other baselines in this block
## for One-R at training time, you can break the ties randomly
## for One-R at prediction time, if the test contains an unseen feature value, return the majority class
# One-R Classifier
import random
def one_r_classifier(train_x, train_y):
    """Train a One-R classifier and return its prediction function.

    For every attribute (column of ``train_x``) a rule table is built that
    maps each observed attribute value to the most frequent class among the
    training rows carrying that value. The attribute whose rule table makes
    the fewest training errors is kept; on equal error the earlier column wins.

    Parameters
    ----------
    train_x : pandas.DataFrame
        Training attributes, one column per attribute.
    train_y : pandas.Series
        Class labels, sharing the index of ``train_x``.

    Returns
    -------
    predict : callable
        ``predict(x)`` takes one instance that supports ``x[attribute]``
        (a row Series or a dict) and returns the class given by the chosen
        rule table. Unseen or missing attribute values, and the case where
        ``train_x`` has no columns, yield the training majority class.
    best_attribute : hashable or None
        Name of the selected column, or ``None`` if ``train_x`` has no columns.

    Notes
    -----
    Ties between classes for one attribute value are broken uniformly at
    random among *all* tied classes, using the module-level ``random`` state.
    """
    majority_class = train_y.mode()[0]
    best_error = float('inf')
    best_rules = {}
    best_attribute = None

    for attribute in train_x.columns:
        column = train_x[attribute]
        rules = {}

        # Rows with a missing attribute value can never match a rule, so the
        # predictor gives them the majority class; score them the same way.
        missing = column.isna()
        error_count = int((train_y[missing] != majority_class).sum())

        # One grouping pass instead of a full-column scan per distinct value.
        # groupby skips missing keys; sort=False keeps first-seen order and
        # avoids having to order mixed-type values.
        for value, classes in train_y.groupby(column, sort=False):
            class_counts = classes.value_counts()  # sorted, most frequent first
            if class_counts.empty:
                # Every label in this group is missing: no rule can be learnt.
                error_count += len(classes)
                continue

            top_count = class_counts.iloc[0]
            tied_classes = class_counts[class_counts == top_count].index
            if len(tied_classes) > 1:
                # Break the tie among every class sharing the top count.
                chosen_class = random.choice(list(tied_classes))
            else:
                chosen_class = tied_classes[0]

            rules[value] = chosen_class
            error_count += len(classes) - int(top_count)

        # Strict '<' keeps the earliest attribute when errors are equal.
        if error_count < best_error:
            best_error = error_count
            best_rules = rules
            best_attribute = attribute

    def predict(x):
        """Classify one instance with the selected rule table."""
        if best_attribute is None:
            return majority_class
        return best_rules.get(x[best_attribute], majority_class)

    return predict, best_attribute


In [9]:
def baselines(dataset_train, dataset_val, random_state=42):
    """Print the validation accuracy of the Zero-R and weighted-random baselines.

    Both baselines are scikit-learn ``DummyClassifier`` models fitted on the
    training frame and scored on the validation frame.

    Parameters
    ----------
    dataset_train : pandas.DataFrame
        Training frame; must contain a 'rating' column, used as the target.
    dataset_val : pandas.DataFrame
        Validation frame; must contain a 'rating' column as well.
    random_state : int or None, default 42
        Seed for the weighted-random ('stratified') baseline so the printed
        accuracy is the same on every run. Pass ``None`` to get the earlier
        unseeded behaviour.

    Returns
    -------
    None
        The two accuracies are printed, rounded to two decimals.
    """
    train_x, train_y = dataset_train.drop(columns=['rating']), dataset_train['rating']
    val_x, val_y = dataset_val.drop(columns=['rating']), dataset_val['rating']

    # Zero-R: always predict the most frequent training class.
    zero_r = DummyClassifier(strategy='most_frequent')
    zero_r.fit(train_x, train_y)
    zero_r_accuracy = accuracy_score(val_y, zero_r.predict(val_x))

    # Weighted random: sample predictions from the training class distribution.
    weighted_random = DummyClassifier(strategy='stratified', random_state=random_state)
    weighted_random.fit(train_x, train_y)
    weighted_random_accuracy = accuracy_score(val_y, weighted_random.predict(val_x))

    print("Accuracy of ZeroR:", np.round(zero_r_accuracy, 2))
    print("Accuracy of Weighted Random:", np.round(weighted_random_accuracy, 2))


# Print the Zero-R and weighted-random validation accuracies for the frames currently loaded.
baselines(dataset_train,dataset_val)


Accuracy of ZeroR: 0.73
Accuracy of Weighted Random: 0.61


In [11]:


# Separate the 'rating' label from the remaining columns of each split.
# NOTE(review): only 'rating' is dropped, so 'dr-id-adjusted' is fed to the
# model as a feature -- confirm that this is intended.
train_x = dataset_train.drop(columns=['rating'])
train_y = dataset_train['rating']
val_x = dataset_val.drop(columns=['rating'])
val_y = dataset_val['rating']

# 100-tree random forest with a fixed seed; fit() returns the fitted estimator.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(train_x, train_y)

# Score the forest on the validation split.
y_pred = rf_model.predict(val_x)
accuracy = accuracy_score(val_y, y_pred)

print("Accuracy:", accuracy)


Accuracy: 0.8949090909090909


Now repeat the baselines and the RandomForest using the TF-IDF features instead of the word embeddings.

In [6]:
## Load the TF-IDF features (TFIDF_* files) for both splits.
# NOTE(review): this rebinds dataset_train / dataset_val (and the features_* /
# target_* names), replacing the frames loaded by the earlier embedding cell.
# The paths are absolute and machine-specific.
dataset_train, features_train, target_train = preprocess(
    "D:/unimelb-3rd/IML/ASS3/dataset/TRAIN.csv",
    "D:/unimelb-3rd/IML/ASS3/dataset/TFIDF_TRAIN.csv",
)
dataset_val, features_val, target_val = preprocess(
    "D:/unimelb-3rd/IML/ASS3/dataset/VALIDATION.csv",
    "D:/unimelb-3rd/IML/ASS3/dataset/TFIDF_VALIDATION.csv",
)

# Row counts of the training and validation frames.
print(dataset_train.shape[0])
print(dataset_val.shape[0])

# Rich display of the validation frame as the cell output.
dataset_val

43003
5500


Unnamed: 0,dr-id-adjusted,rating,10,100,15,20,30,63,able,absolutely,...,worse,worst,worth,wouldn,wrong,year,years,yes,young,yrs
0,33620,1,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.164930,0.0,0.0,0.0
1,33620,-1,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
2,33626,1,0.161179,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.106070,0.0,0.0,0.0
3,33626,1,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.191319,0.0,0.0,0.0
4,33628,1,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5495,38063,1,0.000000,0.0,0.0,0.0,0.0,0.225299,0.223975,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
5496,38064,1,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
5497,38065,1,0.302670,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.199183,0.0,0.0,0.0
5498,38065,1,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


In [9]:
# NOTE(review): `baselines` is also defined in an earlier cell; once this cell
# runs, this later definition is the one in effect. Prefer keeping a single
# definition and only calling it here.
def baselines(dataset_train, dataset_val, random_state=42):
    """Print the validation accuracy of the Zero-R and weighted-random baselines.

    Both baselines are scikit-learn ``DummyClassifier`` models fitted on the
    training frame and scored on the validation frame.

    Parameters
    ----------
    dataset_train : pandas.DataFrame
        Training frame; must contain a 'rating' column, used as the target.
    dataset_val : pandas.DataFrame
        Validation frame; must contain a 'rating' column as well.
    random_state : int or None, default 42
        Seed for the weighted-random ('stratified') baseline so the printed
        accuracy is the same on every run. Pass ``None`` to get the earlier
        unseeded behaviour.

    Returns
    -------
    None
        The two accuracies are printed, rounded to two decimals.
    """
    train_x, train_y = dataset_train.drop(columns=['rating']), dataset_train['rating']
    val_x, val_y = dataset_val.drop(columns=['rating']), dataset_val['rating']

    # Zero-R: always predict the most frequent training class.
    zero_r = DummyClassifier(strategy='most_frequent')
    zero_r.fit(train_x, train_y)
    zero_r_accuracy = accuracy_score(val_y, zero_r.predict(val_x))

    # Weighted random: sample predictions from the training class distribution.
    weighted_random = DummyClassifier(strategy='stratified', random_state=random_state)
    weighted_random.fit(train_x, train_y)
    weighted_random_accuracy = accuracy_score(val_y, weighted_random.predict(val_x))

    print("Accuracy of ZeroR:", np.round(zero_r_accuracy, 2))
    print("Accuracy of Weighted Random:", np.round(weighted_random_accuracy, 2))


# Print the Zero-R and weighted-random validation accuracies for the frames currently loaded.
baselines(dataset_train,dataset_val)

Accuracy of ZeroR: 0.73
Accuracy of Weighted Random: 0.6


In [8]:
# Separate the 'rating' label from the remaining columns of each split.
# NOTE(review): only 'rating' is dropped, so 'dr-id-adjusted' is fed to the
# model as a feature -- confirm that this is intended.
train_x = dataset_train.drop(columns=['rating'])
train_y = dataset_train['rating']
val_x = dataset_val.drop(columns=['rating'])
val_y = dataset_val['rating']

# 100-tree random forest with a fixed seed; fit() returns the fitted estimator.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(train_x, train_y)

# Score the forest on the validation split.
y_pred = rf_model.predict(val_x)
accuracy = accuracy_score(val_y, y_pred)

print("Accuracy:", accuracy)

Accuracy: 0.8903636363636364
