In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import copy
import math
from collections import defaultdict, Counter

In [2]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB,CategoricalNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import sklearn
from transformers import TrainingArguments, Trainer
import torch
from torch.utils.data import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
print('The scikit-learn version is {}.'.format(sklearn.__version__))



The scikit-learn version is 1.2.2.


In [3]:
import warnings

# ignore future warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Reading and Pre-processing


In [48]:
import pandas as pd

def preprocess(fileName1, fileName2):
    ## read the csv files
    data1 = pd.read_csv(fileName1)
    data2 = pd.read_csv(fileName2)

    # Drop the first column of data2
    data2 = data2.drop(data2.columns[0], axis=1)

    # Check if the number of rows in both datasets are the same
    if len(data1) != len(data2):
        raise ValueError("The number of rows in the two datasets do not match!")

    # Convert -1 to 0 in the 'rating' column of data1
    data1['rating'] = data1['rating'].replace(-1, 0)

    # Concatenate data2 (word embeddings) with 'rating' and 'dr-id-adjusted' columns from data1
    merged_data = pd.concat([data1[['dr-id-adjusted', 'rating']], data2], axis=1)

    # Splitting the dataset into features and target
    # Assuming all other columns except 'rating' in merged_data are features
    features = merged_data.drop(columns=['rating'])
    target = merged_data['rating']
    
    return merged_data, features, target


In [49]:
## read the data of word embedding
dataset_train, features_train,target_train = preprocess("D:/unimelb-3rd/IML/ASS3/dataset/TRAIN.csv","D:/unimelb-3rd/IML/ASS3/dataset/384EMBEDDINGS_TRAIN.csv")
dataset_val, features_val, target_val = preprocess("D:/unimelb-3rd/IML/ASS3/dataset/VALIDATION.csv","D:/unimelb-3rd/IML/ASS3/dataset/384EMBEDDINGS_VALIDATION.csv")
print(len(dataset_train))
print(len(dataset_val))
dataset_val

43003
5500


Unnamed: 0,dr-id-adjusted,rating,0,1,2,3,4,5,6,7,...,374,375,376,377,378,379,380,381,382,383
0,33620,1,-0.022207,0.064185,0.023957,0.021580,-0.063474,-0.060289,0.057354,0.066238,...,-0.033674,-0.055816,0.013047,-0.035806,-0.016576,0.040326,0.005111,-0.029757,-0.063486,0.000462
1,33620,0,-0.047829,0.039449,0.025721,0.024461,0.013234,-0.007365,-0.025881,-0.007678,...,0.088275,-0.104800,-0.039734,-0.038932,-0.067038,-0.025953,-0.077584,0.018969,-0.091612,-0.016109
2,33626,1,-0.015018,-0.004742,-0.015077,0.026958,-0.061960,0.000557,0.038323,0.099361,...,0.025605,0.056154,0.024323,-0.017221,-0.075064,0.007564,-0.072174,-0.020699,-0.057820,0.039419
3,33626,1,-0.014154,-0.015275,0.032033,0.045189,-0.076433,-0.001758,-0.019410,0.068679,...,-0.009228,0.029588,-0.022354,-0.013990,-0.082329,0.040468,-0.015963,-0.068362,-0.050644,0.096260
4,33628,1,-0.069949,-0.012617,0.035879,-0.041826,-0.076924,-0.057929,0.026214,-0.029924,...,-0.021068,-0.038854,0.042229,-0.022856,0.026084,0.134875,0.022401,-0.051483,-0.014984,0.006698
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5495,38063,1,0.006699,0.033123,0.003509,0.063899,0.020196,-0.076222,-0.052318,0.042699,...,0.000500,0.067840,0.030330,-0.008184,-0.027531,-0.051565,-0.057745,-0.026203,-0.160159,-0.000017
5496,38064,1,-0.027958,-0.021277,0.022908,0.005755,-0.139231,-0.047026,-0.070944,0.033015,...,0.024188,0.012772,0.103551,-0.042511,-0.064135,0.052720,-0.005511,0.015123,-0.008484,0.012005
5497,38065,1,-0.052219,-0.011069,0.007917,0.008442,-0.076079,-0.029523,-0.030472,0.104013,...,0.018229,0.026702,-0.071201,-0.036976,-0.092275,0.027811,0.011771,0.012291,0.025588,0.010331
5498,38065,1,-0.000394,-0.083509,0.002389,0.039134,-0.119880,-0.060715,-0.020452,0.030014,...,0.017917,0.024247,-0.003603,-0.046223,-0.025451,0.105774,-0.067439,-0.058233,0.051917,0.006985


#### Baseline methods


In [6]:
def baselines(dataset_train,dataset_val):

    ZeroR_Acc_1 = []
    WRand_Acc_1 = []

    ## your code here
    report=[]
    train_x, train_y = dataset_train.drop(['rating'],axis = 1), dataset_train['rating']
    val_x, val_y = dataset_val.drop(['rating'], axis=1), dataset_val['rating']

      # Train and test Zero-R
    zero_r = DummyClassifier(strategy='most_frequent')
    zero_r.fit(train_x, train_y)
    zero_r_predictions = zero_r.predict(val_x)
    zero_r_accuracy = sum(1 for pred, true in zip(zero_r_predictions, val_y) if pred == true) / len(val_y)
    report_zero = classification_report(val_y,zero_r_predictions,zero_division=0)
    report.append(report_zero)
    ZeroR_Acc_1.append(zero_r_accuracy)

      # Train and test Weighted Random
    weighted_random = DummyClassifier(strategy='stratified')
    weighted_random.fit(train_x, train_y)
    weighted_random_predictions = weighted_random.predict(val_x)
    report_w = classification_report(val_y, weighted_random_predictions,zero_division=0)
    report.append(report_w)
    weighted_random_accuracy = sum(1 for pred, true in zip(weighted_random_predictions, val_y) if pred == true) / len(val_y)
    WRand_Acc_1.append(weighted_random_accuracy)


    print("Accuracy of ZeroR:", np.mean(ZeroR_Acc_1).round(2))
    print("Accuracy of Weighted Random:", np.mean(WRand_Acc_1).round(2))


baselines(dataset_train,dataset_val)


Accuracy of ZeroR: 0.73
Accuracy of Weighted Random: 0.61


Random Forest Model

In [7]:
train_x, train_y = dataset_train.drop(['rating','dr-id-adjusted'],axis = 1), dataset_train['rating']
val_x, val_y = dataset_val.drop(['rating','dr-id-adjusted'], axis=1), dataset_val['rating']

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

rf_model.fit(train_x, train_y)

y_pred = rf_model.predict(val_x)

accuracy = accuracy_score(val_y, y_pred)

print("Accuracy:", accuracy)


Accuracy: 0.8927272727272727


In [8]:
train_x.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,0.016249,0.041551,-0.000219,-0.004635,-0.119714,-0.061171,0.056912,0.003796,0.025208,-0.002971,...,-0.018835,-0.011457,0.025458,-0.013316,-0.108422,-0.030389,0.041146,-0.044184,0.016079,0.05153
1,0.01766,-0.05878,-0.039708,0.029005,-0.093917,-0.017276,-0.017363,0.068082,0.001246,-0.075523,...,-0.025679,0.0567,-0.007257,-0.039331,-0.045309,0.048402,-0.044417,-0.034528,0.026516,0.07266
2,0.05424,0.024244,0.003913,0.010369,-0.145461,-0.031338,0.10276,0.023458,-0.037534,0.021414,...,0.041787,-0.001077,0.071329,0.016939,-0.084727,-0.009989,0.073522,-0.007146,0.023193,0.070061
3,0.056422,-0.033869,-0.01758,-0.023686,-0.076519,-0.050073,0.01125,0.05372,-0.02507,-0.0643,...,-0.02996,0.029225,0.008371,-0.033793,-0.091898,-0.004014,-0.027794,-0.035759,-0.043717,0.041542
4,0.03649,-0.019932,-0.011038,0.02358,-0.046569,0.0071,-0.019931,0.03266,-0.007583,-0.089468,...,-0.025044,0.061533,-0.014741,-0.065765,-0.053074,-0.003246,-0.061715,-0.018613,-0.003515,0.08235


In [53]:
#prepare the data
train_x, train_y = dataset_train.drop(['rating','dr-id-adjusted'],axis = 1), dataset_train['rating']
val_x, val_y = dataset_val.drop(['rating','dr-id-adjusted'], axis=1), dataset_val['rating']

Simple neuro network

In [31]:

#construct the network
model = keras.Sequential()
model.add(layers.Input(shape=(384,))) #input layer
#model.add(layers.Dense(128, activation='relu'))  # hidden layer 1
#model.add(layers.Dense(64, activation='relu'))   
model.add(layers.Dense(32, activation='relu'))
#model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))  # output layer


# set the learning rate
custom_optimizer = Adam(learning_rate=0.001)  

#compile the model
model.compile(optimizer=custom_optimizer, loss='binary_crossentropy', metrics=['accuracy'])


# training
model.fit(train_x, train_y, epochs=20, batch_size=32, validation_data=(val_x, val_y))

# evaluate
accuracy = model.evaluate(val_x, val_y)[1]
print("Accuracy:", accuracy)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Accuracy: 0.921999990940094


In [51]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# 3. 初始化 SVM 模型
svm_model = SVC(kernel="linear")  # 你可以选择不同的核函数，这里选择线性核

# 4. 训练 SVM 模型
svm_model.fit(train_x, train_y)

ValueError: Classification metrics can't handle a mix of continuous-multioutput and binary targets

In [56]:
predicted_labels = svm_model.predict(val_x)

# 然后计算准确率
accuracy = accuracy_score(val_y, predicted_labels)
accuracy

0.9223636363636364

Now try RandomForest with TFIDF

In [10]:
## read the data of TFIDF
dataset_train, features_train,target_train = preprocess("D:/unimelb-3rd/IML/ASS3/dataset/TRAIN.csv","D:/unimelb-3rd/IML/ASS3/dataset/TFIDF_TRAIN.csv")
dataset_val, features_val, target_val = preprocess("D:/unimelb-3rd/IML/ASS3/dataset/VALIDATION.csv","D:/unimelb-3rd/IML/ASS3/dataset/TFIDF_VALIDATION.csv")
print(len(dataset_train))
print(len(dataset_val))
dataset_val

43003
5500


Unnamed: 0,dr-id-adjusted,rating,10,100,15,20,30,63,able,absolutely,...,worse,worst,worth,wouldn,wrong,year,years,yes,young,yrs
0,33620,1,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.164930,0.0,0.0,0.0
1,33620,0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
2,33626,1,0.161179,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.106070,0.0,0.0,0.0
3,33626,1,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.191319,0.0,0.0,0.0
4,33628,1,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5495,38063,1,0.000000,0.0,0.0,0.0,0.0,0.225299,0.223975,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
5496,38064,1,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
5497,38065,1,0.302670,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.199183,0.0,0.0,0.0
5498,38065,1,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


In [11]:
def baselines(dataset_train,dataset_val):

    ZeroR_Acc_1 = []
    WRand_Acc_1 = []

    ## your code here
    report=[]
    train_x, train_y = dataset_train.drop(['rating'],axis = 1), dataset_train['rating']
    val_x, val_y = dataset_val.drop(['rating'], axis=1), dataset_val['rating']

      # Train and test Zero-R
    zero_r = DummyClassifier(strategy='most_frequent')
    zero_r.fit(train_x, train_y)
    zero_r_predictions = zero_r.predict(val_x)
    zero_r_accuracy = sum(1 for pred, true in zip(zero_r_predictions, val_y) if pred == true) / len(val_y)
    report_zero = classification_report(val_y,zero_r_predictions,zero_division=0)
    report.append(report_zero)
    ZeroR_Acc_1.append(zero_r_accuracy)

      # Train and test Weighted Random
    weighted_random = DummyClassifier(strategy='stratified')
    weighted_random.fit(train_x, train_y)
    weighted_random_predictions = weighted_random.predict(val_x)
    report_w = classification_report(val_y, weighted_random_predictions,zero_division=0)
    report.append(report_w)
    weighted_random_accuracy = sum(1 for pred, true in zip(weighted_random_predictions, val_y) if pred == true) / len(val_y)
    WRand_Acc_1.append(weighted_random_accuracy)


    print("Accuracy of ZeroR:", np.mean(ZeroR_Acc_1).round(2))
    print("Accuracy of Weighted Random:", np.mean(WRand_Acc_1).round(2))


baselines(dataset_train,dataset_val)

Accuracy of ZeroR: 0.73
Accuracy of Weighted Random: 0.6


Random Forest Model

In [12]:
train_x, train_y = dataset_train.drop(['rating'],axis = 1), dataset_train['rating']
val_x, val_y = dataset_val.drop(['rating'], axis=1), dataset_val['rating']

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

rf_model.fit(train_x, train_y)

y_pred = rf_model.predict(val_x)

accuracy = accuracy_score(val_y, y_pred)

print("Accuracy:", accuracy)

Accuracy: 0.8903636363636364


In [13]:
train_x.head()

Unnamed: 0,dr-id-adjusted,10,100,15,20,30,63,able,absolutely,actually,...,worse,worst,worth,wouldn,wrong,year,years,yes,young,yrs
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.361055,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.208039,0.0,0.0,0.0
2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.327285,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.231648,0.0,0.0,0.0


Simple neuro network

In [14]:

#prepare the data
train_x, train_y = dataset_train.drop(['rating','dr-id-adjusted'],axis = 1), dataset_train['rating']
val_x, val_y = dataset_val.drop(['rating','dr-id-adjusted'], axis=1), dataset_val['rating']

#construct the network
model = keras.Sequential()
model.add(layers.Input(shape=(500,))) #input layer
model.add(layers.Dense(128, activation='relu'))  # hidden layer 1
#model.add(layers.Dense(64, activation='relu'))   
#model.add(layers.Dense(32, activation='relu'))
#model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))  # output layer


# set the learning rate
custom_optimizer = Adam(learning_rate=0.001)  

#compile the model
model.compile(optimizer=custom_optimizer, loss='binary_crossentropy', metrics=['accuracy'])


# training
model.fit(train_x, train_y, epochs=10, batch_size=32, validation_data=(val_x, val_y))

# evaluate
accuracy = model.evaluate(val_x, val_y)[1]
print("Accuracy:", accuracy)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.9070909023284912


read in the test dataset for kaggle

In [59]:
test_data = pd.read_csv("D:/unimelb-3rd/IML/ASS3/dataset/384EMBEDDINGS_TEST.csv")
#test_data = pd.read_csv("D:/unimelb-3rd/IML/ASS3/dataset/TFIDF_TEST.csv")
test_data.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,374,375,376,377,378,379,380,381,382,383
0,48503,-0.022147,-0.022475,0.086003,0.007822,0.012784,-0.000159,0.011789,0.021219,-0.058783,...,-0.065846,0.021954,0.063496,-0.023876,-0.080697,0.025039,0.03598,-0.004304,-0.091479,0.032933
1,48504,0.017011,0.08469,0.031479,-0.018542,-0.136334,-0.040547,0.024581,-0.001718,0.031045,...,-0.022533,0.003009,0.063849,-0.029857,-0.009398,0.083773,0.048096,-0.021208,0.018363,-0.004735
2,48505,-0.019075,0.014318,-0.0547,-0.013129,-0.138464,-0.054037,-0.006527,0.07305,-0.024574,...,-0.012777,0.025411,-0.040082,-0.02808,-0.058692,0.009434,0.103693,-0.076458,-0.092033,-0.020267
3,48506,-0.014013,-0.022374,-0.02047,-0.037221,-0.161702,-0.060834,0.020752,0.023565,-0.039837,...,-0.052155,0.076453,-0.034619,-0.002245,-0.035704,0.010705,0.029748,-0.034872,-0.07028,0.032426
4,48507,-0.021505,0.092344,0.000482,-0.024316,-0.098049,-0.08382,0.015389,-0.033406,-0.018172,...,0.035989,0.000115,0.041575,-0.082211,-0.083002,0.126218,0.010888,-0.027627,-0.060814,0.019299


In [60]:
test_x = test_data.drop(test_data.columns[0], axis=1)
predictions = svm_model.predict(test_x)
threshold = 0.5
predicted_labels = (predictions > threshold).astype(int)
predicted_labels[predicted_labels == 0] = -1
predicted_labels = predicted_labels.flatten()
#predicted_labels
result_df = pd.DataFrame({'id': np.arange(5514), 'rating': predicted_labels})
result_df.to_csv('output.csv', index=False, header=['id', 'rating'])

training with raw text using TinyBERT

In [32]:
raw_train_data = pd.read_csv("D:/unimelb-3rd/IML/ASS3/dataset/TRAIN.csv")
raw_train_data = raw_train_data.drop(["Unnamed: 0","dr-id-adjusted","dr_id_gender"], axis=1)
raw_val_data = pd.read_csv("D:/unimelb-3rd/IML/ASS3/dataset/VALIDATION.csv")
raw_val_data = raw_val_data.drop(["Unnamed: 0","dr-id-adjusted","dr_id_gender"], axis=1)
raw_train_data['rating'] = raw_train_data['rating'].replace(-1, 0)
raw_val_data['rating'] = raw_val_data['rating'].replace(-1, 0)
raw_train_data['review-text-cleaned'][5149]

' '

In [5]:
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item


In [36]:
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from spellchecker import SpellChecker

def clean_and_preprocess_text(raw_text_column):
    # 初始化拼写检查器
    spell = SpellChecker()

    # 初始化停用词列表
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

    # 初始化词干提取器
    stemmer = PorterStemmer()

    # 初始化词形还原器
    nltk.download('wordnet')
    lemmatizer = WordNetLemmatizer()

    # 定义文本清洗和预处理函数
    def preprocess_text(text):
        if text is None or pd.isnull(text):  # 检查缺失值
            return ""  # 如果文本为空或缺失，返回空字符串
        elif text.strip() == "":  # 检查是否只包含空格
            return ""  # 如果只包含空格，返回空字符串

        # 1. 去除标点符号
        text = text.translate(str.maketrans('', '', string.punctuation))

        # 2. 拆分文本成单词列表
        words = text.split()

        # 3. 文本标准化（转为小写）
        text = text.lower()

        # 4. 去除停用词
        words = text.split()
        filtered_words = [word for word in words if word not in stop_words]
        text = ' '.join(filtered_words)

        # 5. 词干提取
        words = text.split()
        stemmed_words = [stemmer.stem(word) for word in words]
        text = ' '.join(stemmed_words)

        # 6. 词形还原
        words = text.split()
        lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
        text = ' '.join(lemmatized_words)

        return text

    # 对DataFrame列中的每个文本应用预处理函数
    preprocessed_text_column = raw_text_column.apply(preprocess_text)

    return preprocessed_text_column


In [37]:
raw_train_data['review-text-cleaned'] =  clean_and_preprocess_text(raw_train_data['review-text-cleaned'])
raw_val_data['review-text-cleaned'] =  clean_and_preprocess_text(raw_val_data['review-text-cleaned'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\94245\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\94245\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\94245\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\94245\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [43]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
import torch

model_name = "prajjwal1/bert-tiny"  # TinyBERT模型名称
num_labels = 2  # 二分类任务，有两个类别

# 初始化模型和分词器，并将它们移到GPU上
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 使用分词器将文本转换为token IDs，并将它们移到GPU上
train_encodings = tokenizer(raw_train_data['review-text-cleaned'].tolist(), truncation=True,padding=True, return_tensors="pt")
val_encodings = tokenizer(raw_val_data['review-text-cleaned'].tolist(), truncation=True, padding=True, return_tensors="pt")

# 创建PyTorch数据集
train_dataset = CustomDataset(train_encodings, raw_train_data['rating'])
val_dataset = CustomDataset(val_encodings, raw_val_data['rating'])

num_epochs = 3
batch_size = 32
training_args = TrainingArguments(
    output_dir="./tinybert_classification",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    evaluation_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Step,Training Loss,Validation Loss
500,0.4112,0.263963
1000,0.2703,0.242272
1500,0.2463,0.249252
2000,0.2307,0.228678
2500,0.2214,0.220849
3000,0.2117,0.221075
3500,0.2052,0.218826
4000,0.2049,0.216949


TrainOutput(global_step=4032, training_loss=0.24986653893239916, metrics={'train_runtime': 2039.8589, 'train_samples_per_second': 63.244, 'train_steps_per_second': 1.977, 'total_flos': 75869746528860.0, 'train_loss': 0.24986653893239916, 'epoch': 3.0})

In [39]:
trainer.save_model("./tinybert_classification")

In [40]:
raw_test_data = pd.read_csv("D:/unimelb-3rd/IML/ASS3/dataset/TEST_NO_LABELS.csv")
raw_test_data = raw_test_data.drop(["Unnamed: 0","dr-id-adjusted","dr_id_gender"], axis=1)
raw_test_data['review-text-cleaned'] =  clean_and_preprocess_text(raw_test_data['review-text-cleaned'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\94245\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\94245\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [44]:
test_encodings = tokenizer(raw_test_data['review-text-cleaned'].tolist(), truncation=True, padding=True, return_tensors="pt")
# 创建一个虚拟的标签列表，长度与测试数据集的样本数相同（虽然这些标签不会被使用，但是需要占位）
dummy_labels = [0] * len(raw_test_data)

# 使用测试数据进行编码
test_encodings = tokenizer(raw_test_data['review-text-cleaned'].tolist(), truncation=True, padding=True, return_tensors="pt")

# 创建测试数据集（虚拟标签将不会被使用）
test_dataset = CustomDataset(test_encodings, dummy_labels)

predictions = trainer.predict(test_dataset)
predicted_labels = predictions.predictions.argmax(axis=1)  # 提取每个样本的预测标签
predicted_labels[predicted_labels == 0] = -1
predicted_labels = predicted_labels.flatten()
result_df = pd.DataFrame({'id': np.arange(5514), 'rating': predicted_labels})
result_df.to_csv('bert2.csv', index=False, header=['id', 'rating'])
