In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [6]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
# To run the training on TPU, uncomment the following line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
!apt install git-lfs

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.

In [7]:
!git config --global user.email "anhnguye@usc.edu"
!git config --global user.name "Andy Nguyen"

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
import math
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from transformers import AutoModelForMaskedLM, AutoModelForSequenceClassification, AutoTokenizer, DataCollatorForLanguageModeling, TrainingArguments, Trainer, pipeline
import torch
from datasets import Dataset, DatasetDict, ClassLabel

# 1. Domain Adaptation

In [34]:
# filename = 'drive/My Drive/preprocessed_comments.csv'
filename = 'andy.csv'

data = pd.read_csv(filename)
print(f'Total rows: {data.shape[0]}\n')

data = pd.DataFrame(data['comment']) # drop unnecessary cols

data = data.dropna() # dropping Nones
data = data[data['comment'].str.strip() != ""] # dropping ''
print(f'Rows after dropping empty comments: {data.shape[0]}\n')

train, test = train_test_split(data, test_size=0.1, random_state=42)
print(f'Train rows: {train.shape[0]} | Test rows: {test.shape[0]}')

Total rows: 83770

Rows after dropping empty comments: 83769

Train rows: 75392 | Test rows: 8377


In [35]:
model_checkpoint = "distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
print(f'{model.num_parameters()} parameters model')

66985530 parameters model


In [4]:
mask_filler = pipeline("fill-mask", model="distilbert-base-uncased")
preds = mask_filler('One of the most [MASK] classes I have ever taken')

for pred in preds:
    print(pred['sequence'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

one of the most challenging classes i have ever taken
one of the most demanding classes i have ever taken
one of the most difficult classes i have ever taken
one of the most exciting classes i have ever taken
one of the most prestigious classes i have ever taken


In [37]:
# Create Dataset objects
train_dataset = Dataset.from_pandas(train[['comment']])
test_dataset = Dataset.from_pandas(test[['comment']])

# Create DatasetDict
dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset,
})

dataset

DatasetDict({
    train: Dataset({
        features: ['comment', '__index_level_0__'],
        num_rows: 75392
    })
    test: Dataset({
        features: ['comment', '__index_level_0__'],
        num_rows: 8377
    })
})

In [39]:
sample = dataset["train"].shuffle(seed=42).select(range(3))

for row in sample:
    print(f"Comment: {row['comment']}\n")

Comment: dr neufeld is such an amaz teacher incred learn interest nice and help he is the full meal deal of professor he inspir me to includ religi studi in my major now i am in rgla and love everi minut of it just take a cours from him and you will not regret it

Comment: favourit prof at viu by far

Comment: dr hundley conduct the class veri casual which is awesom 2 exam one group project hand of homework and inclass activ and choic of 2 paper it sound like a lot of work but it is realli not she is veri straightforward i honestli wish we spent more time review befor the exam becaus i alway felt realli unprepar



In [40]:
def tokenize_function(examples):
    result = tokenizer(examples['comment'])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_dataset = dataset.map(
    tokenize_function, batched=True, remove_columns=["comment", "__index_level_0__"]
)
tokenized_dataset

Map:   0%|          | 0/75392 [00:00<?, ? examples/s]

Map:   0%|          | 0/8377 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 75392
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 8377
    })
})

In [41]:
chunk_size = 128

In [42]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result


In [43]:
lm_datasets = tokenized_dataset.map(group_texts, batched=True)
lm_datasets

Map:   0%|          | 0/75392 [00:00<?, ? examples/s]

Map:   0%|          | 0/8377 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 27114
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 2985
    })
})

In [44]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [45]:
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

print(f'Masked inputs:')
for chunk in data_collator(samples)["input_ids"]:
    print(f"\n{tokenizer.decode(chunk)}")

Masked inputs:

[CLS] amaz i love everi second of be in her class she seventies you what you need [MASK] know and ample you interest in class she is incred and is a wonder person gabi know how to make the class her [MASK] she is veri good i would suggest [MASK] ani [MASK] she ha to offer [SEP] [CLS] make you attend all hi class for particip [MASK] but [MASK]trem easi if you [MASK] in hi smaller class not the huge [MASK]ctur 40 uppsala class easili [SEP] [CLS] [MASK]us to record her lectur which would be use for those [MASK] [MASK] disabl or a poor [MASK] connect unple

##as to student extrem rude and unfair refus to give ani help remind [MASK] whi i [MASK] pay her salari [SEP] [CLS] use trans battleships [MASK] for hi class when he say aardvark he mean [MASK] when [MASK] say goos he [MASK] [MASK] you can not [MASK] a word thi [MASK] say he test you on materi that he doe not [MASK] to teach and he is off in [MASK] own littl world [MASK] he [MASK] he doe not notic nobo [MASK] [MASK] unde

In [46]:
batch_size = 64
# Show the training loss with every epoch
logging_steps = len(lm_datasets["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    num_train_epochs=1,
    output_dir=f"{model_name}-finetuned-ratemyprof",
    overwrite_output_dir=True,
    eval_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps,
)

In [47]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

  trainer = Trainer(


HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-6745191f-65e50ed6057d42121248da7e;8959e685-ad07-4655-94c9-641bc9e5a6d6)

Invalid username or password.

In [None]:
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Perplexity: 386.85


In [None]:
for i in range(50):
    trainer.train()
    eval_results = trainer.evaluate()
    print(f"Perplexity (after {i+1} epochs): {math.exp(eval_results['eval_loss']):.2f}")

Epoch,Training Loss,Validation Loss
1,4.931,4.528771
2,4.4592,4.290766
3,4.3495,4.274695


Perplexity (after 1 epochs): 73.17


Epoch,Training Loss,Validation Loss
1,4.2506,4.13098
2,4.1514,4.036473
3,4.0921,3.989091


Perplexity (after 2 epochs): 57.09


Epoch,Training Loss,Validation Loss
1,4.0735,4.015939
2,3.9868,3.937064
3,3.9603,3.872381


Perplexity (after 3 epochs): 51.20


In [None]:
trainer.push_to_hub()

events.out.tfevents.1732517606.6591239748dc.134.29:   0%|          | 0.00/6.96k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

events.out.tfevents.1732517704.6591239748dc.134.30:   0%|          | 0.00/359 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/herald-of-spring/distilbert-base-uncased-finetuned-ratemyprof/commit/a0d63b03b249599006151d154dc4f2a14891b512', commit_message='End of training', commit_description='', oid='a0d63b03b249599006151d154dc4f2a14891b512', pr_url=None, pr_revision=None, pr_num=None)

# 2. Fine-Tuning

In [None]:
mask_filler = pipeline("fill-mask", model="herald-of-spring/distilbert-base-uncased-finetuned-ratemyprof")
preds = mask_filler('One of the most [MASK] classes I have ever taken')

for pred in preds:
    print(pred['sequence'])

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


one of the most challenging classes i have ever taken
one of the most amazing classes i have ever taken
one of the most difficult classes i have ever taken
one of the most inspiring classes i have ever taken
one of the most enjoyable classes i have ever taken


In [None]:
model_checkpoint = "herald-of-spring/distilbert-base-uncased-finetuned-ratemyprof"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=1, problem_type='regression')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at herald-of-spring/distilbert-base-uncased-finetuned-ratemyprof and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def tokenize_function(examples):
    result = tokenizer(examples['Comment'], padding=True, truncation=True)
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result

In [None]:
# Create Dataset objects
train_star = Dataset.from_pandas(sup[['Comment', 'Star Rating']])
test_star = Dataset.from_pandas(test[['Comment', 'Star Rating']])

# Create DatasetDict
star_data = DatasetDict({
    'train': train_star,
    'test': test_star,
})

star_data

DatasetDict({
    train: Dataset({
        features: ['Comment', 'Star Rating', '__index_level_0__'],
        num_rows: 6880
    })
    test: Dataset({
        features: ['Comment', 'Star Rating', '__index_level_0__'],
        num_rows: 6880
    })
})

In [None]:
star_data = star_data.cast_column("Star Rating", ClassLabel(names=list(range(1,6))))
star_data['train'].features

Casting the dataset:   0%|          | 0/6880 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6880 [00:00<?, ? examples/s]

{'Comment': Value(dtype='string', id=None),
 'Star Rating': ClassLabel(names=[1, 2, 3, 4, 5], id=None),
 '__index_level_0__': Value(dtype='int64', id=None)}

In [None]:
tokenized_star = dataset.map(
    tokenize_function, batched=True, remove_columns=["Comment", "__index_level_0__"]
)
tokenized_star

Map:   0%|          | 0/6880 [00:00<?, ? examples/s]

Map:   0%|          | 0/6880 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Star Rating', 'input_ids', 'attention_mask', 'word_ids'],
        num_rows: 6880
    })
    test: Dataset({
        features: ['Star Rating', 'input_ids', 'attention_mask', 'word_ids'],
        num_rows: 6880
    })
})

In [None]:
tokenized_star = tokenized_star.rename_column("Star Rating", "labels")
tokenized_star.set_format("torch")
tokenized_star

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask', 'word_ids'],
        num_rows: 6880
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask', 'word_ids'],
        num_rows: 6880
    })
})

In [None]:
batch_size = 64
# Show the training loss with every epoch
logging_steps = len(tokenized_star["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-star",
    overwrite_output_dir=True,
    eval_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps,
)

In [None]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)
    mse = mean_squared_error(labels, predictions)
    return {"mse": mse}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_star["train"],
    eval_dataset=tokenized_star["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
print('Initial loss:', trainer.evaluate()['eval_loss'])

Initial loss: 14.68386173248291


In [None]:
trainer.train()
print(f"Loss: {trainer.evaluate()['eval_loss']:.2f}")

Epoch,Training Loss,Validation Loss,Model Preparation Time,Mse
1,2.7184,0.746526,0.0017,14.908576
2,0.6278,0.662922,0.0017,14.908576
3,0.4895,0.661218,0.0017,14.908576


Loss: 0.66


In [None]:
trainer.push_to_hub()

In [None]:
train_diff = Dataset.from_pandas(sup[['Comment', 'Course Difficulty']])
test_diff = Dataset.from_pandas(test[['Comment', 'Course Difficulty']])

diff_data = DatasetDict({
    'train': train_diff,
    'test': test_diff,
})

diff_data

DatasetDict({
    train: Dataset({
        features: ['Comment', 'Course Difficulty', '__index_level_0__'],
        num_rows: 6880
    })
    test: Dataset({
        features: ['Comment', 'Course Difficulty', '__index_level_0__'],
        num_rows: 6880
    })
})

In [None]:
train_tags = Dataset.from_pandas(sup.drop(columns=['Star Rating', 'Course Difficulty']))
test_tags = Dataset.from_pandas(test.drop(columns=['Star Rating', 'Course Difficulty']))

tags_data = DatasetDict({
    'train': train_tags,
    'test': test_tags,
})

tags_data

DatasetDict({
    train: Dataset({
        features: ['Comment', 'gives_good_feedback', 'caring', 'respected', 'participation_matters', 'clear_grading_criteria', 'amazing_lectures', 'inspirational', 'tough_grader', 'hilarious', 'get_ready_to_read', 'lots_of_homework', 'accessible_outside_class', 'lecture_heavy', 'extra_credit', 'graded_by_few_things', 'group_projects', 'would_take_again', 'skip_class_you_wont_pass', 'test_heavy', 'so_many_papers', 'beware_of_pop_quizzes', 'tests_are_tough', '__index_level_0__'],
        num_rows: 6880
    })
    test: Dataset({
        features: ['Comment', 'gives_good_feedback', 'caring', 'respected', 'participation_matters', 'clear_grading_criteria', 'amazing_lectures', 'inspirational', 'tough_grader', 'hilarious', 'get_ready_to_read', 'lots_of_homework', 'accessible_outside_class', 'lecture_heavy', 'extra_credit', 'graded_by_few_things', 'group_projects', 'would_take_again', 'skip_class_you_wont_pass', 'test_heavy', 'so_many_papers', 'beware_of_pop_