## Before Start

An initial data exploration was done to first gain some insights. The classes look balanced in the train data at about 50% each, and most reviews are of length 100-200. Even after removing stopwords, the most common word (br) is still a stopword since it is just a break in the text, and does not have any meaning. It should be added to the dictionary to be removed.

In [3]:
import pandas as pd
train_data = pd.read_csv('/Users/kiriharari/Desktop/EE6405/Movie/movie_train.csv')
train_data.columns=['text','labels']
train_data.head()

Unnamed: 0,text,labels
0,"Incredibly intriguing and captivating, I found...",1
1,"Great movie! oh yeah! Full of energy, full of ...",1
2,I couldn't believe it when I put this movie in...,0
3,Not much to say on this one. A plot you can pr...,0
4,With the dialogue in the dubbed version of thi...,0


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from nltk.tokenize import word_tokenize

# train_data = pd.read_csv('/content/gdrive/MyDrive/news_fulltrain.csv', header=None)
# print(train_data.head())

print("Class Distribution:")
print(train_data['label'].value_counts(normalize=True))

review_lengths = train_data['text'].apply(lambda x: len(word_tokenize(x)))
plt.figure(figsize=(9, 6))
plt.hist(review_lengths, bins=50, alpha=0.7)
plt.title('Distribution of Review Lengths')
plt.xlabel('Reviews Length')
plt.ylabel('Reviews Num')
plt.show()

stop_words = set(stopwords.words('english'))
all_words = [word for review in train_data['text'] for word in word_tokenize(review.lower()) if word.isalpha() and word not in stop_words]

word_freq = Counter(all_words).most_common(20)

words, frequencies = zip(*word_freq)
plt.figure(figsize=(9, 6))
plt.bar(words, frequencies)
plt.title('Top 20 Common Words (Before removing br)')
plt.xticks(rotation=45)
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.show()

stop_words = set(stopwords.words('english'))
stop_words.add('br')
all_words = [word for review in train_data['text'] for word in word_tokenize(review.lower()) if word.isalpha() and word not in stop_words]

word_freq = Counter(all_words).most_common(20)

words, frequencies = zip(*word_freq)
plt.figure(figsize=(10, 6))
plt.bar(words, frequencies)
plt.title('Top 20 Common Words (After removing br)')
plt.xticks(rotation=45)
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.show()


In [14]:
from sklearn.utils import resample
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

train_data = pd.read_csv('/Users/kiriharari/Desktop/EE6405/Movie/movie_train.csv')
train_data.columns=['text','labels']
train_data['labels'].unique()

class_0 = train_data[train_data.labels == 0]
class_1 = train_data[train_data.labels == 1]
majority_class = 0 if (len(class_0)) > (len(class_1)) else 1
minority_class = 1 - majority_class

#Downsamle
class_majority_downsampled = resample(class_0 if majority_class == 0 else class_1,
                                      replace=False,
                                      n_samples=len(class_1),
                                      random_state=42)
#Merge balanced data
balanced_data = pd.concat([class_majority_downsampled, class_1] if majority_class == 0 else [class_0, class_majority_downsampled])

train_balance_data = balanced_data.reset_index(drop=True)

test_data = pd.read_csv('/Users/kiriharari/Desktop/EE6405/Movie/movie_test.csv')

test_data.columns = ['text', 'labels']
class_0 = test_data[test_data.labels == 0]
class_1 = test_data[test_data.labels == 1]
majority_class = 0 if (len(class_0)) > (len(class_1)) else 1
minority_class = 1 - majority_class

#Downsamle
class_majority_downsampled = resample(class_0 if majority_class == 0 else class_1,
                                      replace=False,
                                      n_samples=len(class_1),
                                      random_state=42)
#Merge balanced data
balanced_data = pd.concat([class_majority_downsampled, class_1] if majority_class == 0 else [class_0, class_majority_downsampled])

test_balance_data = balanced_data.reset_index(drop=True)
print(test_balance_data.shape)
train_balance_data.to_csv("balanced_train.csv")
test_balance_data.to_csv("balanced_test.csv")

(7938, 2)


In [15]:
import torch
from datasets import Dataset, load_metric
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
def tokenize(data):
    tokenized_data = tokenizer(data["text"],
                     padding = "max_length",
                     truncation = True,
                     max_length=128,
                     return_tensors='pt')
    tokenized_data['labels'] = torch.tensor(data["labels"])
    return tokenized_data
# tokenized_train = train_data_balanced.map(tokenize, batched=True)
# tokenized_test = test_data_balanced.map(tokenize, batched=True)
config = AutoConfig.from_pretrained("bert-base-cased")
model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels = 2)

train_dataset = Dataset.from_pandas(train_balance_data)
test_dataset = Dataset.from_pandas(test_balance_data)

tokenized_train = train_dataset.map(tokenize, batched=True)
tokenized_test = test_dataset.map(tokenize, batched=True)

training_args = TrainingArguments(
    output_dir="./result",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,                     
    evaluation_strategy="epoch",
    learning_rate=5e-5,
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    f1 = load_metric("f1")
    precision = load_metric("precision")
    recall = load_metric("recall")
    accuracy = load_metric("accuracy")

    f1_score = f1.compute(predictions=predictions, references=labels, average='binary')
    precision_score = precision.compute(predictions=predictions, references=labels, average='binary')
    recall_score = recall.compute(predictions=predictions, references=labels, average='binary')
    accuracy_score = accuracy.compute(predictions=predictions, references=labels)

    return {
        "accuracy": accuracy_score['accuracy'],
        "f1": f1_score['f1'],}
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/32000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7938 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [16]:
trainer.train()

  0%|          | 0/12000 [00:00<?, ?it/s]

{'loss': 0.5031, 'grad_norm': 5.5322089195251465, 'learning_rate': 4.791666666666667e-05, 'epoch': 0.12}
{'loss': 0.4444, 'grad_norm': 7.926072597503662, 'learning_rate': 4.5833333333333334e-05, 'epoch': 0.25}
{'loss': 0.3981, 'grad_norm': 6.4934868812561035, 'learning_rate': 4.375e-05, 'epoch': 0.38}
{'loss': 0.4005, 'grad_norm': 16.78427505493164, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.5}
{'loss': 0.396, 'grad_norm': 9.174877166748047, 'learning_rate': 3.958333333333333e-05, 'epoch': 0.62}
{'loss': 0.3846, 'grad_norm': 8.750117301940918, 'learning_rate': 3.7500000000000003e-05, 'epoch': 0.75}
{'loss': 0.3892, 'grad_norm': 10.195127487182617, 'learning_rate': 3.541666666666667e-05, 'epoch': 0.88}
{'loss': 0.3826, 'grad_norm': 18.408754348754883, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.0}


  0%|          | 0/993 [00:00<?, ?it/s]

  f1 = load_metric("f1")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.3793993890285492, 'eval_accuracy': 0.8652053413958176, 'eval_f1': 0.8701456310679612, 'eval_runtime': 264.9472, 'eval_samples_per_second': 29.961, 'eval_steps_per_second': 3.748, 'epoch': 1.0}
{'loss': 0.3178, 'grad_norm': 13.15282154083252, 'learning_rate': 3.125e-05, 'epoch': 1.12}
{'loss': 0.2959, 'grad_norm': 0.9394056797027588, 'learning_rate': 2.916666666666667e-05, 'epoch': 1.25}
{'loss': 0.3029, 'grad_norm': 0.46596112847328186, 'learning_rate': 2.7083333333333332e-05, 'epoch': 1.38}
{'loss': 0.2975, 'grad_norm': 0.2115456610918045, 'learning_rate': 2.5e-05, 'epoch': 1.5}
{'loss': 0.3103, 'grad_norm': 25.81688117980957, 'learning_rate': 2.2916666666666667e-05, 'epoch': 1.62}
{'loss': 0.3016, 'grad_norm': 0.06070864200592041, 'learning_rate': 2.0833333333333336e-05, 'epoch': 1.75}
{'loss': 0.2899, 'grad_norm': 8.122897148132324, 'learning_rate': 1.8750000000000002e-05, 'epoch': 1.88}
{'loss': 0.2805, 'grad_norm': 8.181387901306152, 'learning_rate': 1.666666666666

  0%|          | 0/993 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.4966689348220825, 'eval_accuracy': 0.8677248677248677, 'eval_f1': 0.8635654885654886, 'eval_runtime': 230.6883, 'eval_samples_per_second': 34.41, 'eval_steps_per_second': 4.305, 'epoch': 2.0}
{'loss': 0.1749, 'grad_norm': 45.78474807739258, 'learning_rate': 1.4583333333333335e-05, 'epoch': 2.12}
{'loss': 0.1606, 'grad_norm': 0.10074486583471298, 'learning_rate': 1.25e-05, 'epoch': 2.25}
{'loss': 0.1845, 'grad_norm': 88.7400894165039, 'learning_rate': 1.0416666666666668e-05, 'epoch': 2.38}
{'loss': 0.1663, 'grad_norm': 0.10826538503170013, 'learning_rate': 8.333333333333334e-06, 'epoch': 2.5}
{'loss': 0.174, 'grad_norm': 0.194576695561409, 'learning_rate': 6.25e-06, 'epoch': 2.62}
{'loss': 0.1643, 'grad_norm': 9.042320251464844, 'learning_rate': 4.166666666666667e-06, 'epoch': 2.75}
{'loss': 0.1665, 'grad_norm': 0.11958027631044388, 'learning_rate': 2.0833333333333334e-06, 'epoch': 2.88}
{'loss': 0.1431, 'grad_norm': 0.15002697706222534, 'learning_rate': 0.0, 'epoch': 3.

  0%|          | 0/993 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.5846240520477295, 'eval_accuracy': 0.8751574703955657, 'eval_f1': 0.8760165144501438, 'eval_runtime': 203.2291, 'eval_samples_per_second': 39.059, 'eval_steps_per_second': 4.886, 'epoch': 3.0}
{'train_runtime': 11145.5319, 'train_samples_per_second': 8.613, 'train_steps_per_second': 1.077, 'train_loss': 0.2928744951883952, 'epoch': 3.0}


TrainOutput(global_step=12000, training_loss=0.2928744951883952, metrics={'train_runtime': 11145.5319, 'train_samples_per_second': 8.613, 'train_steps_per_second': 1.077, 'train_loss': 0.2928744951883952, 'epoch': 3.0})

In [17]:
trainer.evaluate()

  0%|          | 0/993 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.5846240520477295,
 'eval_accuracy': 0.8751574703955657,
 'eval_f1': 0.8760165144501438,
 'eval_runtime': 172.513,
 'eval_samples_per_second': 46.014,
 'eval_steps_per_second': 5.756,
 'epoch': 3.0}

In [18]:
# 指定保存模型的路径
model_path = "/Users/kiriharari/Desktop/EE6405/Movie/saved_model"

# 保存模型
model.save_pretrained(model_path)

# 同时，保存 tokenizer 到相同的路径
tokenizer.save_pretrained(model_path)


('/Users/kiriharari/Desktop/EE6405/Movie/saved_model/tokenizer_config.json',
 '/Users/kiriharari/Desktop/EE6405/Movie/saved_model/special_tokens_map.json',
 '/Users/kiriharari/Desktop/EE6405/Movie/saved_model/vocab.txt',
 '/Users/kiriharari/Desktop/EE6405/Movie/saved_model/added_tokens.json',
 '/Users/kiriharari/Desktop/EE6405/Movie/saved_model/tokenizer.json')

In [19]:
import torch
from datasets import Dataset, load_metric
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
model_path = "/Users/kiriharari/Desktop/EE6405/Movie/saved_model"

# 加载模型
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# 加载 tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    f1 = load_metric("f1")
    precision = load_metric("precision")
    recall = load_metric("recall")
    accuracy = load_metric("accuracy")

    f1_score = f1.compute(predictions=predictions, references=labels, average='binary')
    precision_score = precision.compute(predictions=predictions, references=labels, average='binary')
    recall_score = recall.compute(predictions=predictions, references=labels, average='binary')
    accuracy_score = accuracy.compute(predictions=predictions, references=labels)

    return {
        "accuracy": accuracy_score['accuracy'],
        "f1": f1_score['f1'],
        "precision": precision_score['precision'],
        "recall": recall_score['recall']
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics
)

trainer.evaluate()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/993 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.5846240520477295,
 'eval_accuracy': 0.8751574703955657,
 'eval_f1': 0.8760165144501438,
 'eval_precision': 0.8700298210735586,
 'eval_recall': 0.8820861678004536,
 'eval_runtime': 170.588,
 'eval_samples_per_second': 46.533,
 'eval_steps_per_second': 5.821}

In [20]:
test_string_satire = '''
When so many actors seem content to churn out performances for a quick paycheck, a performer who adheres to his principles really stands out. Thats why Jeff Bridges made waves this week when he announced that from now on, he will only perform nude scenes. In an interview in this months GQ, the Big Lebowski star made it clear that he was more than ready to move on to a new phase in his career, leaving his clothed roles in the past. Ive been there and Ive done that, said Bridges, rattling off a laundry list of the films hes appeared in covered up. Now, I can finally afford to only take on roles that excite me. Right now, those are roles with nude scenes. Why waste my time with anything else? Powerful. Though he made it clear that he doesnt regret his previous non-nude roles, Jeff admitted that hed always struggled with pressure from directors and studios to stay clothed on camera. No more towels; no more bathrobes; no more carefully placed plants, he added. Even if my character isnt written as nude, any director I work with will have to figure out how to make him that way. Itll be a challenge for both of us, and one I cant wait to tackle. For their part, Jeffs fans have been nothing but supportive. Wow! Whether or not you agree with him, youve got to have respect for a Hollywood star with that much conviction. You keep doing you, Jeff!
'''

test_string_reliable = '''
The Alberta province health minister wants to know if swine flu shots were 'inappropriately diverted' to the Calgary Flames while thousands had to stand in line for hours for the vaccine. Alberta Health Minister Ron Liepert says he doesn't know where the NHL team got the vaccine, adding that Alberta Health Services is the only supplier in the province. Team president Ken King says the club contacted the department and asked for the clinic. Health officials have begun an investigation into the special clinic, which was held for the players and their families last Friday. Liepert says the vaccine would be diverted only with the approval of the chief medical officer of health, but he doesn't know if that was the case. Alberta's opposition parties say professional ice hockey players shouldn't be getting the vaccine ahead of cancer patients and pregnant women.
'''
training_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=1  # 设置为1因为我们只预测一个样本
)
trainer = Trainer(model=model, args=training_args)
print("Satire")
satire_input_x = pd.DataFrame({'labels':[0], 'text': test_string_satire})

reliable_input_x = pd.DataFrame({'labels':[0], 'text': test_string_reliable})

satire_dataset = Dataset.from_pandas(satire_input_x)
reliable_dataset = Dataset.from_pandas(reliable_input_x)

satire_input_x = satire_dataset.map(tokenize, batched=True)
reliable_input_x = reliable_dataset.map(tokenize, batched=True)

# satire_input_x['labels'] = torch.tensor(0)
# Check the number of tokens
# print("Input IDs shape:", satire_input_x['input_ids'].shape)  # 应显示 (1, 128) 或其他类似批处理形状
# print("Input IDs shape:", tokenized_test['input_ids'].shape) 
predictions = trainer.predict(satire_input_x)


logits = predictions.predictions
probs = torch.softmax(torch.tensor(logits), dim=-1)
predictions = torch.argmax(probs, dim=-1)

print("Probabilities:", probs)
print("Predictions:", predictions)

print("Reliable")

predictions = trainer.predict(reliable_input_x)


logits = predictions.predictions
probs = torch.softmax(torch.tensor(logits), dim=-1)
predictions = torch.argmax(probs, dim=-1)

print("Probabilities:", probs)
print("Predictions:", predictions)

Satire


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Probabilities: tensor([[0.9943, 0.0057]])
Predictions: tensor([0])
Reliable


  0%|          | 0/1 [00:00<?, ?it/s]

Probabilities: tensor([[0.2309, 0.7691]])
Predictions: tensor([1])
