## Before Start

An initial data exploration was done to first gain some insights. The classes look balanced in the train data at about 50% each, and most reviews are of length 100-200. Even after removing stopwords, the most common word (br) is still a stopword since it is just a break in the text, and does not have any meaning. It should be added to the dictionary to be removed.

In [69]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
data = pd.read_csv('/Users/kiriharari/Desktop/EE6405/BERT_TWEET/twitter_E6oV3lV.csv', header=None)
data = data.iloc[1:, 1:]
data.columns=['labels','text']
print(data.head())

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    stop_words.add('br')
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
    return ' '.join(filtered_text)

def preprocess_text(data):
    data['text'] = data['text'].apply(remove_stopwords)#Remove stopwords
    data['text'] = data['text'].apply(lambda x: re.sub('https?:\/\/.*[\r\n]*', ' ', x))#Remove URLs
    data['text'] = data['text'].apply(lambda x: re.sub('[^a-zA-Z0-9 \n]', ' ', x))#Remove non-alphanumeric characters
    data['text'] = data['text'].apply(lambda x: re.sub('@[\w]*', '', x))#Remove Twitter usernames
    data['text'] = data['text'].apply(lambda x: re.sub('\d+', ' ', x))#Remove digits
    data['text'] = data['text'].apply(lambda x: re.sub('user', '', x))#Remove Twitter usernames
    data['text'] = data['text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>1]))#Remove Short Words
    data['text'] = data['text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))#Remove punctuation
    data['text'] = data['text'].str.lower()
    return data
print("Class Distribution:")
print(data['labels'].value_counts(normalize=True))

  labels                                               text
1      0   @user when a father is dysfunctional and is s...
2      0  @user @user thanks for #lyft credit i can't us...
3      0                                bihday your majesty
4      0  #model   i love u take with u all the time in ...
5      0             factsguide: society now    #motivation
Class Distribution:
labels
0    0.929854
1    0.070146
Name: proportion, dtype: float64


In [70]:
from sklearn.utils import resample

class_0 = data[data.labels == '0']
class_1 = data[data.labels == '1']
majority_class = 0 if (len(class_0)) > (len(class_1)) else 1
minority_class = 1 - majority_class

#Downsamle
class_majority_downsampled = resample(class_0 if majority_class == 0 else class_1,
                                      replace=False,
                                      n_samples=len(class_1),
                                      random_state=42)
#Merge balanced data
balanced_data = pd.concat([class_majority_downsampled, class_1] if majority_class == 0 else [class_0, class_majority_downsampled])
#print(balanced_data['label'].value_counts())
print("Class Distribution:")
print(balanced_data['labels'].value_counts(normalize=True))
print(balanced_data)

Class Distribution:
labels
0    0.5
1    0.5
Name: proportion, dtype: float64
      labels                                               text
8825       0  #body to body massage with a   ending oil #mas...
31855      0   @user @ my call back!  #casting #castingcall ...
28080      0  help creates the #environment of #togetherness...
29215      0  summer with friendâ¨ð¥ #summer  #friend #li...
20026      0  follow me on snapchat at awesomecutenes7 #snap...
...      ...                                                ...
31935      1  lady banned from kentucky mall. @user  #jcpenn...
31947      1  @user omfg i'm offended! i'm a  mailbox and i'...
31948      1  @user @user you don't have the balls to hashta...
31949      1   makes you ask yourself, who am i? then am i a...
31961      1  @user #sikh #temple vandalised in in #calgary,...

[4484 rows x 2 columns]


In [71]:
from sklearn.model_selection import train_test_split
import numpy as np
train_data, test_data = train_test_split(balanced_data, test_size=0.2, random_state=42)
train_data = preprocess_text(train_data)
test_data = preprocess_text(test_data)
train_data.to_csv('train_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)

train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')

train_data.columns=['labels','text']
test_data.columns=['labels','text']

train_data.head()
train_data = train_data.dropna()
test_data = test_data.dropna()
train_data[train_data.isnull().values == True]
test_data[test_data.isnull().values == True]

Unnamed: 0,labels,text


In [72]:
from sklearn.utils import resample
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# train_data = pd.read_csv('/Users/kiriharari/Downloads/news_fulltrain.csv', header=None)
train_data.columns=['labels','text']
# train_data = train_data[(train_data['labels'] == 1) | (train_data['labels'] == 4)]
train_data['labels'].unique()
train_data_balanced = train_data.copy()


test_data.columns = ['labels', 'text']
# test_data = test_data[(test_data['labels'] == 1) | (test_data['labels'] == 4)]

test_data_balanced = test_data.copy()
test_data_balanced.head()
train_data_balanced.head()
# test_data.columns = ['label', 0, 'text', 1,2,3,4,5,6]
# test_data = test_data.loc[:, ['label', 'text']]

# train_data['text'] = train_data['text'].apply(remove_stopwords)
# test_data['text'] = test_data['text'].apply(remove_stopwords)

# #limit set to not run out of ram
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# def tokenize_and_encode(data):
#     inputs = tokenizer.batch_encode_plus(
#         data['text'].tolist(),
#         add_special_tokens=True,
#         max_length=128,
#         padding='max_length',
#         truncation=True,
#         return_tensors='pt'
#     )
#     labels = torch.tensor(data['label'].tolist())
#     return TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)

# train_dataset = tokenize_and_encode(train_data)
# test_dataset = tokenize_and_encode(test_data)

# train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=16)
# test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=16)
train_data_balanced = train_data_balanced.reset_index(drop=True)
test_data_balanced = test_data_balanced.reset_index(drop=True)

In [73]:
import torch
from datasets import Dataset, load_metric
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
def tokenize(data):
    tokenized_data = tokenizer(data["text"],
                     padding = "max_length",
                     truncation = True,
                     max_length=128,
                     return_tensors='pt')
    tokenized_data['labels'] = torch.tensor(data["labels"])
    return tokenized_data
# tokenized_train = train_data_balanced.map(tokenize, batched=True)
# tokenized_test = test_data_balanced.map(tokenize, batched=True)
config = AutoConfig.from_pretrained("bert-base-cased")
model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels = 2)

train_dataset = Dataset.from_pandas(train_data_balanced)
test_dataset = Dataset.from_pandas(test_data_balanced)

tokenized_train = train_dataset.map(tokenize, batched=True)
tokenized_test = test_dataset.map(tokenize, batched=True)

training_args = TrainingArguments(
    output_dir="./result",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,                     
    evaluation_strategy="epoch",
    learning_rate=5e-5,
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    f1 = load_metric("f1")
    precision = load_metric("precision")
    recall = load_metric("recall")
    accuracy = load_metric("accuracy")

    f1_score = f1.compute(predictions=predictions, references=labels, average='binary')
    precision_score = precision.compute(predictions=predictions, references=labels, average='binary')
    recall_score = recall.compute(predictions=predictions, references=labels, average='binary')
    accuracy_score = accuracy.compute(predictions=predictions, references=labels)

    return {
        "accuracy": accuracy_score['accuracy'],
        "f1": f1_score['f1'],
        "precision": precision_score['precision'],
        "recall": recall_score['recall']
    }
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3582 [00:00<?, ? examples/s]

Map:   0%|          | 0/895 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [74]:
print(len(set(tokenized_train['labels'])))

2


In [75]:
trainer.train()

  0%|          | 0/1344 [00:00<?, ?it/s]

  0%|          | 0/112 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.368895560503006, 'eval_accuracy': 0.8592178770949721, 'eval_f1': 0.859375, 'eval_precision': 0.8690744920993227, 'eval_recall': 0.8498896247240618, 'eval_runtime': 24.2523, 'eval_samples_per_second': 36.904, 'eval_steps_per_second': 4.618, 'epoch': 1.0}
{'loss': 0.3997, 'grad_norm': 0.10481197386980057, 'learning_rate': 3.1398809523809525e-05, 'epoch': 1.12}


  0%|          | 0/112 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.42266491055488586, 'eval_accuracy': 0.8726256983240224, 'eval_f1': 0.878723404255319, 'eval_precision': 0.8480492813141683, 'eval_recall': 0.9116997792494481, 'eval_runtime': 25.1202, 'eval_samples_per_second': 35.629, 'eval_steps_per_second': 4.459, 'epoch': 2.0}
{'loss': 0.2308, 'grad_norm': 0.4954136610031128, 'learning_rate': 1.2797619047619047e-05, 'epoch': 2.23}


  0%|          | 0/112 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.5378416180610657, 'eval_accuracy': 0.88268156424581, 'eval_f1': 0.8834628190899, 'eval_precision': 0.8883928571428571, 'eval_recall': 0.8785871964679912, 'eval_runtime': 30.0671, 'eval_samples_per_second': 29.767, 'eval_steps_per_second': 3.725, 'epoch': 3.0}
{'train_runtime': 1113.3128, 'train_samples_per_second': 9.652, 'train_steps_per_second': 1.207, 'train_loss': 0.26385126795087543, 'epoch': 3.0}


TrainOutput(global_step=1344, training_loss=0.26385126795087543, metrics={'train_runtime': 1113.3128, 'train_samples_per_second': 9.652, 'train_steps_per_second': 1.207, 'train_loss': 0.26385126795087543, 'epoch': 3.0})

In [76]:
trainer.evaluate()

  0%|          | 0/112 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.5378416180610657,
 'eval_accuracy': 0.88268156424581,
 'eval_f1': 0.8834628190899,
 'eval_precision': 0.8883928571428571,
 'eval_recall': 0.8785871964679912,
 'eval_runtime': 27.9163,
 'eval_samples_per_second': 32.06,
 'eval_steps_per_second': 4.012,
 'epoch': 3.0}

In [78]:
# 指定保存模型的路径
model_path = "/Users/kiriharari/Desktop/EE6405/BERT_TWEET/saved_model"

# 保存模型
model.save_pretrained(model_path)

# 同时，保存 tokenizer 到相同的路径
tokenizer.save_pretrained(model_path)


('/Users/kiriharari/Desktop/EE6405/BERT_TWEET/saved_model/tokenizer_config.json',
 '/Users/kiriharari/Desktop/EE6405/BERT_TWEET/saved_model/special_tokens_map.json',
 '/Users/kiriharari/Desktop/EE6405/BERT_TWEET/saved_model/vocab.txt',
 '/Users/kiriharari/Desktop/EE6405/BERT_TWEET/saved_model/added_tokens.json',
 '/Users/kiriharari/Desktop/EE6405/BERT_TWEET/saved_model/tokenizer.json')

In [79]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import numpy as np
from datasets import load_metric


model_path = "/Users/kiriharari/Desktop/EE6405/BERT_TWEET/saved_model"

# 加载模型
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# 加载 tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    f1 = load_metric("f1")
    precision = load_metric("precision")
    recall = load_metric("recall")
    accuracy = load_metric("accuracy")

    f1_score = f1.compute(predictions=predictions, references=labels, average='binary')
    precision_score = precision.compute(predictions=predictions, references=labels, average='binary')
    recall_score = recall.compute(predictions=predictions, references=labels, average='binary')
    accuracy_score = accuracy.compute(predictions=predictions, references=labels)

    return {
        "accuracy": accuracy_score['accuracy'],
        "f1": f1_score['f1'],
        "precision": precision_score['precision'],
        "recall": recall_score['recall']
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics
)

trainer.evaluate()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/112 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.5378416180610657,
 'eval_accuracy': 0.88268156424581,
 'eval_f1': 0.8834628190899,
 'eval_precision': 0.8883928571428571,
 'eval_recall': 0.8785871964679912,
 'eval_runtime': 24.0661,
 'eval_samples_per_second': 37.189,
 'eval_steps_per_second': 4.654}

## This is a SAMPLE TEXT