In [262]:
# !pip install transformers
# !pip install datasets
# !pip install transformers[torch]
# !pip install accelerate -U
# !pip install evaluate

import numpy as np
import pandas as pd

import evaluate
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

In [263]:
# Read the labelled tweets from the CSV sitting next to the notebook and
# preview the first five rows.
df = pd.read_csv(filepath_or_buffer="labeled_data.csv")
df.head(n=5)

Unnamed: 0,column_a,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [264]:
# (rows, columns) of the frame as loaded from the CSV.
df.shape

(24783, 7)

In [265]:
# Keep only the tweet text and its class id, under the column names the rest
# of the notebook uses ("text" / "label").
#
# Renaming first and selecting second returns a fresh DataFrame, which avoids
# the SettingWithCopyWarning that an in-place rename on a column slice
# triggers. It also makes the cell safe to re-run: rename() silently skips
# column names that are no longer present, so a second execution is a no-op
# instead of a KeyError on 'tweet' / 'class'.
df = df.rename(columns={'class': 'label', 'tweet': 'text'})[['text', 'label']]

In [266]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)


In [267]:
# Size of the training portion of the split.
df_train.shape

(19826, 2)

In [268]:
# Size of the held-out test portion of the split.
df_test.shape

(4957, 2)

In [269]:
# Reuse the split computed earlier (df_train / df_test) instead of calling
# train_test_split a second time with identical arguments. This guarantees
# the frames wrapped below are exactly the ones whose shapes were inspected.
train_df, test_df = df_train, df_test

# Wrap each pandas frame in a Hugging Face Dataset. from_pandas keeps the
# DataFrame index as an extra "__index_level_0__" column, which a later cell
# removes after tokenization.
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Bundle both splits under the conventional 'train' / 'test' keys.
dataset = DatasetDict({'train': train_dataset, 'test': test_dataset})

In [270]:
# Peek at the first training record to see which fields from_pandas produced.
dataset['train'][0]

{'text': 'RT @FunSizedYogi: @TheBlackVoice well how else will white ppl get us to forget our horrific past other than to paint a pretty picture of ho&#8230;',
 'label': 0,
 '__index_level_0__': 15272}

In [271]:
# Tokenizer for the "bert-base-cased" checkpoint, the same checkpoint the
# classification model is loaded from further down.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    """Tokenize the "text" field of a batch with the module-level tokenizer.

    Args:
        examples: Mapping that exposes the raw tweets under the "text" key
            (the batch object passed in by ``Dataset.map``).

    Returns:
        The tokenizer's output for those texts, padded to the maximum length
        and truncated where necessary.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

# Tokenize both splits in batches, train first and then test. Each result
# keeps the original columns alongside the tokenizer's output.
tokenized_train_dataset, tokenized_test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

Map:   0%|          | 0/19826 [00:00<?, ? examples/s]

Map:   0%|          | 0/4957 [00:00<?, ? examples/s]

In [272]:
# Drop the leftover pandas index column from both tokenized splits.
tokenized_train_dataset = tokenized_train_dataset.remove_columns('__index_level_0__')
tokenized_test_dataset = tokenized_test_dataset.remove_columns('__index_level_0__')

In [273]:
# Pull the raw tweet texts and the labels (as a NumPy array) out of the
# tokenized training split.
# NOTE(review): neither name is read again in the cells visible here —
# confirm they are still needed before keeping this cell.
tweets = tokenized_train_dataset['text']
labels = np.array(tokenized_train_dataset["label"])

In [274]:
# AutoModelForSequenceClassification is imported in the notebook's top import
# cell so that all dependencies are declared in one place.
#
# Load the pretrained "bert-base-cased" encoder (same checkpoint as the
# tokenizer) with a sequence-classification head sized for 3 labels. The head
# weights are not part of the checkpoint and start out randomly initialized,
# which is what the "newly initialized" message below refers to.
# NOTE(review): the 3 labels presumably correspond to the hate_speech /
# offensive_language / neither columns of the CSV — confirm the id mapping.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [275]:
# Accuracy metric from the `evaluate` library; compute_metrics calls its
# compute() method on every evaluation pass.
metric = evaluate.load("accuracy")

In [276]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: A pair ``(logits, labels)``. The predicted class for each
            example is the position of the largest value along the last axis
            of ``logits``.

    Returns:
        Whatever ``metric.compute`` returns when the predicted classes are
        compared against ``labels``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(references=references, predictions=predicted_classes)

In [277]:
# TrainingArguments and Trainer are imported in the notebook's top import
# cell so that all dependencies are declared in one place.
#
# Training configuration:
#   - output_dir: directory the Trainer writes its outputs to.
#   - evaluation_strategy="epoch": run evaluation once per epoch (one row per
#     epoch in the results table printed by train()).
#   - remove_unused_columns=False: stop the Trainer from dropping dataset
#     columns on its own.
#     NOTE(review): the tokenized splits still carry the raw "text" column;
#     confirm this flag is intentional rather than a workaround.
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    remove_unused_columns=False,
)

In [278]:
# Wire the model, its configuration, the tokenized splits and the metric
# callback into a single Trainer instance.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_test_dataset,
    train_dataset=tokenized_train_dataset,
)

In [279]:
# Fine-tune the model. In the recorded run this took 3 epochs (7437 steps,
# roughly 6050 s), with evaluation on the test split after every epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2957,0.297049,0.91043
2,0.2527,0.342799,0.908412


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2957,0.297049,0.91043
2,0.2527,0.342799,0.908412
3,0.2016,0.361427,0.909018


TrainOutput(global_step=7437, training_loss=0.2721494649292337, metrics={'train_runtime': 6051.5304, 'train_samples_per_second': 9.829, 'train_steps_per_second': 1.229, 'total_flos': 1.5649459859625984e+16, 'train_loss': 0.2721494649292337, 'epoch': 3.0})

In [281]:
# Persist the fine-tuned model to 'saved_model'. The Trainer in this notebook
# was constructed without a tokenizer, so save_model() on its own would leave
# the directory without tokenizer files. Saving the tokenizer alongside the
# weights makes the directory self-contained, so it can later be reloaded with
# from_pretrained('saved_model') for both model and tokenizer.
trainer.save_model('saved_model')
tokenizer.save_pretrained('saved_model');  # trailing ';' hides the returned file list