In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [2]:
torch.cuda.is_available()

True

In [3]:
torch.cuda.device_count()

1

In [4]:
torch.cuda.device(torch.cuda.current_device())

<torch.cuda.device at 0x226030e2350>

In [5]:
torch.cuda.get_device_name(torch.cuda.current_device())

'NVIDIA GeForce RTX 2060'

In [6]:
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=512, return_tensors='pt')
        item = {key: val.squeeze() for key, val in encoding.items()}
        item['labels'] = torch.tensor(label)
        return item

In [7]:
train_df = pd.read_csv("data/train.csv", encoding='ISO-8859-1')
val_df = pd.read_csv("data/test.csv", encoding='ISO-8859-1')

In [8]:
train_df

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26
...,...,...,...,...,...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,night,31-45,Ghana,31072940,227540.0,137
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,morning,46-60,Greece,10423054,128900.0,81
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,noon,60-70,Grenada,112523,340.0,331
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,night,70-100,Guatemala,17915568,107160.0,167


In [9]:
train_df = train_df[['text','sentiment']]

train_df.loc[train_df['sentiment'] == 'neutral', 'sentiment'] = 1
train_df.loc[train_df['sentiment'] == 'negative', 'sentiment'] = 0
train_df.loc[train_df['sentiment'] == 'positive', 'sentiment'] = 2

train_df = train_df.rename(columns={'sentiment':'label'}).reset_index(drop=True)

train_df = train_df.dropna()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.loc[train_df['sentiment'] == 'neutral', 'sentiment'] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.loc[train_df['sentiment'] == 'negative', 'sentiment'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.loc[train_df['sentiment'] == 'positive', 'sentiment'] = 2


In [10]:
val_df

Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45,Algeria,43851044.0,2381740.0,18.0
3,01082688c6,happy bday!,positive,morning,46-60,Andorra,77265.0,470.0,164.0
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70,Angola,32866272.0,1246700.0,26.0
...,...,...,...,...,...,...,...,...,...
4810,,,,,,,,,
4811,,,,,,,,,
4812,,,,,,,,,
4813,,,,,,,,,


In [11]:
val_df = val_df[['text','sentiment']]

val_df.loc[val_df['sentiment'] == 'neutral', 'sentiment'] = 1
val_df.loc[val_df['sentiment'] == 'negative', 'sentiment'] = 0
val_df.loc[val_df['sentiment'] == 'positive', 'sentiment'] = 2

val_df = val_df.rename(columns={'sentiment':'label'})

val_df = val_df.dropna().reset_index(drop=True)

val_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df.loc[val_df['sentiment'] == 'neutral', 'sentiment'] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df.loc[val_df['sentiment'] == 'negative', 'sentiment'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df.loc[val_df['sentiment'] == 'positive', 'sentiment'] = 2


Unnamed: 0,text,label
0,Last session of the day http://twitpic.com/67ezh,1
1,Shanghai is also really exciting (precisely -...,2
2,"Recession hit Veronique Branquinho, she has to...",0
3,happy bday!,2
4,http://twitpic.com/4w75p - I like it!!,2
...,...,...
3529,"its at 3 am, im very tired but i can`t sleep ...",0
3530,All alone in this old house again. Thanks for...,2
3531,I know what you mean. My little dog is sinkin...,0
3532,_sutra what is your next youtube video gonna b...,2


In [12]:
model_names = [
    "bert-base-uncased",
    "distilbert-base-uncased",
    "roberta-base",
    "xlnet-base-cased",
    "albert-base-v2",
    "google/electra-base-discriminator"
]
model_names

['bert-base-uncased',
 'distilbert-base-uncased',
 'roberta-base',
 'xlnet-base-cased',
 'albert-base-v2',
 'google/electra-base-discriminator']

In [13]:
model_names = [
    "bert-base-uncased"
]
model_names

['bert-base-uncased']

In [None]:
model_names = [
    "distilbert-base-uncased",
    "google/electra-base-discriminator",
]
model_names

In [14]:
model_names = [
    "distilbert-base-uncased",
    "google/electra-base-discriminator",
    "albert-base-v2"
]
model_names

['distilbert-base-uncased', 'google/electra-base-discriminator']

In [15]:
results_df = pd.DataFrame(columns=["Model", "Accuracy", "F1", "Precision", "Recall"])

results_df

Unnamed: 0,Model,Accuracy,F1,Precision,Recall


In [None]:
for model_name in model_names:
    print(f"Training and evaluating {model_name}...")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

    train_dataset = SentimentDataset(train_df['text'].tolist(), train_df['label'].tolist(), tokenizer)
    val_dataset = SentimentDataset(val_df['text'].tolist(), val_df['label'].tolist(), tokenizer)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args = TrainingArguments(
        output_dir=f'./results/{model_name.replace("/", "_")}',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=f'./logs/{model_name.replace("/", "_")}',
        logging_steps=10,
        evaluation_strategy="steps",
        eval_steps=50,
        save_steps=50,
        save_total_limit=2,
    )

    def compute_metrics(p):
        preds = p.predictions.argmax(-1)
        labels = p.label_ids
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted', zero_division = np.nan)
        acc = accuracy_score(labels, preds)
        return {
            'accuracy': acc,
            'f1': f1,
            'precision': precision,
            'recall': recall
        }

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    trainer.train()

    eval_results = trainer.evaluate()
    print(f"Evaluation results for {model_name}: {eval_results}")

    results_df = results_df.append({
        "Model": model_name,
        "Accuracy": eval_results['eval_accuracy'],
        "F1": eval_results['eval_f1'],
        "Precision": eval_results['eval_precision'],
        "Recall": eval_results['eval_recall']
    }, ignore_index=True)

    model.save_pretrained(f'./fine_tuned_models/{model_name.replace("/", "_")}')
    tokenizer.save_pretrained(f'./fine_tuned_models/{model_name.replace("/", "_")}')

Training and evaluating distilbert-base-uncased...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,1.0907,1.086072,0.44652,0.351076,0.477868,0.44652
100,1.0479,1.055145,0.490662,0.40822,0.491301,0.490662
150,1.0035,0.908263,0.57725,0.509987,0.71054,0.57725
200,0.7332,0.716027,0.694397,0.680043,0.721491,0.694397
250,0.6297,0.656054,0.733729,0.734638,0.744472,0.733729
300,0.5991,0.677797,0.731466,0.72867,0.732085,0.731466
350,0.6957,0.657604,0.72043,0.721379,0.742957,0.72043
400,0.6003,0.649419,0.728353,0.728279,0.736579,0.728353
450,0.5343,0.666129,0.747878,0.746313,0.748557,0.747878
500,0.5726,0.665968,0.745614,0.74698,0.755824,0.745614


In [None]:
print(results_df)