## Read Excel file

In [1]:
import pandas as pd
# Load the labelled jokes workbook into a DataFrame and preview its first five rows.
df = pd.read_excel(io="../Humour_style.xlsx")
df.head(n=5)


Unnamed: 0,JOKES,LABELS
0,"They say good things take time, so that’s why ...",0
1,“The only way to keep your health is to eat wh...,0
2,“I do a thing called ‘what I want.’”,0
3,“The secret of staying young is to live honest...,0
4,“Weak people revenge. Strong people forgive. I...,0


## Process text data for the Hugging Face Transformers library

In [2]:
# AutoTokenizer resolves a checkpoint name to the matching tokenizer class
# (the training log below reports a BertTokenizerFast for this checkpoint).
from transformers import AutoTokenizer

# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint; process_data() below uses it.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') 

##### Processing Data Method

The `process_data(instance)` function takes one row of data as input, normalises the whitespace of the text in the 'JOKES' column, and returns the tokenized and encoded version of that text together with the cleaned text and the row's label from the 'LABELS' column.

In [3]:
def process_data(instance):
    """Turn one dataset row into tokenized model inputs.

    Args:
        instance: Mapping-like row (dict or pandas Series) providing the
            'JOKES' text and the 'LABELS' value.

    Returns:
        The tokenizer output for the cleaned text, padded/truncated to 512
        tokens, with two extra entries: 'label' (the row's 'LABELS' value,
        unchanged) and 'text' (the cleaned text).
    """
    # Coerce to str, then collapse every run of whitespace into a single
    # space (this also strips leading/trailing whitespace).
    joke = ' '.join(str(instance['JOKES']).split())

    features = tokenizer(
        joke,
        padding="max_length",
        truncation=True,
        max_length=512,
    )

    # Keep the target and the cleaned source text next to the encoded inputs.
    features['label'] = instance['LABELS']
    features['text'] = joke

    return features

In [4]:
# Smoke-test process_data on a hand-written row and print the resulting encoding.
# Note that LABELS is passed as the string '1' in this example.
print(process_data({
    'JOKES': 'I am too fat today',
    'LABELS': '1'
}))

{'input_ids': [101, 1045, 2572, 2205, 6638, 2651, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [5]:
# Encode the first 1263 rows of the frame (or all of them, if it is shorter),
# one row at a time.
processed_data = [
    process_data(df.iloc[row_idx])
    for row_idx in range(len(df[:1263]))
]

## Generate the dataset

In [6]:
from sklearn.model_selection import train_test_split

# One DataFrame row per processed joke, then a fixed-seed 80/20 train/validation split.
new_df = pd.DataFrame(processed_data)
train_df, valid_df = train_test_split(new_df, test_size=0.2, random_state=2022)

In [7]:
import pyarrow as pa
from datasets import Dataset

# Build the Hugging Face datasets through the public `Dataset.from_pandas`
# constructor instead of wrapping a raw pyarrow table by hand.
# `preserve_index=False` drops the shuffled pandas index left behind by
# train_test_split, so it is not carried along as an extra
# `__index_level_0__` column.
train_hg = Dataset.from_pandas(train_df, preserve_index=False)
valid_hg = Dataset.from_pandas(valid_df, preserve_index=False)

In [8]:
# Sanity check: confirm the conversion produced a `datasets` Dataset object.
print(type(train_hg))

<class 'datasets.arrow_dataset.Dataset'>


## Create a model

In [9]:
from transformers import AutoModelForSequenceClassification

# Load pretrained BERT with a sequence-classification head sized for five
# output classes (get_prediction() further down maps label ids 0-4 to humour
# styles). The classifier weights are newly initialised, as the warning printed
# by this cell states, so the model must be fine-tuned before use.
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=5
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Install or upgrade `accelerate` in the kernel's environment (unpinned: -U takes the latest release).
# NOTE(review): presumably required by the Trainer cell that follows — confirm, and consider pinning a version.
%pip install accelerate -U

In [10]:
import torch
from transformers import TrainingArguments, Trainer

# Default training configuration: outputs go to ./result and the evaluation
# dataset is scored at the end of every epoch.
training_args = TrainingArguments(output_dir="./result", evaluation_strategy="epoch")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_hg,
    eval_dataset=valid_hg,
    tokenizer=tokenizer
)

# No manual device move here: `Trainer` is not an nn.Module and has no `.to()`
# method, so calling `trainer.to('cuda')` raised AttributeError on every
# machine with a GPU. The Trainer places the model on `training_args.device`
# itself, which is the GPU whenever one is available.





## Train and Evaluate the model

In [11]:
# Fine-tune the model; the returned TrainOutput carries the final step count, training loss and run metrics.
trainer.train()

  0%|          | 0/381 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.619380533695221, 'eval_runtime': 372.6553, 'eval_samples_per_second': 0.679, 'eval_steps_per_second': 0.086, 'epoch': 1.0}


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.5555532574653625, 'eval_runtime': 366.5049, 'eval_samples_per_second': 0.69, 'eval_steps_per_second': 0.087, 'epoch': 2.0}


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.5708801746368408, 'eval_runtime': 283.1333, 'eval_samples_per_second': 0.894, 'eval_steps_per_second': 0.113, 'epoch': 3.0}
{'train_runtime': 12667.2245, 'train_samples_per_second': 0.239, 'train_steps_per_second': 0.03, 'train_loss': 0.519550193326054, 'epoch': 3.0}


TrainOutput(global_step=381, training_loss=0.519550193326054, metrics={'train_runtime': 12667.2245, 'train_samples_per_second': 0.239, 'train_steps_per_second': 0.03, 'train_loss': 0.519550193326054, 'epoch': 3.0})

In [12]:
# Re-run evaluation on the trainer's eval dataset; returns a dict of metrics (loss, runtime, throughput, epoch).
trainer.evaluate()

  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.5708801746368408,
 'eval_runtime': 297.7402,
 'eval_samples_per_second': 0.85,
 'eval_steps_per_second': 0.107,
 'epoch': 3.0}

## Save the model

In [13]:
# Persist the model to ./model/ so it can be reloaded with from_pretrained() in the next section.
# NOTE(review): only the model is saved here; the tokenizer is not written to ./model/.
model.save_pretrained('./model/')

## Load the model

In [11]:
from transformers import AutoModelForSequenceClassification

# Prefer the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Reload the saved model from ./model/ and move it onto the chosen device.
new_model = AutoModelForSequenceClassification.from_pretrained('./model/')
new_model = new_model.to(device)

In [12]:
from transformers import AutoTokenizer

# The tokenizer is reloaded from the original 'bert-base-uncased' checkpoint name, not from ./model/.
new_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

## Get predictions

In [13]:
import torch
import numpy as np

def get_prediction(text):
    """Classify the humour style of a single piece of text.

    Args:
        text: The joke / sentence to classify (one string, not a batch).

    Returns:
        dict: ``{'style': <style name>, 'probability': <softmax probability of
        that style>}`` for the highest-scoring class. Every class, including
        'Self-enhancing', is reported under the same 'style' key.
    """
    # Label id -> humour style name, in the order used by the five-way classifier.
    style_names = ('Self-enhancing', 'Self-deprecating', 'affiliative', 'aggressive', 'Neutral')

    encoding = new_tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
    # Inputs must sit on the same device as the model that consumes them
    # (`new_model`); do not depend on the separate `trainer` object for this.
    encoding = {k: v.to(new_model.device) for k, v in encoding.items()}

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        logits = new_model(**encoding).logits

    # A single text gives a batch of one: drop the batch axis, then normalise
    # over the class axis explicitly (dim=-1).
    probs = torch.softmax(logits.squeeze(0).cpu(), dim=-1).numpy()
    label = int(np.argmax(probs, axis=-1))

    return {
        'style': style_names[label],
        'probability': probs[label]
    }

In [19]:
# Try the classifier on two hand-written jokes and print each result dict.
sample_jokes = (
    " Why did the computer go to therapy? It had too many bytes of emotional baggage! But don't worry, it's learning to debug its feelings and finding inner peace in the 'delete' button!",
    " Why did the tomato turn red? Because it saw the salad dressing. I, on the other hand, turn red every time I try to tell a joke! ",
)
for joke in sample_jokes:
    print(get_prediction(joke))

Softmax(dim=0)
{'style': 'affiliative', 'probability': 0.95184726}
Softmax(dim=0)
{'style': 'Self-deprecating', 'probability': 0.9739666}


In [20]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

def get_predictions(test_data, test_labels, new_model, new_tokenizer):
    """Predict a label for every text and print a per-class classification report.

    Args:
        test_data: Iterable of input texts.
        test_labels: Ground-truth label ids, in the same order as ``test_data``.
        new_model: Sequence-classification model used for inference.
        new_tokenizer: Tokenizer used to encode each text for ``new_model``.

    Returns:
        list: Predicted label id for each text, in input order.
    """
    all_predictions = []

    # Set the model to evaluation mode
    new_model.eval()

    # Inference only: without no_grad every forward pass would also record an
    # autograd graph that is never used.
    with torch.no_grad():
        for text in test_data:
            encoding = new_tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
            encoding = {k: v.to(new_model.device) for k, v in encoding.items()}

            logits = new_model(**encoding).logits

            # One text per call: drop the batch axis, then normalise over the
            # class axis explicitly (dim=-1).
            probs = torch.softmax(logits.squeeze(0).cpu(), dim=-1).numpy()
            all_predictions.append(np.argmax(probs, axis=-1))

    # Precision, recall and F1 per class. Overall accuracy and the weighted
    # F1 average are already part of this report, so they are not computed
    # separately.
    report = classification_report(
        test_labels,
        all_predictions,
        target_names=["label_0", "label_1", "label_2", "label_3", "label_4"],
    )
    print(report)

    return all_predictions

# Score the reloaded model on the held-out validation split: its 'text' column
# supplies the inputs and its 'label' column the ground truth.
text_data, labels = valid_hg['text'], valid_hg['label']

predictions = get_predictions(text_data, labels, new_model, new_tokenizer)


              precision    recall  f1-score   support

     label_0       0.89      0.90      0.89        61
     label_1       0.90      0.78      0.83        45
     label_2       0.57      0.62      0.60        40
     label_3       0.73      0.72      0.73        50
     label_4       0.93      0.96      0.95        57

    accuracy                           0.81       253
   macro avg       0.80      0.80      0.80       253
weighted avg       0.82      0.81      0.82       253

