# Imports

In [1]:
import os

import warnings

from IPython.display import HTML

## General imports 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string

## Torch library
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

## Sklearn ilbrary
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

## Transformers library
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import AdamW, get_scheduler


import re
from collections import Counter

from tqdm.auto import tqdm


2024-07-04 09:39:42.912464: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-04 09:39:42.912582: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-04 09:39:43.044339: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Loading the data

df = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
sample_sub = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


# Data Cleaning 

Now we need to remove certain text parts. Those parts does not contribute to the overall score and also might confuse the text

 - Removing urls
 - Removing HTML tags
 - Removing Emojis
 - Removing punctuations

In [3]:
####### Removing urls
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

####### Removing HTML tags
def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)

####### Removing Emojis
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)


####### Removing punctuations
def remove_punct(text):
    table=str.maketrans('','',string.punctuation)
    return text.translate(table)

In [4]:
##### Performing the function will built on all the text using lambda principal #####
df['text']=df['text'].apply(lambda x : remove_URL(x))
df['text']=df['text'].apply(lambda x : remove_html(x))
df['text']=df['text'].apply(lambda x: remove_emoji(x))
df['text']=df['text'].apply(lambda x : remove_punct(x))

test['text']=test['text'].apply(lambda x : remove_URL(x))
test['text']=test['text'].apply(lambda x : remove_html(x))
test['text']=test['text'].apply(lambda x: remove_emoji(x))
test['text']=test['text'].apply(lambda x : remove_punct(x))

# Modeling - Architcture 

In [5]:
# Load model directly

tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")



config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# 1. Tokenize
The first step after the preprocessing is to tokenize the data. Tokenization is the process where words are converted into numbers and ids. Each word or part of a word (depend on the tokenizer) are converted into numbers that are readable for the machine

Later we splitting the data into train and validation sets 

In [6]:
# Tokenize the train dataset and add target column 
def tokenize_train(df):
    
    tokenized_texts = []
    
    # For each text & label we tokenize the text and add the label target into the dictonary
    for text, label in zip(df["text"], df["target"]):
        tokenized_text = tokenizer(text, truncation=True)
        tokenized_text["labels"] = label
        tokenized_texts.append(tokenized_text)
        
    return tokenized_texts

# Tokenize the test dataset 
def tokenize_test(test):
    tokenized_texts = []
    for text in test["text"]:
          tokenized_texts.append(tokenizer(text, truncation=True))
    return tokenized_texts
            
tokenized_texts = tokenize_train(df)
test_texts = tokenize_test(test)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [7]:
# Split the whole text set into train and validation
train_texts, val_texts = train_test_split(tokenized_texts, test_size=0.2, random_state=42)  # Set a seed for reproducibility

print(len(train_texts))
print(len(val_texts))

6090
1523


# 2. Split to batches 

In [8]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Set train texts DataLoader
train_dataloader = DataLoader(
    train_texts, shuffle=True, batch_size=8, collate_fn=data_collator
)

## Set val texts DataLoader
eval_dataloader = DataLoader(
    val_texts, batch_size=8, collate_fn=data_collator
)

## Set test texts DataLoader
test_dataloader = DataLoader(
    test_texts, batch_size=8, collate_fn=data_collator
)

In [9]:
## Small test to see that everything went well with the Dataloader method
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([8, 37]),
 'attention_mask': torch.Size([8, 37]),
 'labels': torch.Size([8])}

# 3. Train the model

In [10]:
# Create optimizer
optimizer = AdamW(model.parameters(), lr=3e-5)

# Move to GPU to train
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Set epochs numbers, number of training steps and learning rate
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

progress_bar = tqdm(range(num_training_steps))

# Train the model
model.train()
for epoch in range(num_epochs):
    # In each batch of the train_dataloader compute 
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)



  0%|          | 0/2286 [00:00<?, ?it/s]

# Evaluate the model using the validation set
use metrics from evaluate library in order to give an estimation 

In [11]:
!pip install evaluate
import evaluate

metric = evaluate.load("glue", "mrpc")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.2


Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

{'accuracy': 0.8240315167432699, 'f1': 0.7947932618683}

# Making the predictions on the test set

In [12]:
model.eval()
    
predictions = []  # List to store predictions

for batch in test_dataloader:  
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=-1).cpu().tolist())  # Append predictions to list

In [13]:
# Insert the predictions into the submission file
sample_sub["target"] = predictions
sample_sub.to_csv('submission.csv',index=False)