In [6]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [7]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW, get_linear_schedule_with_warmup
from sklearn.preprocessing import LabelEncoder
import tqdm
import plotly.express as px
import plotly.graph_objects as go

In [25]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [9]:
train_df = pd.read_csv('train.csv',on_bad_lines='skip', delimiter='\t')
test_df = pd.read_csv('test.csv',on_bad_lines='skip', delimiter='\t')
validation_df = pd.read_csv('dev.csv',on_bad_lines='skip', delimiter='\t')

In [11]:
fig = go.Figure()

groupby_label = train_df.groupby('label')['label'].count()

fig.add_trace(go.Bar(
    x=list(sorted(groupby_label.index)),
    y=groupby_label.tolist(),
    text=groupby_label.tolist(),
    textposition='auto'
))

fig.update_layout(
    xaxis_title_text='Label',
    yaxis_title_text='Frequency',
    bargap=0.2,
    bargroupgap=0.2)

fig.show()

In [12]:
fig = go.Figure()

groupby_label = test_df.groupby('label')['label'].count()

fig.add_trace(go.Bar(
    x=list(sorted(groupby_label.index)),
    y=groupby_label.tolist(),
    text=groupby_label.tolist(),
    textposition='auto'
))

fig.update_layout(
    xaxis_title_text='Label',
    yaxis_title_text='Frequency',
    bargap=0.2,
    bargroupgap=0.2)

fig.show()

In [13]:
fig = go.Figure()

groupby_label = validation_df.groupby('label')['label'].count()

fig.add_trace(go.Bar(
    x=list(sorted(groupby_label.index)),
    y=groupby_label.tolist(),
    text=groupby_label.tolist(),
    textposition='auto'
))

fig.update_layout(
    xaxis_title_text='Label',
    yaxis_title_text='Frequency',
    bargap=0.2,
    bargroupgap=0.2)

fig.show()

In [14]:
model_name = "HooshvareLab/bert-fa-base-uncased-sentiment-snappfood"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer =AutoTokenizer.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/621 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/651M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [17]:
train_df_subset = train_df.sample(n=500, random_state=42)
test_df_subset = test_df.sample(n=500, random_state=42)
val_df_subset = validation_df.sample(n=500, random_state=42)

In [21]:
le = LabelEncoder()
train_labels = le.fit_transform(train_df_subset['label'])
test_labels = le.transform(test_df_subset['label'])
val_labels = le.transform(val_df_subset['label'])

train_encodings = tokenizer(train_df_subset['comment'].values.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(test_df_subset['comment'].values.tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_df_subset['comment'].values.tolist(), truncation=True, padding=True)

In [22]:
train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']),
                              torch.tensor(train_encodings['attention_mask']),
                              torch.tensor(train_labels))
test_dataset = TensorDataset(torch.tensor(test_encodings['input_ids']),
                             torch.tensor(test_encodings['attention_mask']),
                             torch.tensor(test_labels))
val_dataset = TensorDataset(torch.tensor(val_encodings['input_ids']),
                             torch.tensor(val_encodings['attention_mask']),
                             torch.tensor(val_labels))
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=4)
val_dataloader = DataLoader(val_dataset, shuffle=True, batch_size=4)
test_dataloader = DataLoader(test_dataset, batch_size=4)

In [23]:
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 5
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)





In [27]:
from sklearn.metrics import f1_score

In [28]:
train_losses = []
val_losses = []
train_f1_scores = []
val_f1_scores = []

for epoch in range(epochs):
    model.train()
    epoch_train_losses = []
    epoch_val_losses = []
    epoch_train_preds = []
    epoch_train_labels = []
    epoch_val_preds = []
    epoch_val_labels = []
    
    for step, (train_batch, val_batch) in tqdm.tqdm(enumerate(zip(train_dataloader, val_dataloader))):
        # Train step
        model.train()
        train_batch = tuple(t.to(device) for t in train_batch)
        train_inputs = {'input_ids': train_batch[0], 'attention_mask': train_batch[1], 'labels': train_batch[2]}
        
        model.zero_grad()
        train_outputs = model(**train_inputs)
        train_loss = train_outputs.loss
        epoch_train_losses.append(train_loss.item())
        
        train_logits = train_outputs.logits.detach().cpu().numpy()
        train_preds = np.argmax(train_logits, axis=1)
        train_labels = train_batch[2].detach().cpu().numpy()
        epoch_train_preds.extend(train_preds)
        epoch_train_labels.extend(train_labels)
        
        train_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        
        # Validation step
        model.eval()
        val_batch = tuple(t.to(device) for t in val_batch)
        val_inputs = {'input_ids': val_batch[0], 'attention_mask': val_batch[1], 'labels': val_batch[2]}
        
        with torch.no_grad():
            val_outputs = model(**val_inputs)
            val_loss = val_outputs.loss
            epoch_val_losses.append(val_loss.item())
            
            val_logits = val_outputs.logits.detach().cpu().numpy()
            val_preds = np.argmax(val_logits, axis=1)
            val_labels = val_batch[2].detach().cpu().numpy()
            epoch_val_preds.extend(val_preds)
            epoch_val_labels.extend(val_labels)
    
    # Calculate metrics for the epoch
    epoch_train_loss = np.mean(epoch_train_losses)
    epoch_val_loss = np.mean(epoch_val_losses)
    epoch_train_f1 = f1_score(epoch_train_labels, epoch_train_preds)
    epoch_val_f1 = f1_score(epoch_val_labels, epoch_val_preds)
    
    train_losses.append(epoch_train_loss)
    val_losses.append(epoch_val_loss)
    train_f1_scores.append(epoch_train_f1)
    val_f1_scores.append(epoch_val_f1)
    
    print(f'Epoch {epoch + 1}:')
    print(f'Train Loss: {epoch_train_loss:.5f} | Val Loss: {epoch_val_loss:.5f}')
    print(f'Train F1-Score: {epoch_train_f1:.5f} | Val F1-Score: {epoch_val_f1:.5f}')
    print('-' * 50)

# Plot loss and accuracy during training
iterations = np.arange(1, len(train_losses) + 1)
fig = go.Figure()
fig.add_trace(go.Scatter(x=iterations, y=train_losses, mode='lines+markers', name='Train Loss'))
fig.add_trace(go.Scatter(x=iterations, y=val_losses, mode='lines+markers', name='Val Loss'))
fig.update_layout(title='Training Progress', xaxis_title='Iterations', yaxis_title='Loss')
fig.show()

fig = go.Figure()
fig.add_trace(go.Scatter(x=iterations, y=train_f1_scores, mode='lines+markers', name='Train F1-Score'))
fig.add_trace(go.Scatter(x=iterations, y=val_f1_scores, mode='lines+markers', name='Val F1-Score'))
fig.update_layout(title='Training Progress', xaxis_title='Iterations', yaxis_title='F1-Score')
fig.show()

125it [18:30,  8.89s/it]


Epoch 1:
Train Loss: 0.08438 | Val Loss: 0.71081
Train F1-Score: 0.98491 | Val F1-Score: 0.86667
--------------------------------------------------


125it [17:55,  8.60s/it]


Epoch 2:
Train Loss: 0.02620 | Val Loss: 0.82837
Train F1-Score: 0.99623 | Val F1-Score: 0.86550
--------------------------------------------------


125it [17:53,  8.59s/it]


Epoch 3:
Train Loss: 0.01394 | Val Loss: 0.86396
Train F1-Score: 0.99811 | Val F1-Score: 0.87179
--------------------------------------------------


125it [17:52,  8.58s/it]


Epoch 4:
Train Loss: 0.01161 | Val Loss: 0.89058
Train F1-Score: 0.99811 | Val F1-Score: 0.86282
--------------------------------------------------


125it [17:56,  8.61s/it]

Epoch 5:
Train Loss: 0.00717 | Val Loss: 0.89350
Train F1-Score: 0.99811 | Val F1-Score: 0.86508
--------------------------------------------------
Epoch 5:
Train Loss: 0.00717 | Val Loss: 0.89350
Train F1-Score: 0.99811 | Val F1-Score: 0.86508
--------------------------------------------------






In [None]:
model.eval()
test_predictions = []
for batch in test_dataloader:
    # Move the batch to the device
    batch = tuple(t.to(device) for t in batch)
    inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}

    # Make predictions for this batch
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1)
        batch_predictions = torch.argmax(probabilities, dim=1)
        test_predictions.extend(batch_predictions.tolist())

# Print the predicted labels for the first 10 examples in the test set
test_df_subset['predicted_label'] = le.inverse_transform(test_predictions)
print(test_df_subset[['comment', 'label', 'predicted_label']].head(10))