In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install transformers

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
import torch.optim as optim
import torch
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Load the dataset
dataset = pd.read_csv('../input/dataset/dataset.csv')

# Split the dataset into training, testing, and validation sets (80-10-10 split)
train_set, temp_set = train_test_split(dataset, test_size=0.2, random_state=42)
test_set, val_set = train_test_split(temp_set, test_size=0.5, random_state=42)

# Tokenization
tokenizer = BertTokenizer.from_pretrained("bert-large-cased")

def tokenize_data(data):
    return tokenizer(data['text'].tolist(), padding='max_length', truncation=True, max_length=256, return_tensors="pt")

train_tokens = tokenize_data(train_set)
test_tokens = tokenize_data(test_set)
val_tokens = tokenize_data(val_set)

# DataLoader
BATCH_SIZE = 8

train_dataset = TensorDataset(train_tokens['input_ids'], train_tokens['attention_mask'], torch.tensor(train_set['label'].values))
test_dataset = TensorDataset(test_tokens['input_ids'], test_tokens['attention_mask'], torch.tensor(test_set['label'].values))
val_dataset = TensorDataset(val_tokens['input_ids'], val_tokens['attention_mask'], torch.tensor(val_set['label'].values))

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# Load BERT Model
model = BertForSequenceClassification.from_pretrained("bert-large-cased", num_labels=2)  # Assuming binary classification

# Training Loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = optim.AdamW(model.parameters(), lr=2e-5)
EPOCHS = 3

for epoch in range(EPOCHS):
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        optimizer.zero_grad()
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    model.eval()
    val_loss = 0
    for batch in val_loader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        val_loss += outputs.loss.item()
    print(f"Epoch: {epoch+1}, Training Loss: {train_loss/len(train_loader)}, Validation Loss: {val_loss/len(val_loader)}")

# Evaluation
correct = 0
total = 0

model.eval()
for batch in test_loader:
    input_ids, attention_mask, labels = [b.to(device) for b in batch]
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    _, predicted = torch.max(outputs.logits, 1)
    total += labels.size(0)
    correct += (predicted == labels).sum().item()

print(f"Accuracy: {100 * correct / total}%")

# Try again

In [1]:
!pip install transformers
!pip install torch
!pip install tqdm



In [4]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load the dataset
dataset = pd.read_csv('../input/dataset/dataset.csv')
train, temp = train_test_split(dataset, test_size=0.2, random_state=42)
test, val = train_test_split(temp, test_size=0.5, random_state=42)

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-cased')

# Define a function to tokenize the datasets
def tokenize_data(texts, labels, max_length=64):
    input_data = tokenizer(texts, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')
    labels = torch.tensor(labels)
    return input_data['input_ids'], input_data['attention_mask'], labels

# Tokenize the datasets
train_input_ids, train_attention_mask, train_labels = tokenize_data(train['text'].tolist(), train['humor'].astype(int).tolist())
test_input_ids, test_attention_mask, test_labels = tokenize_data(test['text'].tolist(), test['humor'].astype(int).tolist())
val_input_ids, val_attention_mask, val_labels = tokenize_data(val['text'].tolist(), val['humor'].astype(int).tolist())

# Initialize the BERT model
model = BertForSequenceClassification.from_pretrained('bert-large-cased', num_labels=2)
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Create data loaders
batch_size = 32
train_data = TensorDataset(train_input_ids, train_attention_mask, train_labels)
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
val_data = TensorDataset(val_input_ids, val_attention_mask, val_labels)
val_loader = DataLoader(val_data, shuffle=False, batch_size=batch_size)

# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_loader) * 3
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training function
def train_model(model, train_loader, val_loader, optimizer, scheduler, num_epochs=3):
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        progress = tqdm(train_loader, desc=f"Epoch {epoch + 1}", position=0, leave=True)
        for batch in progress:
            batch = tuple(t.to('cuda' if torch.cuda.is_available() else 'cpu') for t in batch)
            input_ids, attention_mask, labels = batch
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs[0]
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            scheduler.step()
            progress.set_postfix({'training_loss': '{:.3f}'.format(loss.item() / len(batch))})
        avg_train_loss = total_loss / len(train_loader)
        print(f"Average training loss: {avg_train_loss}")

train_model(model, train_loader, val_loader, optimizer, scheduler)

# Evaluation function
def evaluate_model(model, test_loader):
    model.eval()
    predictions, true_labels = [], []
    for batch in test_loader:
        batch = tuple(t.to('cuda' if torch.cuda.is_available() else 'cpu') for t in batch)
        input_ids, attention_mask, labels = batch
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs[0]
            predictions.extend(logits.argmax(dim=1).tolist())
            true_labels.extend(labels.tolist())
    return predictions, true_labels

# Evaluate and print report
test_data = TensorDataset(test_input_ids, test_attention_mask, test_labels)
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
predictions, true_labels = evaluate_model(model, test_loader)
print(classification_report(true_labels, predictions, target_names=["Not Humor", "Humor"]))

Downloading vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 5000/5000 [55:09<00:00,  1.51it/s, training_loss=0.004]


Average training loss: 0.05569313136705023


Epoch 2: 100%|██████████| 5000/5000 [55:06<00:00,  1.51it/s, training_loss=0.001]


Average training loss: 0.015078376171481432


Epoch 3: 100%|██████████| 5000/5000 [55:06<00:00,  1.51it/s, training_loss=0.000]


Average training loss: 0.0031810589800723393
              precision    recall  f1-score   support

   Not Humor       0.99      0.99      0.99     10022
       Humor       0.99      0.99      0.99      9978

    accuracy                           0.99     20000
   macro avg       0.99      0.99      0.99     20000
weighted avg       0.99      0.99      0.99     20000



In [5]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Assuming predictions and true_labels are available from the evaluate_model function

# Calculate metrics using sklearn
accuracy = accuracy_score(true_labels, predictions)
precision = precision_score(true_labels, predictions)
recall = recall_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)

# Print the metrics
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

Accuracy: 0.99045
Precision: 0.9903797975749074
Recall: 0.990479053918621
F1-score: 0.990429423260009
