In [14]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns

from transformers import AutoTokenizer, pipeline
from sklearn.preprocessing import OneHotEncoder
from datasets import load_dataset

In [15]:
# Load the raw training CSV into a DataFrame.
# NOTE(review): `train` is not referenced by any later cell visible here, and the
# next cell reads the same file again into `df` — confirm this cell is still needed.
train = pd.read_csv('data/train.csv')

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the full labelled dataset and name the column used for stratification.
df = pd.read_csv('data/train.csv')
label_column = 'discourse_effectiveness'  # Replace with your actual label column name

# Show the class balance before splitting, as a baseline for the checks below.
print("Original Dataset Class Distribution:")
print(df[label_column].value_counts(normalize=True))

# Stratified 80/10/10 split: hold out 20% of the rows, then cut that holdout in half.
# NOTE(review): rows are split individually; if several rows share an essay_id,
# the same essay may land in more than one split — confirm this is acceptable.
train_df, temp_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df[label_column]
)
validation_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42, stratify=temp_df[label_column]
)

# One row per split: (report heading, output file, frame), in reporting order.
split_table = [
    ("\nTraining Set Class Distribution:", 'train.csv', train_df),
    ("\nValidation Set Class Distribution:", 'validation.csv', validation_df),
    ("\nTest Set Class Distribution:", 'test.csv', test_df),
]

# Write every split to disk first (without the pandas index column) ...
for _heading, csv_path, split_df in split_table:
    split_df.to_csv(csv_path, index=False)

# ... then print each split's class proportions to verify the stratification.
for heading, _csv_path, split_df in split_table:
    print(heading)
    print(split_df[label_column].value_counts(normalize=True))

print("\nData successfully split and saved to CSV files.")


Original Dataset Class Distribution:
discourse_effectiveness
Adequate       0.570570
Effective      0.253665
Ineffective    0.175765
Name: proportion, dtype: float64

Training Set Class Distribution:
discourse_effectiveness
Adequate       0.570549
Effective      0.253672
Ineffective    0.175779
Name: proportion, dtype: float64

Validation Set Class Distribution:
discourse_effectiveness
Adequate       0.570729
Effective      0.253536
Ineffective    0.175734
Name: proportion, dtype: float64

Test Set Class Distribution:
discourse_effectiveness
Adequate       0.570574
Effective      0.253739
Ineffective    0.175687
Name: proportion, dtype: float64

Data successfully split and saved to CSV files.


In [17]:
# Map each split name to its CSV file in the working directory, then load all
# three through the 'csv' loader in a single call.
split_files = {
    'train': 'train.csv',
    'validation': 'validation.csv',
    'test': 'test.csv',
}
dataset = load_dataset('csv', data_files=split_files)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [18]:
# Integer class id for each effectiveness rating.
label_mapping = dict(Effective=0, Adequate=1, Ineffective=2)


def encode_labels(example):
    """Attach an integer ``labels`` entry to one example.

    Reads ``example['discourse_effectiveness']``, looks it up in
    ``label_mapping`` and stores the result under ``example['labels']``.
    The same (mutated) example object is returned.

    Raises:
        KeyError: if the rating is not a key of ``label_mapping``.
    """
    rating = example['discourse_effectiveness']
    example['labels'] = label_mapping[rating]
    return example

# Apply encode_labels to the loaded splits so each row gains an integer `labels` field.
dataset = dataset.map(encode_labels)

Map:   0%|          | 0/29412 [00:00<?, ? examples/s]

Map:   0%|          | 0/3676 [00:00<?, ? examples/s]

Map:   0%|          | 0/3677 [00:00<?, ? examples/s]

In [19]:
# Checkpoint name shared by the tokenizer loaded here and the model loaded in a later cell.
model_name = "bert-base-uncased"
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [20]:
def tokenize_function(example):
    """Tokenize the ``discourse_text`` field with the module-level ``tokenizer``.

    Every sequence is padded to, and truncated at, 128 tokens, so all encoded
    examples come back with the same length.

    Args:
        example: mapping with a ``'discourse_text'`` entry (a batch of texts
            when called through ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = example['discourse_text']
    encoding_options = dict(padding='max_length', truncation=True, max_length=128)
    return tokenizer(texts, **encoding_options)

# batched=True hands tokenize_function a batch of rows per call instead of a single row.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/29412 [00:00<?, ? examples/s]

Map:   0%|          | 0/3676 [00:00<?, ? examples/s]

Map:   0%|          | 0/3677 [00:00<?, ? examples/s]

In [21]:
# Columns dropped before training: the identifiers, the raw text and the
# string label.
# NOTE(review): this rebinds `tokenized_datasets`, so re-running the cell asks
# remove_columns to drop columns that are already gone — likely an error; confirm.
unused_columns = ['discourse_id', 'essay_id', 'discourse_text', 'discourse_effectiveness']
tokenized_datasets = tokenized_datasets.remove_columns(unused_columns)

# Switch the output format to 'torch' for the PyTorch code below.
tokenized_datasets.set_format('torch')

In [22]:
from torch.utils.data import DataLoader

# Batched iterators over the train and validation splits (16 examples per
# batch); only the training loader shuffles.
# NOTE(review): the Trainer configured later receives the datasets directly and
# never these loaders in the cells visible here — confirm they are still used.
train_split = tokenized_datasets['train']
validation_split = tokenized_datasets['validation']

train_dataloader = DataLoader(train_split, batch_size=16, shuffle=True)
eval_dataloader = DataLoader(validation_split, batch_size=16)

In [23]:
from transformers import AutoModelForSequenceClassification

# Derive the class count and the id <-> name tables from label_mapping so the
# classification head and the label encoding used for the dataset cannot drift
# apart (classes: Effective, Adequate, Ineffective).
num_labels = len(label_mapping)
id2label = {class_id: class_name for class_name, class_id in label_mapping.items()}

# Passing id2label / label2id stores the human-readable class names in the
# model config, so predictions can be reported by name instead of by bare index.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=dict(label_mapping),
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
from transformers import Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Turn an evaluation result into accuracy / precision / recall / F1.

    Args:
        eval_pred: a pair ``(logits, labels)``; the predicted class of each
            row is the argmax of ``logits`` over the last axis.

    Returns:
        dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'`` and
        ``'f1'``. The last three use ``average='weighted'``.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    scores = {'accuracy': accuracy_score(labels, predicted_classes)}

    # The call returns (precision, recall, f1, support); zip stops after the
    # three named entries, so the trailing support value is ignored.
    weighted = precision_recall_fscore_support(labels, predicted_classes, average='weighted')
    scores.update(zip(('precision', 'recall', 'f1'), weighted))
    return scores

# Fine-tuning configuration, grouped by concern; every value is passed by keyword.
training_args = TrainingArguments(
    # Output directory handed to the Trainer.
    output_dir='./results',
    # Schedule: three epochs, with an evaluation pass at the end of each one.
    # NOTE(review): newer transformers releases rename `evaluation_strategy` to
    # `eval_strategy` — confirm against the installed version before upgrading.
    num_train_epochs=3,
    evaluation_strategy="epoch",
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Optimizer settings.
    learning_rate=2e-5,
    weight_decay=0.01,
)

# Bundle the model, the hyperparameters and the train/validation splits into a
# Trainer; compute_metrics is supplied so each evaluation also reports
# accuracy, precision, recall and F1.
# NOTE(review): no tokenizer or data collator is passed here — presumably this
# works because every example was already padded to a fixed length during
# tokenization; confirm before switching to dynamic padding.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics,
)



In [25]:
# Run fine-tuning with the Trainer configured in the previous cell.
# NOTE(review): in the recorded run below, training loss keeps falling while
# validation loss rises from ~0.751 (epoch 2) to ~0.824 (epoch 3), which suggests
# overfitting in the last epoch — consider keeping the best checkpoint instead.
trainer.train()

  0%|          | 0/5517 [00:00<?, ?it/s]

{'loss': 0.8299, 'grad_norm': 5.196715831756592, 'learning_rate': 1.818742069965561e-05, 'epoch': 0.27}
{'loss': 0.7755, 'grad_norm': 3.8216562271118164, 'learning_rate': 1.637484139931122e-05, 'epoch': 0.54}
{'loss': 0.7617, 'grad_norm': 3.496605157852173, 'learning_rate': 1.456226209896683e-05, 'epoch': 0.82}


  0%|          | 0/230 [00:00<?, ?it/s]

{'eval_loss': 0.7599582076072693, 'eval_accuracy': 0.6556039173014145, 'eval_precision': 0.6693911797503348, 'eval_recall': 0.6556039173014145, 'eval_f1': 0.608390573423682, 'eval_runtime': 33.7357, 'eval_samples_per_second': 108.965, 'eval_steps_per_second': 6.818, 'epoch': 1.0}
{'loss': 0.7262, 'grad_norm': 4.870213985443115, 'learning_rate': 1.2749682798622441e-05, 'epoch': 1.09}
{'loss': 0.6588, 'grad_norm': 8.46845817565918, 'learning_rate': 1.093710349827805e-05, 'epoch': 1.36}
{'loss': 0.6571, 'grad_norm': 3.6206326484680176, 'learning_rate': 9.12452419793366e-06, 'epoch': 1.63}
{'loss': 0.6469, 'grad_norm': 7.938916206359863, 'learning_rate': 7.311944897589271e-06, 'epoch': 1.9}


  0%|          | 0/230 [00:00<?, ?it/s]

{'eval_loss': 0.7511910796165466, 'eval_accuracy': 0.6730141458106638, 'eval_precision': 0.6687811801738767, 'eval_recall': 0.6730141458106638, 'eval_f1': 0.6385681376738567, 'eval_runtime': 33.5462, 'eval_samples_per_second': 109.58, 'eval_steps_per_second': 6.856, 'epoch': 2.0}
{'loss': 0.543, 'grad_norm': 9.194948196411133, 'learning_rate': 5.4993655972448805e-06, 'epoch': 2.18}
{'loss': 0.5194, 'grad_norm': 8.439209938049316, 'learning_rate': 3.68678629690049e-06, 'epoch': 2.45}
{'loss': 0.5044, 'grad_norm': 7.556225299835205, 'learning_rate': 1.8742069965560993e-06, 'epoch': 2.72}
{'loss': 0.5056, 'grad_norm': 9.115159034729004, 'learning_rate': 6.162769621170926e-08, 'epoch': 2.99}


  0%|          | 0/230 [00:00<?, ?it/s]

{'eval_loss': 0.8244093060493469, 'eval_accuracy': 0.6667573449401524, 'eval_precision': 0.6595218811539684, 'eval_recall': 0.6667573449401524, 'eval_f1': 0.6485072665278387, 'eval_runtime': 33.5719, 'eval_samples_per_second': 109.496, 'eval_steps_per_second': 6.851, 'epoch': 3.0}
{'train_runtime': 2738.664, 'train_samples_per_second': 32.219, 'train_steps_per_second': 2.014, 'train_loss': 0.647691592198687, 'epoch': 3.0}


TrainOutput(global_step=5517, training_loss=0.647691592198687, metrics={'train_runtime': 2738.664, 'train_samples_per_second': 32.219, 'train_steps_per_second': 2.014, 'total_flos': 5804018881661952.0, 'train_loss': 0.647691592198687, 'epoch': 3.0})

In [26]:
# Score the fine-tuned model on the held-out test split; the metrics come from
# the compute_metrics function given to the Trainer.
# Note that the keys in the printed output carry an `eval_` prefix even though
# this is the test split.
results = trainer.evaluate(tokenized_datasets['test'])
print(results)

  0%|          | 0/230 [00:00<?, ?it/s]

{'eval_loss': 0.794953465461731, 'eval_accuracy': 0.6766385640467772, 'eval_precision': 0.6695896145247198, 'eval_recall': 0.6766385640467772, 'eval_f1': 0.6595828867045886, 'eval_runtime': 33.6244, 'eval_samples_per_second': 109.355, 'eval_steps_per_second': 6.84, 'epoch': 3.0}
