In [4]:
import pandas as pd
from datasets import Dataset


In [5]:
df = pd.read_csv('aug.csv')


In [6]:


# Convert pandas DataFrame to Hugging Face Dataset object
dataset = Dataset.from_pandas(df)


In [7]:
dataset[0]

{'Unnamed: 0': 0, 'text': 'Hardwork and devotion', 'label': 1}

In [8]:
# Split the dataset: 80% for training, 20% for testing
train_test_split = dataset.train_test_split(test_size=0.2)

# Rename for clarity
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

Tokenizer


In [9]:
from transformers import AutoTokenizer

model = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model)

# Create a function to tokenize the text
def tokenize_function(df):
    return tokenizer(df["text"], padding="max_length", truncation=True)

# Apply the function to your datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=['text'])
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True, remove_columns=['text'])

Map: 100%|██████████| 1171/1171 [00:00<00:00, 4643.96 examples/s]
Map: 100%|██████████| 293/293 [00:00<00:00, 4805.05 examples/s]


Fine Tuning begins

In [10]:
from transformers import AutoModelForSequenceClassification

id2label= {
    0 : 'extrinsic',
    1 : 'intrinsic'
    }
label2id={
    'extrinsic' : 0,
    'intrinsic' : 1
}

model1 = AutoModelForSequenceClassification.from_pretrained(
    model,
    num_labels = 2,
    id2label = id2label,
    label2id = label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Defining parameters

In [11]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average = 'weighted')
    
    return{'accuracy': acc, "f1": f1}

#Training arguments
training_args = TrainingArguments(
    output_dir="./motivation_model",      # Where to save the model
    evaluation_strategy="epoch",          # Evaluate at the end of each epoch
    save_strategy="epoch",                # Save at the end of each epoch
    num_train_epochs=3,                   # 3 epochs is usually a good start
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,          # Load the best model (by F1/accuracy)
    metric_for_best_model="f1",
)
    
tokenized_train_dataset = tokenized_train_dataset.rename_column("label", "labels")
tokenized_test_dataset = tokenized_test_dataset.rename_column("label", "labels")    
#trainer
trainer = Trainer(
    model = model1,
    args = training_args,
    train_dataset =tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer = tokenizer,
    compute_metrics=compute_metrics,
)    
    
    




  trainer = Trainer(


In [12]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.105706,0.969283,0.969339
2,No log,0.026333,0.993174,0.993169
3,No log,0.018986,0.996587,0.996586


TrainOutput(global_step=222, training_loss=0.1122857428885795, metrics={'train_runtime': 177.9615, 'train_samples_per_second': 19.74, 'train_steps_per_second': 1.247, 'total_flos': 465357971478528.0, 'train_loss': 0.1122857428885795, 'epoch': 3.0})

In [13]:
results = trainer.evaluate()
print(results)

{'eval_loss': 0.018985819071531296, 'eval_accuracy': 0.9965870307167235, 'eval_f1': 0.996585825416149, 'eval_runtime': 5.1839, 'eval_samples_per_second': 56.521, 'eval_steps_per_second': 3.665, 'epoch': 3.0}


In [14]:
trainer.save_model("model")