In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
from datasets import Dataset
import evaluate

In [None]:
# Read the full response table from disk.
data = pd.read_csv("master.csv")

# "Binary" is trained on every row, so it is simply an independent copy of the table.
Binary = data.copy()

# The remaining frames each keep the rows whose "Question" number falls in an
# inclusive range: 1-3 -> Emotion, 4-6 -> Social, 7-9 -> Motivation.
Emotion = data[data["Question"].between(1, 3)].copy()
Social = data[data["Question"].between(4, 6)].copy()
Motivation = data[data["Question"].between(7, 9)].copy()

In [None]:
class BinaryTextClassifier:
    """Fine-tune a pretrained transformer as a two-class ("Low" / "High") text classifier.

    Constructing an instance performs all data preparation: the input frame is
    split into train / validation / test parts, each part is written to
    ``./models/<model_name>/`` as CSV, re-shaped into ``label`` / ``text``
    columns, and tokenized. Call :meth:`train_model` afterwards to fine-tune.
    """

    # Single source of truth for the label encoding, shared by the model
    # configuration and by prepare_data.
    LABEL2ID = {"Low": 0, "High": 1}
    ID2LABEL = {0: "Low", 1: "High"}

    def __init__(self, model_name, dataframe, base_model='distilbert-base-uncased', train_ratio=0.7, validation_ratio=0.15, test_ratio=0.15, seed=42):
        """Load the base model and prepare the three dataset splits.

        Args:
            model_name: Name used for the output folder ``./models/<model_name>``.
            dataframe: pandas DataFrame with "Level", "Response" and "Question"
                columns (the columns prepare_data reads and drops).
            base_model: Checkpoint name passed to the ``from_pretrained`` loaders.
            train_ratio: Fraction of rows used for training.
            validation_ratio: Fraction of rows used for validation.
            test_ratio: Fraction of rows held out for testing.
            seed: Random state used for both splits.

        Raises:
            ValueError: If the three ratios do not sum to 1.
        """
        self.model_name = model_name
        self.model_folder = f"./models/{model_name}"
        # exist_ok makes this a no-op on re-runs and avoids a check-then-create race.
        os.makedirs(self.model_folder, exist_ok=True)

        self.tokenizer = AutoTokenizer.from_pretrained(base_model)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            base_model,
            num_labels=2,
            id2label=dict(self.ID2LABEL),
            label2id=dict(self.LABEL2ID),
        )

        self.train_df, self.validation_df, self.test_df = self.train_validation_test_split(
            dataframe, train_ratio, validation_ratio, test_ratio, seed
        )

        # Persist the raw (un-encoded) splits so the exact partition can be inspected later.
        self.train_df.to_csv(self.model_folder + "/train_data.csv")
        self.validation_df.to_csv(self.model_folder + "/validation_data.csv")
        self.test_df.to_csv(self.model_folder + "/test_data.csv")

        self.train_df = self.prepare_data(self.train_df)
        self.validation_df = self.prepare_data(self.validation_df)
        self.test_df = self.prepare_data(self.test_df)

        self.train, self.validation, self.test = self.pre_process_data(self.train_df, self.validation_df, self.test_df)

    def prepare_data(self, df):
        """Return a new frame with ``label`` / ``text`` columns in place of the raw ones.

        "Level" is encoded with LABEL2ID ("Low" -> 0, "High" -> 1); any other
        value is passed through unchanged. "Response" is copied to ``text``.
        The "Level", "Question" and "Response" columns are then dropped.

        The input frame is not modified.

        Raises:
            KeyError: If any of the three source columns is missing.
        """
        label2id = self.LABEL2ID
        # An explicit map (rather than Series.replace) avoids relying on pandas'
        # deprecated implicit object -> int downcasting; unknown values fall through as-is.
        encoded = df['Level'].map(lambda level: label2id.get(level, level))

        # assign() builds a new frame, so the caller's frame (often a slice of a
        # larger one) is never written to.
        prepared = df.assign(label=encoded, text=df['Response'])

        # Drop the "pretty" source columns now that the model-facing ones exist.
        return prepared.drop(columns=['Level', 'Question', 'Response'])

    def train_validation_test_split(self, df, train_ratio, validation_ratio, test_ratio, random_seed):
        """Split ``df`` into train, validation and test frames.

        The split is done in two stages with the same ``random_seed``: first
        train vs. (validation + test), then validation vs. test.

        Returns:
            Tuple ``(train_df, validation_df, test_df)``.

        Raises:
            ValueError: If the three ratios do not sum to 1 (within a small
                floating-point tolerance).
        """
        # Compare with a tolerance: sums such as 0.7 + 0.2 + 0.1 are not exactly 1.0
        # in binary floating point and would be rejected by a strict != check.
        total_ratio = train_ratio + validation_ratio + test_ratio
        if not np.isclose(total_ratio, 1.0, rtol=0.0, atol=1e-9):
            raise ValueError("Ratios must sum up to 1.")

        # Share of the data that does not go to training.
        holdout_ratio = validation_ratio + test_ratio
        # Share of the held-out part that becomes the test set.
        intermediate_test_ratio = test_ratio / holdout_ratio

        # Stage 1: train set vs. temporary (validation + test) set.
        train_df, temp_df = train_test_split(df, test_size=holdout_ratio, random_state=random_seed)

        # Stage 2: split the temporary set into validation and test sets.
        validation_df, test_df = train_test_split(temp_df, test_size=intermediate_test_ratio, random_state=random_seed)
        return train_df, validation_df, test_df

    def pre_process_data(self, train_df, validation_df, test_df):
        """Convert the three prepared frames into tokenized, torch-formatted datasets.

        Each frame is wrapped in a ``Dataset``, its ``text`` column is
        tokenized, and the output format is restricted to ``input_ids``,
        ``attention_mask`` and ``label`` as torch tensors.

        Returns:
            Tuple ``(train_dataset, val_dataset, test_dataset)``.
        """
        def tokenize_function(batch):
            # truncation=True clips over-long responses to the tokenizer's maximum
            # length; without it a text longer than the model's limit (512 tokens
            # for the default DistilBERT checkpoint) fails inside the model.
            return self.tokenizer(batch["text"], truncation=True)

        tensor_columns = ["input_ids", "attention_mask", "label"]
        prepared = []
        for frame in (train_df, validation_df, test_df):
            # batched=True hands the tokenizer lists of texts instead of one row at a time.
            dataset = Dataset.from_pandas(frame).map(tokenize_function, batched=True)
            dataset.set_format("torch", columns=tensor_columns)
            prepared.append(dataset)

        train_dataset, val_dataset, test_dataset = prepared
        return train_dataset, val_dataset, test_dataset

    def train_model(self):
        """Fine-tune ``self.model`` on the training split, evaluating accuracy each epoch.

        Checkpoints are written to ``self.model_folder`` at the end of every
        epoch. The test split is not used here.
        """
        # Pads each batch dynamically to its longest sequence.
        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)
        accuracy = evaluate.load("accuracy")

        def compute_metrics(eval_pred):
            # eval_pred unpacks to (logits, labels); argmax over axis 1 picks the predicted class.
            predictions, labels = eval_pred
            predictions = np.argmax(predictions, axis=1)
            return accuracy.compute(predictions=predictions, references=labels)

        # NOTE(review): newer transformers releases rename `evaluation_strategy`
        # to `eval_strategy` (and Trainer's `tokenizer` to `processing_class`);
        # confirm against the pinned transformers version before upgrading.
        training_args = TrainingArguments(
            output_dir=self.model_folder,
            learning_rate=2e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=2,
            weight_decay=0.01,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=False,
            push_to_hub=False,
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.train,
            eval_dataset=self.validation,
            tokenizer=self.tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        trainer.train()

In [None]:
# Build one classifier per dataset; every constructor runs before any training starts.
Binary_classifier, Emotion_classifier, Social_classifier, Motivation_classifier = [
    BinaryTextClassifier(label, frame)
    for label, frame in (
        ("Binary", Binary),
        ("Emotion", Emotion),
        ("Social", Social),
        ("Motivation", Motivation),
    )
]

# Fine-tune the models one after another, in the same order they were created.
for classifier in (Binary_classifier, Emotion_classifier, Social_classifier, Motivation_classifier):
    classifier.train_model()