# Text Classification With Argument Level Textual Features

### Importing the required libraries

In [None]:
import pandas as pd
import spacy
import torch
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

### Reading the dataset

In [None]:
# Load the compiled corpus; later cells read its 'Text' and 'Label' columns.
dataset_path = 'compiled_output.csv'
df = pd.read_csv(dataset_path)

### Removing rows with empty or irrelevant content

In [None]:
# Apply the three row filters one after another; each lambda sees the frame
# produced by the previous step.
df = (
    df
    # 1. keep rows whose stripped text is not missing
    .loc[lambda frame: frame['Text'].str.strip().notna()]
    # 2. keep rows that are not empty once surrounding whitespace is removed
    .loc[lambda frame: frame['Text'].str.strip() != '']
    # 3. discard rows containing a bracketed span such as '[2]'
    .loc[lambda frame: ~frame['Text'].str.contains(r'\[.*\]')]
)


### Mapping labels to integers

In [None]:
# Integer class id for each raw string found in the 'Label' column.
label_mapping = {
    'background_claim': 0,
    'own_claim': 1,
    'data': 2,
}

# Series.map() yields NaN for any label that is not a key of label_mapping.
# A single NaN turns the whole column into floats, so the label list handed to
# the model would contain floats/NaN instead of integer class ids. Rows with
# an unmapped label are therefore dropped and the column is cast back to int.
# assign()/astype() return new frames, so the filtered df is never written to
# in place.
df = df.assign(Label=df['Label'].map(label_mapping)).dropna(subset=['Label'])
df = df.astype({'Label': int})


### Splitting the dataset into train and test sets

In [None]:
# Plain Python lists of the texts and their integer labels, in row order.
all_texts = df['Text'].tolist()
all_labels = df['Label'].tolist()

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.3,
    random_state=42,
)

### Tokenizing the texts using SciBERT tokenizer

In [None]:
tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')

# Encode both splits with identical settings (truncation and padding enabled,
# max_length=128); the train split is encoded first, then the test split.
train_encodings, test_encodings = (
    tokenizer(split_texts, truncation=True, padding=True, max_length=128)
    for split_texts in (train_texts, test_texts)
)


### Converting to Dataset format

In [None]:
class EncodedTextDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with integer class labels.

    Trainer needs a dataset whose length is the number of examples and that
    can be indexed one example at a time. A plain dict of whole columns has
    neither property: its len() is the number of keys and it cannot be
    indexed by position.

    Args:
        encodings: Tokenizer output providing 'input_ids' and
            'attention_mask', each indexable by example position.
        labels: Sequence of class ids, aligned with ``encodings``.
    """

    # Only these tokenizer fields are forwarded to the model. The previous
    # version also read encodings['features'], but the tokenizer call in this
    # notebook never produces such a key, so that lookup raised KeyError.
    _MODEL_INPUT_KEYS = ('input_ids', 'attention_mask')

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the model inputs and label of example ``idx`` as tensors."""
        item = {
            key: torch.tensor(self.encodings[key][idx])
            for key in self._MODEL_INPUT_KEYS
        }
        item['labels'] = torch.tensor(self.labels[idx])
        return item


train_dataset = EncodedTextDataset(train_encodings, train_labels)
test_dataset = EncodedTextDataset(test_encodings, test_labels)

### Loading the pre-trained SciBERT model for sequence classification


In [None]:
# Same checkpoint name as the tokenizer; the classification head gets three
# outputs, one per class id in label_mapping.
scibert_checkpoint = 'allenai/scibert_scivocab_uncased'
model = BertForSequenceClassification.from_pretrained(
    scibert_checkpoint,
    num_labels=3,
)


### Training arguments

In [None]:
training_args = TrainingArguments(
    # --- where results and logs are written ---
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # --- run length and batch sizes ---
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # --- optimisation ---
    warmup_steps=500,
    weight_decay=0.01,
    # --- evaluate and save once per epoch ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

### Computing metrics function for multi-class classification

In [None]:
def compute_metrics(p):
    """Return the accuracy of the predictions in ``p``.

    Args:
        p: Object exposing ``predictions`` (per-class scores, reduced with
            argmax over the last axis) and ``label_ids`` (reference labels).

    Returns:
        A dict with the single key ``'accuracy'``.
    """
    # The highest-scoring class along the last axis is the predicted label.
    predicted_classes = p.predictions.argmax(axis=-1)
    return {'accuracy': accuracy_score(p.label_ids, predicted_classes)}


### Trainer setup

In [None]:
# Bundle the model, its training configuration, both data splits and the
# metric callback into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

### Training and evaluating the model

In [None]:
# Fine-tune, then score the model on the evaluation dataset.
trainer.train()
eval_results = trainer.evaluate()
print("\nEvaluation Results:", eval_results, sep="\n")

# Predict on the test split and take the highest-scoring class per example.
test_preds = trainer.predict(test_dataset)
test_preds_labels = test_preds.predictions.argmax(axis=-1)

# Overall accuracy, printed as a percentage.
accuracy = accuracy_score(test_labels, test_preds_labels)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Report from sklearn's classification_report for the same predictions.
report = classification_report(test_labels, test_preds_labels)
print("\nClassification Report:", report, sep="\n")
