In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from datasets import Dataset

# Read the data
df = pd.read_csv('compiled_output.csv')  # Replace with your file path

# Step 1: Remove rows with empty or irrelevant content
df = df[df['Text'].str.strip().notna()]  # Remove rows with empty strings
df = df[df['Text'].str.strip() != '']  # Remove rows with only whitespace
df = df[~df['Text'].str.contains(r'\[.*\]')]  # Remove rows with non-textual content like '[2]'

# Step 2: Check if there are still any rows left after filtering
print(f"Remaining rows after filtering: {len(df)}")
print(df.head())

# Step 3: Prepare the Dataset for BERT

# Map labels to integers
label_mapping = {'background_claim': 0, 'own_claim': 1}  # Modify this based on your labels
df['Label'] = df['Label'].map(label_mapping)

# Step 4: Split the dataset into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Text'].tolist(), df['Label'].tolist(), test_size=0.3, random_state=42
)

# Step 5: Tokenize the texts using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize texts
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)

# Convert to Dataset format
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels
})

test_dataset = Dataset.from_dict({
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask'],
    'labels': test_labels
})

# Step 6: Load pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_mapping))

# Step 7: Training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",     # Evaluate after each epoch
    save_strategy="epoch",           # Save model after each epoch
)

# Step 8: Trainer setup
trainer = Trainer(
    model=model,                         # the model to train
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset
    compute_metrics=lambda p: {
        'accuracy': accuracy_score(p.predictions.argmax(axis=-1), p.label_ids)
    }
)

# Step 9: Train the model
trainer.train()

# Step 10: Evaluate the model
eval_results = trainer.evaluate()

print("\nEvaluation Results:")
print(eval_results)

# Step 11: Make predictions on the test set
test_preds = trainer.predict(test_dataset)
test_preds_labels = test_preds.predictions.argmax(axis=-1)

# Step 12: Accuracy and Classification Report
accuracy = accuracy_score(test_labels, test_preds_labels)
print(f"Accuracy: {accuracy * 100:.2f}%")

print("\nClassification Report:")
print(classification_report(test_labels, test_preds_labels))


Remaining rows after filtering: 13454
                                                Text             Label
0  complicated 3D character models are widely use...  background_claim
1  The range of breathtaking realistic 3D models ...  background_claim
2         a production cannot afford major revisions  background_claim
3  providing a flexible and efficient solution to...         own_claim
4  Skeleton Subspace Deformation (SSD) is the pre...  background_claim


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: Target size (torch.Size([8])) must be the same as input size (torch.Size([8, 2]))

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from datasets import Dataset

# Read the data
df = pd.read_csv('compiled_output.csv')  # Replace with your file path

# Step 1: Remove rows with empty or irrelevant content
df = df[df['Text'].str.strip().notna()]  # Remove rows with empty strings
df = df[df['Text'].str.strip() != '']  # Remove rows with only whitespace
df = df[~df['Text'].str.contains(r'\[.*\]')]  # Remove rows with non-textual content like '[2]'

# Step 2: Check if there are still any rows left after filtering
print(f"Remaining rows after filtering: {len(df)}")
print(df.head())

# Step 3: Prepare the Dataset for BERT

# Map labels to integers
label_mapping = {'background_claim': 0, 'own_claim': 1}  # Modify this based on your labels
df['Label'] = df['Label'].map(label_mapping)

# Step 4: Split the dataset into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Text'].tolist(), df['Label'].tolist(), test_size=0.3, random_state=42
)

# Step 5: Tokenize the texts using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize texts
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)

# Convert to Dataset format
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels
})

test_dataset = Dataset.from_dict({
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask'],
    'labels': test_labels
})

# Step 6: Load pre-trained BERT model for sequence classification (binary classification)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Step 7: Training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",     # Evaluate after each epoch
    save_strategy="epoch",           # Save model after each epoch
)

# Step 8: Compute metrics function for binary classification
def compute_metrics(p):
    predictions = p.predictions.argmax(axis=-1)  # Convert logits to class predictions
    accuracy = accuracy_score(p.label_ids, predictions)
    return {'accuracy': accuracy}

# Step 9: Trainer setup
trainer = Trainer(
    model=model,                         # the model to train
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset
    compute_metrics=compute_metrics      # pass the compute metrics function
)

# Step 10: Train the model
trainer.train()

# Step 11: Evaluate the model
eval_results = trainer.evaluate()

print("\nEvaluation Results:")
print(eval_results)

# Step 12: Make predictions on the test set
test_preds = trainer.predict(test_dataset)
test_preds_labels = test_preds.predictions.argmax(axis=-1)

# Step 13: Accuracy and Classification Report
accuracy = accuracy_score(test_labels, test_preds_labels)
print(f"Accuracy: {accuracy * 100:.2f}%")

print("\nClassification Report:")
print(classification_report(test_labels, test_preds_labels))


Remaining rows after filtering: 13454
                                                Text             Label
0  complicated 3D character models are widely use...  background_claim
1  The range of breathtaking realistic 3D models ...  background_claim
2         a production cannot afford major revisions  background_claim
3  providing a flexible and efficient solution to...         own_claim
4  Skeleton Subspace Deformation (SSD) is the pre...  background_claim


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/3534 [03:38<?, ?it/s]
  0%|          | 0/3534 [00:00<?, ?it/s]

ValueError: Target size (torch.Size([8])) must be the same as input size (torch.Size([8, 2]))

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from datasets import Dataset

# Step 1: Read the data
df = pd.read_csv('compiled_output.csv')  # Replace with your file path

# Step 2: Remove rows with empty or irrelevant content
df = df[df['Text'].str.strip().notna()]  # Remove rows with empty strings
df = df[df['Text'].str.strip() != '']  # Remove rows with only whitespace
df = df[~df['Text'].str.contains(r'\[.*\]')]  # Remove rows with non-textual content like '[2]'

# Step 3: Map labels to integers for 3 classes (Adjust this based on your exact label names)
label_mapping = {
    'background_claim': 0,  # Example: background_claim -> 0
    'own_claim': 1,         # Example: own_claim -> 1
    'data': 2               # Example: data -> 2
}
df['Label'] = df['Label'].map(label_mapping)

# Step 4: Split the dataset into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Text'].tolist(), df['Label'].tolist(), test_size=0.3, random_state=42
)

# Step 5: Tokenize the texts using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize texts
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)

# Convert to Dataset format
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels
})

test_dataset = Dataset.from_dict({
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask'],
    'labels': test_labels
})

# Step 6: Load pre-trained BERT model for sequence classification (with 3 labels)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Step 7: Training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",     # Evaluate after each epoch
    save_strategy="epoch",           # Save model after each epoch
)

# Step 8: Compute metrics function for multi-class classification
def compute_metrics(p):
    predictions = p.predictions.argmax(axis=-1)  # Convert logits to class predictions
    accuracy = accuracy_score(p.label_ids, predictions)
    return {'accuracy': accuracy}

# Step 9: Trainer setup
trainer = Trainer(
    model=model,                         # the model to train
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset
    compute_metrics=compute_metrics      # pass the compute metrics function
)

# Step 10: Train the model
trainer.train()

# Step 11: Evaluate the model
eval_results = trainer.evaluate()

print("\nEvaluation Results:")
print(eval_results)

# Step 12: Make predictions on the test set
test_preds = trainer.predict(test_dataset)
test_preds_labels = test_preds.predictions.argmax(axis=-1)

# Step 13: Accuracy and Classification Report
accuracy = accuracy_score(test_labels, test_preds_labels)
print(f"Accuracy: {accuracy * 100:.2f}%")

print("\nClassification Report:")
print(classification_report(test_labels, test_preds_labels))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/3534 [05:02<?, ?it/s]
                                        
  0%|          | 0/3534 [04:23<?, ?it/s]           

{'loss': 1.1035, 'grad_norm': 7.584229946136475, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.01}


                                        
  0%|          | 0/3534 [05:45<?, ?it/s]           

{'loss': 1.0631, 'grad_norm': 7.1570515632629395, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.02}


                                        
  0%|          | 0/3534 [07:03<?, ?it/s]           

{'loss': 1.0409, 'grad_norm': 6.543478488922119, 'learning_rate': 3e-06, 'epoch': 0.03}


                                        
  0%|          | 0/3534 [08:02<?, ?it/s]           

{'loss': 1.0286, 'grad_norm': 4.701966762542725, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.03}


                                        
  0%|          | 0/3534 [08:56<?, ?it/s]           

{'loss': 1.0501, 'grad_norm': 5.742342948913574, 'learning_rate': 5e-06, 'epoch': 0.04}


                                        
  0%|          | 0/3534 [09:50<?, ?it/s]           

{'loss': 1.0134, 'grad_norm': 6.658573627471924, 'learning_rate': 6e-06, 'epoch': 0.05}


                                        
  0%|          | 0/3534 [10:44<?, ?it/s]           

{'loss': 1.0149, 'grad_norm': 8.458107948303223, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.06}


                                        
  0%|          | 0/3534 [11:37<?, ?it/s]           

{'loss': 0.9531, 'grad_norm': 9.56867504119873, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.07}


                                        
  0%|          | 0/3534 [13:32<?, ?it/s]            

{'loss': 0.7823, 'grad_norm': 8.547475814819336, 'learning_rate': 9e-06, 'epoch': 0.08}


                                        
  0%|          | 0/3534 [15:01<?, ?it/s]            

{'loss': 0.8002, 'grad_norm': 7.4282636642456055, 'learning_rate': 1e-05, 'epoch': 0.08}


                                        
  0%|          | 0/3534 [17:13<?, ?it/s]             

{'loss': 0.7856, 'grad_norm': 7.944399833679199, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.09}


