In [29]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig
import torch

In [30]:
# Extend the import search path with the project checkout (this does not
# change the working directory); presumably needed so the project's local
# packages can be imported by later cells - confirm against the repo layout.
import sys

PROJECT_ROOT = '/Users/lenap/Documents/Git/Argument-Mining'

# Guard the append: without it every re-run of this cell adds the same entry
# to sys.path again.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
print(sys.path)

['/', '/opt/anaconda3/lib/python312.zip', '/opt/anaconda3/lib/python3.12', '/opt/anaconda3/lib/python3.12/lib-dynload', '', '/opt/anaconda3/lib/python3.12/site-packages', '/opt/anaconda3/lib/python3.12/site-packages/aeosa', '/opt/anaconda3/lib/python3.12/site-packages/setuptools/_vendor', '/var/folders/7b/bpwbm4g168sb_l8mr76jv93w0000gn/T/tmptc799eb4', '/Users/lenap/Documents/Git/Argument-Mining', '/Users/lenap/Documents/Git/Argument-Mining', '/Users/lenap/Documents/Git/Argument-Mining']


In [31]:
from db.queries import get_training_data, get_test_data

# Fetch the test split once, then unpack its three parts:
# claims, premises and the relationship label for each pair.
test_split = get_test_data()
claims_test, premises_test, relationships_test = test_split

In [32]:
import pandas as pd

# Two records per index: the claim first, then the premise at the same
# position. Both records carry the relationship label found at that index.
rows = [
    {"text": str(source[idx]), "true_type": kind, "true_stance": relationships_test[idx]}
    for idx in range(len(claims_test))
    for kind, source in (("claim", claims_test), ("premise", premises_test))
]

test_df = pd.DataFrame(rows)

# Strip the "ADU with the id N text:" lead-in and everything from " Type:"
# onwards out of the text, and drop the "stance_" marker from the labels.
test_df = test_df.assign(
    text=(
        test_df['text']
        .str.replace(r"ADU with the id \d+ text:\s*", "", regex=True)
        .str.replace(r"\s+Type:.*$", "", regex=True)
    ),
    true_stance=test_df['true_stance'].str.replace('stance_', ''),
)

print(test_df.head())

                                                text true_type true_stance
0  Should we be able to choose our own High Schoo...     claim         con
1  no because the people that built your school w...   premise         con
2    should we be able to express religion in school     claim         pro
3  in public school you have a right to express y...   premise         pro
4                   Should we be able to time travel     claim         con


In [33]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("/Users/lenap/Documents/TinyLlama-1.1B-Chat-v1.0_finetuned-ARMIN")

# Load base model
base_model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" 
base_model = AutoModelForCausalLM.from_pretrained(base_model_name)

# Load the PEFT adapter
model = PeftModel.from_pretrained(base_model, "/Users/lenap/Documents/TinyLlama-1.1B-Chat-v1.0_finetuned-ARMIN")

# Set evaluation mode
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 2048)
        (layers): ModuleList(
          (0-21): 22 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear(in_featu

In [38]:
def classify_text(text):
    """Prompt the model to label one text and return only its reply.

    Uses the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        text: The text to be classified; it is interpolated into the prompt.

    Returns:
        The decoded newly generated tokens (the prompt is excluded), with
        surrounding whitespace stripped.
    """
    prompt = f"<|system|>You are an argumentation analysis model.<|user|>Identify whether this is a claim or a premise, and say whether it supports or opposes its main claim:**\n\n**{text}<|assistant|>"
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=1800,  # Leave room for generation
        truncation=True
    ).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        do_sample=False,
        eos_token_id=tokenizer.eos_token_id
    )

    # The returned sequence starts with the prompt tokens. Drop them by count
    # instead of splitting the decoded string on the prompt text: the string
    # split silently fails (and returns the prompt as part of the answer)
    # whenever the prompt was truncated or does not decode back verbatim.
    prompt_length = inputs["input_ids"].shape[-1]
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

In [40]:
import re

# function to parse LLM output
# References to the parent claim ("its main claim", "main claim", "its claim")
# are not a statement about the text's own type and must not count as "claim".
_MAIN_CLAIM_REFERENCE = re.compile(r"\b(?:its\s+|the\s+)?main\s+claim\b|\bits\s+claim\b")

# Whole-word matches only, so e.g. "proposal" is not read as "pro",
# "conclusion" is not read as "con" and "disagrees" is not read as "agrees".
_TYPE_PATTERN = re.compile(r"\b(claim|premise)\b")
_STANCE_PATTERN = re.compile(
    r"\b(?:(?P<pro>supports?|supporting|pro|favou?rs?|agrees?)"
    r"|(?P<con>opposes?|opposing|con|against|disagrees?))\b"
)


def parse_classification(llm_output):
    """Extract the predicted type and stance from a raw model reply.

    Args:
        llm_output: Raw text produced by the model. Non-string values
            (e.g. None) are treated as an unparseable reply.

    Returns:
        A ``(pred_type, pred_stance)`` tuple. ``pred_type`` is ``'claim'``,
        ``'premise'`` or ``'unknown'``; ``pred_stance`` is ``'pro'``,
        ``'con'`` or ``'unknown'``. When several keywords occur, the one
        appearing first in the reply wins.
    """
    if not isinstance(llm_output, str):
        return 'unknown', 'unknown'

    output_lower = llm_output.lower()

    # Extract type (claim or premise), ignoring mentions of the parent claim
    # such as "... supports its main claim".
    type_text = _MAIN_CLAIM_REFERENCE.sub(" ", output_lower)
    type_match = _TYPE_PATTERN.search(type_text)
    pred_type = type_match.group(1) if type_match else 'unknown'

    # Extract stance (pro/con, supports/opposes); leftmost keyword decides.
    stance_match = _STANCE_PATTERN.search(output_lower)
    if stance_match is None:
        pred_stance = 'unknown'
    elif stance_match.group('pro'):
        pred_stance = 'pro'
    else:
        pred_stance = 'con'

    return pred_type, pred_stance


In [41]:
# Take only the first 10 rows
test_df_small = test_df.head(10).copy()

# Run classification on the small dataset
print("Running LLM inference on first 10 texts...")
test_df_small['llm_raw_output'] = test_df_small['text'].apply(classify_text)

# Parse the outputs
print("Parsing LLM outputs...")
parsed_results = test_df_small['llm_raw_output'].apply(parse_classification)
test_df_small['predicted_type'] = [result[0] for result in parsed_results]
test_df_small['predicted_stance'] = [result[1] for result in parsed_results]

# Display results
print("Results for first 10 texts:")
print(test_df_small[['text', 'true_type', 'true_stance', 'predicted_type', 'predicted_stance', 'llm_raw_output']])

# Quick accuracy check
type_correct = (test_df_small['true_type'] == test_df_small['predicted_type']).sum()
stance_correct = (test_df_small['true_stance'] == test_df_small['predicted_stance']).sum()
both_correct = ((test_df_small['true_type'] == test_df_small['predicted_type']) & 
                (test_df_small['true_stance'] == test_df_small['predicted_stance'])).sum()

print(f"\nQuick Results (n=10):")
print(f"Type accuracy: {type_correct}/10 = {type_correct/10:.1%}")
print(f"Stance accuracy: {stance_correct}/10 = {stance_correct/10:.1%}")
print(f"Both correct: {both_correct}/10 = {both_correct/10:.1%}")

Running LLM inference on first 10 texts...
Parsing LLM outputs...
Results for first 10 texts:
                                                text true_type true_stance  \
0  Should we be able to choose our own High Schoo...     claim         con   
1  no because the people that built your school w...   premise         con   
2    should we be able to express religion in school     claim         pro   
3  in public school you have a right to express y...   premise         pro   
4                   Should we be able to time travel     claim         con   
5  I don't think we should time travel because it...   premise         con   
6                      Should we be allowed to clone     claim         pro   
7  I think that we should NOT have the right to c...   premise         pro   
8                Should we be allowed to drive at 14     claim         con   
9                                               Uhg!   premise         con   

  predicted_type predicted_stance  \
0         

In [None]:
# Apply classification and parsing.
# Outputs are collected one text at a time so that an interrupted run
# (this loop is slow) does not discard the generations already finished.
print("Running LLM inference on text data...")
raw_outputs = []
try:
    for text in test_df['text']:
        raw_outputs.append(classify_text(text))
except KeyboardInterrupt:
    # Persist what was completed so far, then let the interrupt propagate.
    partial_df = test_df.iloc[:len(raw_outputs)].copy()
    partial_df['llm_raw_output'] = raw_outputs
    partial_df.to_csv('detailed_classification_results_partial.csv', index=False)
    print(f"Interrupted after {len(raw_outputs)}/{len(test_df)} texts; partial outputs saved.")
    raise
test_df['llm_raw_output'] = raw_outputs

# Parse the outputs
print("Parsing LLM outputs...")
parsed_results = test_df['llm_raw_output'].apply(parse_classification)
test_df['predicted_type'] = [result[0] for result in parsed_results]
test_df['predicted_stance'] = [result[1] for result in parsed_results]

# Display results with comparison
comparison_df = test_df[['text', 'true_type', 'true_stance', 'predicted_type', 'predicted_stance', 'llm_raw_output']].copy()
print(comparison_df.head())

# Calculate accuracy (row-wise comparison masks are computed once and reused)
type_match = test_df['true_type'] == test_df['predicted_type']
stance_match = test_df['true_stance'] == test_df['predicted_stance']
type_accuracy = type_match.mean()
stance_accuracy = stance_match.mean()
both_accuracy = (type_match & stance_match).mean()

print("\nAccuracy Results:")
print(f"Type Classification Accuracy: {type_accuracy:.3f}")
print(f"Stance Classification Accuracy: {stance_accuracy:.3f}")
print(f"Both Correct Accuracy: {both_accuracy:.3f}")

# Save detailed results
test_df.to_csv('detailed_classification_results.csv', index=False)

Running LLM inference on text data...


KeyboardInterrupt: 