In [2]:
import pandas as pd
import ast

posts_file = '/home/stud/haroonm0/localdisk/Fact_check/dataset/posts.csv'
fact_checks_file = '/home/stud/haroonm0/localdisk/Fact_check/dataset/fact_checks.csv'
pairs_file = '/home/stud/haroonm0/localdisk/Fact_check/dataset/pairs.csv'


posts_df = pd.read_csv(posts_file)
fact_checks_df = pd.read_csv(fact_checks_file)
pairs_df = pd.read_csv(pairs_file)

def safe_eval(val):
    try:
        return ast.literal_eval(val)
    except (ValueError, SyntaxError):
        return val


def extract_original_language_ocr(ocr):
    try:
        ocr_data = ast.literal_eval(ocr)  
        if isinstance(ocr_data, list) and len(ocr_data) > 0:
            return ocr_data[0][0]  
        return ""
    except (ValueError, SyntaxError):
        return ""

def combine_text(row):
    ocr_text = extract_original_language_ocr(row['ocr'])
    if pd.isna(row['text']):  
        return ocr_text
    return ocr_text + " " + row['text'] 
posts_df['combined_text'] = posts_df.apply(combine_text, axis=1)

def extract_original_language_claim(claim):
    claim_data = safe_eval(claim)  
    if isinstance(claim_data, tuple) and len(claim_data) > 0:
        return claim_data[0] 
    return claim

fact_checks_df['claim_original'] = fact_checks_df['claim'].apply(extract_original_language_claim)

merged_df = pairs_df.merge(posts_df[['post_id', 'combined_text']], on='post_id', how='left')


merged_df = merged_df.merge(fact_checks_df[['fact_check_id', 'claim_original']], on='fact_check_id', how='left')


final_df = merged_df[['post_id', 'fact_check_id', 'combined_text', 'claim_original']]

# Step 5: Save the prepared data to a new CSV file for manual checking
final_df.to_csv('combined_data.csv', index=False)

print("Preprocessing complete. The data has been saved to 'combined_data.csv'.")


Preprocessing complete. The data has been saved to 'combined_data.csv'.


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split


data = pd.read_csv('combined_data.csv')


train_df, temp_df = train_test_split(data, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)


train_df.to_csv('train_data.csv', index=False)
val_df.to_csv('val_data.csv', index=False)
test_df.to_csv('test_data.csv', index=False)

print(f"Train size: {len(train_df)}, Validation size: {len(val_df)}, Test size: {len(test_df)}")


Train size: 20594, Validation size: 2574, Test size: 2575


In [8]:
import pandas as pd
import os

# Function to generate negative pairs
def generate_negative_pairs(df):
    negative_pairs = []
    for idx, row in df.iterrows():
        # Select a random row for a wrong claim (ensure it's not the same as the current row)
        wrong_claim = df.sample(1).iloc[0]['claim_original']
        while wrong_claim == row['claim_original']:
            wrong_claim = df.sample(1).iloc[0]['claim_original']
        negative_pairs.append((row['combined_text'], wrong_claim))
    
    # Create a DataFrame for negative pairs
    negative_pairs_df = pd.DataFrame(negative_pairs, columns=['combined_text', 'claim_original'])
    return negative_pairs_df

# Function to process a dataset (generate pairs and save)
def process_and_save_dataset(data_path, output_file):
    print(f"Processing dataset: {data_path}")
    data = pd.read_csv(data_path)
    print(f"Data loaded successfully! Number of records: {len(data)}")

    # Generate positive pairs
    print("Generating positive pairs...")
    positive_pairs = data[['combined_text', 'claim_original']]
    print(f"Positive pairs (first 5):\n{positive_pairs.head()}")

    # Generate negative pairs
    print("Generating negative pairs...")
    negative_df = generate_negative_pairs(data)
    print(f"Negative pairs (first 5):\n{negative_df.head()}")
    print("Negative pairs generated!")

    # Combine positive and negative pairs
    print("Combining positive and negative pairs...")
    all_pairs = pd.DataFrame(positive_pairs, columns=['combined_text', 'claim_original'])
    all_pairs['label'] = 1  # Label positive pairs with 1
    negative_df['label'] = 0  # Label negative pairs with 0

    final_df = pd.concat([all_pairs, negative_df]).sample(frac=1).reset_index(drop=True)
    print(f"Combined pairs (first 5):\n{final_df.head()}")

    # Save the final DataFrame to CSV
    print(f"Saving to {output_file}...")
    final_df.to_csv(output_file, index=False)
    print(f"Dataset saved successfully at {output_file}!\n")

# File paths for the datasets
datasets = {
    "train": "train_data.csv",
    "val": "val_data.csv",
    "test": "test_data.csv"
}

# Directory to save processed datasets
output_dir = "processed_data"
os.makedirs(output_dir, exist_ok=True)

# Process each dataset
for name, data_path in datasets.items():
    output_file = os.path.join(output_dir, f"final_{name}_data.csv")
    process_and_save_dataset(data_path, output_file)


Processing dataset: train_data.csv
Data loaded successfully! Number of records: 20594
Generating positive pairs...
Positive pairs (first 5):
                                       combined_text  \
0   ('Fue en 1908. Los belgas leyendo la Biblia a...   
1  TN Todo Noticias [USER] Noticias ULTIMO MOMENT...   
2  ان شهادت ه امام من شریک تمبوتراب اسکاوٹ ('Masj...   
3   ('Video of a man burning up Trump ballots and...   
4   ('Well Facebook said that if you support the ...   

                                      claim_original  
0  Los belgas leyendo la Biblia antes de colgar a...  
1  Falleció el periodista Carlos Ferrara, que hab...  
2  Photo shows Syed Murad Ali Shah, chief ministe...  
3  Pro-Trump Ballots Burned In Viral Video Were O...  
4  Facebook posts claim the social network will r...  
Generating negative pairs...
Negative pairs (first 5):
                                       combined_text  \
0   ('Fue en 1908. Los belgas leyendo la Biblia a...   
1  TN Todo Noticias [USER

In [3]:
import os
import logging
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from peft import get_peft_model, LoraConfig
from trl import SFTTrainer
from datasets import load_dataset

# Configure logging
log_file = "training.log"
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler(log_file, mode='w'),
        logging.StreamHandler()
    ]
)

logging.info("Script started.")

# Define directory paths
data_dir = "processed_data"
train_file = os.path.join(data_dir, "final_train_data.csv")
val_file = os.path.join(data_dir, "final_val_data.csv")
test_file = os.path.join(data_dir, "final_test_data.csv")

# Step 1: Load Preprocessed Data as Hugging Face Dataset
logging.info("Loading preprocessed datasets...")
train_dataset = load_dataset("csv", data_files={"train": train_file})["train"]
val_dataset = load_dataset("csv", data_files={"val": val_file})["val"]
test_dataset = load_dataset("csv", data_files={"test": test_file})["test"]

logging.info(f"Datasets loaded successfully! Train: {len(train_dataset)}, Val: {len(val_dataset)}, Test: {len(test_dataset)}")

# Step 2: Load Model and Tokenizer
model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"
logging.info(f"Loading model: {model_name}")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True
)

# LoRA Configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
logging.info("Model loaded and LoRA configuration applied.")

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Step 3: Define Tokenization Function
def tokenize_with_instruction(example, tokenizer, max_seq_length):
    """
    Tokenize the dataset example for instruction-based fine-tuning.
    """
    instruction = f"Instruction: Determine if the claim matches the post. Label: {example['label']}"
    inputs = f"{instruction}\nInput: Post: {example['combined_text']}\nClaim: {example['claim_original']}"
    tokenized = tokenizer(
        inputs,
        truncation=True,
        max_length=max_seq_length,
        padding="max_length"
    )
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

max_seq_length = 2048

# Tokenize the datasets
logging.info("Tokenizing datasets...")
tokenized_train = train_dataset.map(
    lambda x: tokenize_with_instruction(x, tokenizer, max_seq_length),
    batched=False,
    keep_in_memory=True
)
tokenized_val = val_dataset.map(
    lambda x: tokenize_with_instruction(x, tokenizer, max_seq_length),
    batched=False,
    keep_in_memory=True
)
tokenized_test = test_dataset.map(
    lambda x: tokenize_with_instruction(x, tokenizer, max_seq_length),
    batched=False,
    keep_in_memory=True
)
logging.info("Datasets tokenized successfully.")

# Step 4: Define Formatting Function
def formatting_func(example):
    """
    Format the dataset example for debugging or visualization.
    """
    return (
        f"Instruction: Determine if the claim matches the post. Label: {example['label']}\n"
        f"Input: Post: {example['combined_text']}\n"
        f"Claim: {example['claim_original']}"
    )

# Debugging: Save the first 5 tokenized examples to a text file
def debug_tokenization(dataset, tokenizer, max_seq_length, output_file):
    """
    Debug tokenization for the first 5 examples in the dataset.
    """
    logging.info("Starting tokenization debug for first 5 entries.")
    results = []
    for i, example in enumerate(dataset):
        if i >= 5:
            break
        logging.info(f"Example {i} input: {example}")
        tokenized = tokenize_with_instruction(example, tokenizer, max_seq_length)
        logging.info(f"Example {i} tokenized: {tokenized}")
        results.append({"input": example, "tokenized": tokenized})
    with open(output_file, "w") as f:
        for result in results:
            f.write("Input:\n")
            f.write(f"{result['input']}\n")
            f.write("Tokenized Output:\n")
            f.write(f"{result['tokenized']}\n")
            f.write("="*50 + "\n")
    logging.info(f"Debug results saved to {output_file}.")

debug_tokenization(train_dataset, tokenizer, max_seq_length, "debug_train.txt")

# Step 5: Configure and Train with SFTTrainer
logging.info("Configuring trainer...")
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=1000,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    output_dir="outputs",
    save_steps=100,
    report_to="none",
)
logging.info("Training arguments configured.")

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    dataset_text_field=None,
    max_seq_length=max_seq_length,
    formatting_func=formatting_func,
    args=training_args,
)

# Training
logging.info("Starting training...")
try:
    trainer.train()
    logging.info("Training completed successfully.")
except Exception as e:
    logging.error(f"Training failed: {str(e)}")
    raise

# Save the Model
output_dir = "outputs"
logging.info(f"Saving the model to {output_dir}...")
try:
    trainer.save_model(output_dir=output_dir)
    logging.info("Model saved successfully.")
except Exception as e:
    logging.error(f"Failed to save the model: {str(e)}")
    raise

logging.info("Script completed.")


2024-12-21 01:16:33,691 - INFO - Script started.
2024-12-21 01:16:33,692 - INFO - Loading preprocessed datasets...
2024-12-21 01:16:33,693 - DEBUG - Resetting dropped connection: s3.amazonaws.com
2024-12-21 01:16:34,049 - DEBUG - https://s3.amazonaws.com:443 "HEAD /datasets.huggingface.co/datasets/datasets/csv/csv.py HTTP/11" 200 0
2024-12-21 01:16:34,064 - DEBUG - Attempting to acquire lock 139796186002912 on /home/stud/haroonm0/.cache/huggingface/datasets/_home_stud_haroonm0_.cache_huggingface_datasets_csv_default-bfc2c097ad229f80_0.0.0_9ea1179385ff7ad1e756d327ffccaa3b801175702a2d91528226ba2c66873f52.lock
2024-12-21 01:16:34,079 - DEBUG - Lock 139796186002912 acquired on /home/stud/haroonm0/.cache/huggingface/datasets/_home_stud_haroonm0_.cache_huggingface_datasets_csv_default-bfc2c097ad229f80_0.0.0_9ea1179385ff7ad1e756d327ffccaa3b801175702a2d91528226ba2c66873f52.lock
2024-12-21 01:16:34,084 - DEBUG - open file: /home/stud/haroonm0/.cache/huggingface/datasets/csv/default-bfc2c097ad22

OutOfMemoryError: CUDA out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacity of 23.55 GiB of which 38.69 MiB is free. Including non-PyTorch memory, this process has 23.48 GiB memory in use. Of the allocated memory 22.93 GiB is allocated by PyTorch, and 105.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

The most similar claim to the post is: 'Smoking is harmful to your health.'
