In [1]:
import sys
print(sys.executable)



c:\Users\bhara\Desktop\Python\I2SC_task\.venv\Scripts\python.exe


In [2]:
import pandas as pd

# Define column names based on LIAR dataset structure
columns = [
    "id", "label", "statement", "subject", "speaker", "job", 
    "state", "party", "barely_true_counts", "false_counts", 
    "half_true_counts", "mostly_true_counts", "pants_on_fire_counts", "context"
]

# Load the dataset

df = pd.read_csv('train.tsv', sep='\t', header=None, names=columns)

# Keep only relevant columns
df = df[["statement", "label", "context", "speaker",'subject']]


In [3]:
import string
import pandas as pd
import nltk
from nltk.corpus import stopwords

# Download NLTK stopwords (only needed once)
nltk.download('stopwords')

# Define text cleaning function
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()  # Convert to lowercase
        text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
        stop_words = set(stopwords.words('english'))
        text = ' '.join([word for word in text.split() if word not in stop_words])

    else:
        text = ''
    
    return text

# Preprocess only text-based columns
text_columns = ['statement', 'context', 'subject','speaker'] 
for column in text_columns:
    df[column] = df[column].astype(str).apply(clean_text)



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bhara\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
df.to_csv("preprocessed_train.csv", index=False)

In [5]:
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bhara\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\bhara\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [6]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from rank_bm25 import BM25Okapi

# Load preprocessed train data
train_df = pd.read_csv("preprocessed_train.csv")

# Ensure 'statement' column is treated as text
train_df["statement"] = train_df["statement"].astype(str)

# Tokenize the statements

tokenized_statements = [word_tokenize(statement) for statement in train_df["statement"]]

# Initialize BM25 Index
bm25 = BM25Okapi(tokenized_statements)

# Function to retrieve top-k similar claims
def retrieve_claim(query, top_k=3):
    query_tokens = word_tokenize(query.lower())  # Tokenize the query
    scores = bm25.get_scores(query_tokens)  # Get BM25 scores for all claims
    top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]  # Get top-k matches
    
    results = []
    for idx in top_indices:
        results.append({
            "statement": train_df.iloc[idx]["statement"],
            "label": train_df.iloc[idx]["label"],
            "context": train_df.iloc[idx]["context"],
            "speaker": train_df.iloc[idx]["speaker"],
            "score": scores[idx]  # BM25 relevance score
        })
    
    return results

# Example Query
query = "Since Republicans took over after the 2010 election, the graduation rate in Wisconsin has gone from 86 percent to 88 percent. The black graduation rate has gone from 60 percent to 65 percent. The Latino graduation rate has gone from 65 percent to 71 percent."

retrieved_claims = retrieve_claim(query)

# Print Results
for claim in retrieved_claims:
    print(f"Statement: {claim['statement']}")
    print(f"Label: {claim['label']}")
    print(f"Context: {claim['context']}")
    print(f"speaker {claim['speaker']}")
    print(f"BM25 Score: {claim['score']:.4f}")
    print("-" * 80)


Statement: since republicans took 2010 election graduation rate wisconsin gone 86 percent 88 percent black graduation rate gone 60 percent 65 percent latino graduation rate gone 65 percent 71 percent
Label: half-true
Context: nan
speaker dalekooyenga
BM25 Score: 123.0457
--------------------------------------------------------------------------------
Statement: says milwaukee blacks 55 percent male unemployment 60 percent truancy 50 percent graduation rate worst reading scores lead infant mortality
Label: half-true
Context: remarks
speaker davidclarkejr
BM25 Score: 43.5969
--------------------------------------------------------------------------------
Statement: rate uninsured americans 88 percent
Label: half-true
Context: medium post
speaker barackobama
BM25 Score: 42.6420
--------------------------------------------------------------------------------


In [11]:
import os
from groq import Groq
from dotenv import load_dotenv
load_dotenv()


api_key = os.environ.get("GROQ_API_KEY")

client = Groq(api_key=api_key)

def generate_fact_checking_response(query):
    # Retrieve top claim (replace with your actual claim retrieval logic)
    retrieved_claims = retrieve_claim(query, top_k=1)

    if not retrieved_claims:
        return "I'm sorry, I couldn't find relevant information to fact-check this statement."

    claim = retrieved_claims[0]["statement"]
    label = retrieved_claims[0]["label"]
    context = retrieved_claims[0]["context"]
    speaker = retrieved_claims[0]["speaker"]

    # Construct the prompt for the LLM
    # Using a system role can help guide the model's behavior
    system_prompt = """
    You are a fact-checking assistant. Your task is to determine if a provided claim is True or False, based strictly on the provided claim, label, context, and speaker.
    Provide a direct, brief response in the format: "If you are referring to a claim by [speaker], it is categorically [True/False]."
    Do not use external knowledge. Focus only on the provided information.
    """

    user_prompt = f"""
Claim: "{claim}"
Label: "{label}"
Context: "{context}"
Speaker: "{speaker}"
Query: "{query}"

Task: Based strictly on the provided information (Claim, Label, Context, Speaker), determine whether the claim is "True" or "False." Provide a direct response in the following format:

"If you are referring to a claim by [speaker], it is categorically [True/False]."

Ensure the response is clear, brief, and focuses only on the provided information.
"""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]

    try:
        completion = client.chat.completions.create(
            model="meta-llama/llama-4-scout-17b-16e-instruct",
            messages=messages,
            max_tokens= 200, # Adjust max_tokens as needed
             temperature=0.1, # Keep temperature low for factual responses
        )
        # Extract the content from the completion object
        return completion.choices[0].message.content.strip()

    except Exception as e:
        print(f"An error occurred: {e}")
        return "An error occurred while processing the request."


# Example usage:
query = "Is the claim that the graduation rate in Wisconsin has improved since Republicans took over true?"
response = generate_fact_checking_response(query)
print(response)

If you are referring to a claim by dalekooyenga, it is categorically False.


In [12]:
import argparse

In [13]:
def main():
    # Set up command-line argument parsing
    parser = argparse.ArgumentParser(description="Fact-checking question answering system.")
    parser.add_argument("query", type=str, help="The query/question you want to verify.")
    args = parser.parse_args()
    
    # Generate the fact-checking response for the provided query
    response = generate_fact_checking_response(args.query)
    
    # Output the response
    print("System response:", response)

if __name__ == "__main__":
    main()

usage: ipykernel_launcher.py [-h] query
ipykernel_launcher.py: error: the following arguments are required: query


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
import sys
print(sys.executable)



c:\Users\bhara\anaconda3\python.exe
