In [1]:
# Show the directory the notebook kernel starts in.
%pwd

'c:\\Users\\nihar\\OneDrive\\Desktop\\debatemodel\\research'

In [2]:
import os,glob
# Later cells use paths relative to the project root (e.g. "data/*.pdf").
# Only step up when data/ is not already visible from the current directory,
# so re-running this cell does not keep climbing one level per execution.
if not os.path.isdir("data"):
    os.chdir('../')

In [3]:
# Sanity check after the directory change: confirm the working directory and
# list the input files that the following cells read from data/.
%pwd
print(os.getcwd())  # current working dir
print(glob.glob("data/*.pdf"))  # list PDFs inside data/
print(glob.glob("data/*.csv"))  # list CSVs inside data/

c:\Users\nihar\OneDrive\Desktop\debatemodel
['data\\21 Lessons for the 21st Century by Yuval Noah Harari.pdf', 'data\\Argumentation and Debates.pdf', 'data\\Factfulness by Hans Rosling.pdf', 'data\\Thank You for Arguing_ What Aristotle, Lincoln, and Homer Simpson Can Teach Us .. by Jay Heinrichs.pdf', 'data\\The_Art_of_Public_Speaking.pdf', 'data\\Thinking, Fast and Slow by Daniel Kahneman.pdf']
['data\\1.csv', 'data\\2.csv', 'data\\3.csv', 'data\\4.csv', 'data\\5.csv', 'data\\6.csv', 'data\\7.csv', 'data\\8.csv', 'data\\9.csv']


In [5]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


In [None]:
def load_pdf_file(path):
    """Load the PDF at `path` with PyPDFLoader and return the loaded documents."""
    return PyPDFLoader(path).load()

# Forward slashes work on both Windows and POSIX and need no escaping,
# matching the "data/*.pdf" pattern used when listing the folder.
path = "data/21 Lessons for the 21st Century by Yuval Noah Harari.pdf"

extracted_data = load_pdf_file(path)
print(len(extracted_data))

340


In [6]:
# Quick Setup and Usage Example
# Save this as normalize_csvs.py and run it

import pandas as pd
import os

# Column order of one normalized record, before the helper columns are added.
_NORMALIZED_COLUMNS = [
    'file_id', 'pair_id', 'topic', 'task',
    'premise_id', 'premise_text', 'hypothesis_id', 'hypothesis_text',
    'argument_type', 'original_entailment', 'original_argument',
    'complex_attack_type', 'source_file',
]


def _first_present(row, columns, default=""):
    """Return, as a string, the first non-missing value of `row` among `columns`."""
    for column in columns:
        if column in row.index:
            value = row[column]
            if pd.notna(value):
                return str(value)
    return default


def _normalize_row(row, file_name, file_path):
    """Turn one raw CSV row into a normalized record (a dict keyed by _NORMALIZED_COLUMNS)."""
    # Either spelling of the entailment column may carry the value; a NaN in the
    # first must not hide a real value in the second.
    entailment_raw = _first_present(row, ('_entailment', '_ENTAILMENT'))
    argument_raw = _first_present(row, ('_argument', '_BAF')).lower()

    # Entailment wins when it is decisive; otherwise fall back to the argument
    # label, and to 'neutral' for anything outside the three known values.
    verdict = entailment_raw.upper()
    if verdict in ('YES', 'ENTAILMENT'):
        argument_type = 'support'
    elif verdict in ('NO', 'CONTRADICTION'):
        argument_type = 'attack'
    elif argument_raw in ('attack', 'support', 'neutral'):
        argument_type = argument_raw
    else:
        argument_type = 'neutral'

    return {
        'file_id': file_name,
        'pair_id': _first_present(row, ('_id',)),
        'topic': _first_present(row, ('_topic',)),
        'task': _first_present(row, ('_task',), default='ARG'),
        # The premise lives under 't/...' or 't/0/...' depending on the file format
        'premise_id': _first_present(row, ('t/_id', 't/0/_id')),
        'premise_text': _first_present(row, ('t/__text', 't/0/__text')).strip(),
        'hypothesis_id': _first_present(row, ('h/_id',)),
        'hypothesis_text': _first_present(row, ('h/__text',)).strip(),
        'argument_type': argument_type,  # Unified column
        'original_entailment': entailment_raw,
        'original_argument': argument_raw,
        'complex_attack_type': _first_present(row, ('_complex-attack',)),
        'source_file': file_path,
    }


def quick_normalize_csvs(data_folder="data"):
    """Normalize every ``*.csv`` file in `data_folder` into one DataFrame.

    Each input row becomes a record with the premise/hypothesis ids and texts,
    the topic and task, and a unified ``argument_type`` ('support', 'attack'
    or 'neutral') derived from the entailment and argument columns. Missing
    values are stored as empty strings (``task`` defaults to 'ARG').

    Args:
        data_folder: Folder that is searched (non-recursively) for CSV files.

    Returns:
        A DataFrame with the normalized columns plus ``unique_id``
        (``file_id`` + '_' + ``pair_id``) and ``content_length`` (combined
        length of both texts). Rows where both texts are empty are dropped.
        When no rows are found the frame is empty but still has all columns.

    Files that cannot be read or processed are reported on stdout and skipped
    entirely; they contribute no rows.
    """
    import glob

    # Sorted so the row order does not depend on the filesystem's listing order
    csv_files = sorted(glob.glob(os.path.join(data_folder, "*.csv")))
    all_data = []

    for file_path in csv_files:
        print(f"Processing {file_path}...")

        try:
            df = pd.read_csv(file_path)
            file_name = os.path.splitext(os.path.basename(file_path))[0]
            # Build the whole file first so a failure midway leaves no partial rows
            file_rows = [
                _normalize_row(row, file_name, file_path)
                for _, row in df.iterrows()
            ]
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            continue

        all_data.extend(file_rows)

    # Explicit columns keep the schema stable even when there are no rows
    df_normalized = pd.DataFrame(all_data, columns=_NORMALIZED_COLUMNS)

    # Clean empty rows; copy so the helper columns are added to an owned frame
    has_text = (
        (df_normalized['premise_text'] != '') |
        (df_normalized['hypothesis_text'] != '')
    )
    df_normalized = df_normalized.loc[has_text].copy()

    # Add helper columns
    df_normalized['unique_id'] = df_normalized['file_id'] + '_' + df_normalized['pair_id']
    df_normalized['content_length'] = (df_normalized['premise_text'].str.len() +
                                       df_normalized['hypothesis_text'].str.len())

    return df_normalized

def create_rag_documents(df):
    """Expand each normalized pair into up to two retrieval documents.

    Every row contributes a 'premise' record and a 'hypothesis' record, each
    emitted only when the corresponding text is non-blank. The records are
    returned as a DataFrame.
    """
    records = []

    for _, pair in df.iterrows():
        # Header reused as the 'metadata' field and as the prefix of 'full_context'
        header = f"TOPIC: {pair['topic']} | ARGUMENT_TYPE: {pair['argument_type']}"
        shared = {
            'topic': pair['topic'],
            'argument_type': pair['argument_type'],
            'metadata': header,
        }

        premise = pair['premise_text']
        if premise.strip():
            records.append({
                'doc_id': f"{pair['unique_id']}_premise",
                'content': premise,
                'doc_type': 'premise',
                **shared,
                'full_context': f"{header} | PREMISE: {premise}",
            })

        hypothesis = pair['hypothesis_text']
        if hypothesis.strip():
            records.append({
                'doc_id': f"{pair['unique_id']}_hypothesis",
                'content': hypothesis,
                'doc_type': 'hypothesis',
                **shared,
                'full_context': f"{header} | HYPOTHESIS: {hypothesis}",
            })

    return pd.DataFrame(records)

# Run the normalization
# Driver: normalizes the CSVs, writes both result files to the current working
# directory, then prints summary statistics and a short preview.
if __name__ == "__main__":
    print("üöÄ Starting CSV normalization with unified argument types...")
    
    # Normalize all CSVs in the data folder
    normalized_df = quick_normalize_csvs("data")  # Change "data" to your folder path
    
    print(f"‚úÖ Processed {len(normalized_df)} rows")
    
    # Save normalized data
    normalized_df.to_csv("normalized_debate_data_unified.csv", index=False)
    print("üíæ Saved: normalized_debate_data_unified.csv")
    
    # Create RAG format (this is the file the Pinecone upload cells read back)
    rag_df = create_rag_documents(normalized_df)
    rag_df.to_csv("rag_debate_documents_unified.csv", index=False) 
    print("üíæ Saved: rag_debate_documents_unified.csv")
    
    # Show statistics
    print(f"\nüìä STATISTICS:")
    print(f"Total pairs: {len(normalized_df)}")
    print(f"Unique topics: {normalized_df['topic'].nunique()}")
    print(f"Topics: {list(normalized_df['topic'].unique())}")
    print(f"Argument type distribution:")
    print(normalized_df['argument_type'].value_counts())
    
    print(f"\nRAG documents created: {len(rag_df)}")
    
    # Show preview
    print(f"\nüëÄ PREVIEW OF UNIFIED DATA:")
    print(normalized_df[['topic', 'premise_text', 'hypothesis_text', 'argument_type']].head(10))
    
    # Show mapping examples: how the first five rows' raw labels map to the unified label
    print(f"\nüîç MAPPING EXAMPLES:")
    sample = normalized_df[['original_entailment', 'original_argument', 'argument_type']].head(5)
    for _, row in sample.iterrows():
        print(f"Entailment: '{row['original_entailment']}' + Argument: '{row['original_argument']}' ‚Üí Unified: '{row['argument_type']}'")

üöÄ Starting CSV normalization with unified argument types...
Processing data\1.csv...
Processing data\2.csv...
Processing data\3.csv...
Processing data\4.csv...
Processing data\5.csv...
Processing data\6.csv...
Processing data\7.csv...
Processing data\8.csv...
Processing data\9.csv...
‚úÖ Processed 1123 rows
üíæ Saved: normalized_debate_data_unified.csv
üíæ Saved: rag_debate_documents_unified.csv

üìä STATISTICS:
Total pairs: 1123
Unique topics: 25
Topics: ['Internetaccess', 'Groundzeromosque', 'Militaryservice', 'Noflyzone', 'Securityprofiling', 'Solarenergy', 'Gasvehicles', 'Cellphones', 'Marijuanafree', 'Gaymarriage', 'Vegetarianism', 'Violentgames', 'Chinaonechildpolicy', 'Cocanarcotic', 'Childbeautycontests', 'Arminglibianrebels', 'Sobrietytest', 'Osamaphoto', 'Privatizingsocialsecurity', 'Tablet', 'Obesity', 'Abortion', 'Act1-TwelveAngryMan', 'Act2-TwelveAngryMan', 'Act3-TwelveAngryMan']
Argument type distribution:
argument_type
attack     629
support    344
neutral    150
N

In [7]:
from dotenv import load_dotenv

# Reload fresh from .env file
load_dotenv(override=True)  # override=True was added so values from .env are re-read on every run

# Now check again: read the key from the environment and build a Pinecone client with it
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
from pinecone import Pinecone
pinecone_api_key=PINECONE_API_KEY  # lowercase alias of the same key
pc=Pinecone(api_key=pinecone_api_key)

Exception: The official Pinecone python package has been renamed from `pinecone-client` to `pinecone`. Please remove `pinecone-client` from your project dependencies and add `pinecone` instead. See the README at https://github.com/pinecone-io/pinecone-python-client for more information on using the python SDK.

In [8]:
# setup_pinecone.py
import os
import pandas as pd
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec

# Load env vars
load_dotenv(override=True)
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Load your RAG debate dataset.
# keep_default_na=False keeps every cell as text. With pandas' default NA
# parsing, empty cells and literal "nan"/"NA"/"null" text are read back as
# float NaN, which would then be passed to the text encoder and placed in the
# vector metadata (NOTE(review): confirm NaN is rejected by Pinecone metadata).
df = pd.read_csv("rag_debate_documents_unified.csv", keep_default_na=False)

# Embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Initialize Pinecone client
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "debate-knowledge-base"

# Create index if not exists
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # embedding dim for MiniLM
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

# Connect to index
index = pc.Index(index_name)

# Prepare and upsert embeddings: one vector per row of the "content" column
embeddings = model.encode(df["content"].tolist(), convert_to_numpy=True, show_progress_bar=True)

# Each vector carries its full CSV row as metadata. IDs are the row positions,
# so a re-run overwrites vectors position by position.
vectors = []
for i, (emb, meta) in enumerate(zip(embeddings, df.to_dict(orient="records"))):
    vectors.append({
        "id": str(i),       # unique ID
        "values": emb.tolist(),
        "metadata": meta
    })

# Batch upload
batch_size = 100
for i in range(0, len(vectors), batch_size):
    index.upsert(vectors[i:i+batch_size])

print(f"‚úÖ Uploaded {len(vectors)} debate documents to Pinecone")


Exception: The official Pinecone python package has been renamed from `pinecone-client` to `pinecone`. Please remove `pinecone-client` from your project dependencies and add `pinecone` instead. See the README at https://github.com/pinecone-io/pinecone-python-client for more information on using the python SDK.

In [21]:
# setup_pinecone.py
import os
import pandas as pd
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec

# Load API key from .env
load_dotenv(override=True)
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Load your dataset.
# keep_default_na=False keeps every cell as text. With pandas' default NA
# parsing, empty cells and literal "nan"/"NA"/"null" text are read back as
# float NaN, which would then be passed to the text encoder and placed in the
# vector metadata (NOTE(review): confirm NaN is rejected by Pinecone metadata).
df = pd.read_csv("rag_debate_documents_unified.csv", keep_default_na=False)

# Embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Initialize Pinecone client
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "debate-knowledge-base"

# Create index if it does not exist
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # embedding size for MiniLM
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

# Connect to the index
index = pc.Index(index_name)

# Encode embeddings: one vector per row of the "content" column
embeddings = model.encode(df["content"].tolist(), convert_to_numpy=True, show_progress_bar=True)

# Format vectors with metadata. Each vector carries its full CSV row; IDs are
# the row positions, so a re-run overwrites vectors position by position.
vectors = []
for i, (emb, meta) in enumerate(zip(embeddings, df.to_dict(orient="records"))):
    vectors.append({
        "id": str(i),
        "values": emb.tolist(),
        "metadata": meta
    })

# Batch upload
batch_size = 100
for i in range(0, len(vectors), batch_size):
    index.upsert(vectors[i:i+batch_size])

print(f"‚úÖ Uploaded {len(vectors)} debate documents to Pinecone")


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 70/70 [00:16<00:00,  4.20it/s]


‚úÖ Uploaded 2237 debate documents to Pinecone


In [9]:
# debate_bot_pinecone.py
import os
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone

# Load env
load_dotenv(override=True)
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Init Pinecone: `index` is the handle debate_response() queries
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index("debate-knowledge-base")

# Embedding model: `model` is used by debate_response() to embed the user's statement
model = SentenceTransformer("all-MiniLM-L6-v2")

def debate_response(user_input, user_side="support", top_k=5):
    """Retrieve stored arguments that are not on the user's side.

    Args:
        user_input: The user's statement; it is embedded with the module-level
            `model` and used as the query vector against the module-level `index`.
        user_side: 'support' or 'attack'. Entries whose ``argument_type``
            equals this value are excluded; every other type, including
            'neutral', is returned.
        top_k: Maximum number of matches to request from the index.

    Returns:
        A list of metadata dicts for the matching entries, in the order the
        index returned them.
    """
    user_emb = model.encode([user_input], convert_to_numpy=True)[0]

    results = index.query(
        vector=user_emb.tolist(),
        top_k=top_k,
        include_metadata=True,
        # Exclude the user's own side inside the index, so the top_k slots are
        # not used up by entries that would be discarded afterwards.
        filter={"argument_type": {"$ne": user_side}},
    )

    # Same check as the server-side filter, kept as a safeguard
    return [
        match["metadata"] for match in results["matches"]
        if match["metadata"]["argument_type"] != user_side
    ]

# Demo: ask for counter-arguments to one supporting statement and print a
# one-line summary of each result.
if __name__ == "__main__":
    user_statement = "I believe internet access should be a human right."
    counter = debate_response(user_statement, user_side="support")

    print("üî• Counter Arguments Retrieved:")
    for arg in counter:
        # Only the first 200 characters of the content are shown
        print(f"- {arg['doc_type'].upper()} | {arg['topic']}: {arg['content'][:200]}...")


Exception: The official Pinecone python package has been renamed from `pinecone-client` to `pinecone`. Please remove `pinecone-client` from your project dependencies and add `pinecone` instead. See the README at https://github.com/pinecone-io/pinecone-python-client for more information on using the python SDK.

In [None]:
# import time

# def debate_session(topic, chatModel, retriever, prompt, rounds=6):
#     """
#     Conducts a turn-based debate for a given topic between the human (you)
#     and the debate assistant (AI).
#     Each side speaks alternately.
#     """

#     print(f"\nüé§ Debate Topic: {topic}")
#     print("=" * 70)

#     # Create retrieval + QA chain
#     question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
#     rag_chain = create_retrieval_chain(retriever, question_answer_chain)

#     user_turn = f"I believe {topic} because..."
    
#     for i in range(rounds):
#         print(f"\nüßç‚Äç‚ôÇÔ∏è You: {user_turn}")
        
#         # Model's turn — responds using context
#         response = rag_chain.invoke({"input": user_turn})
#         model_reply = response["answer"]

#         print(f"\nü§ñ DebateBot: {model_reply}")

#         # Add a small pause for readability (simulate real-time debate)
#         time.sleep(2)

#         # Optional: Auto-generate a counter-argument for realism
#         if i < rounds - 1:
#             user_turn = f"However, I would argue that {topic} might also have opposing concerns, such as {model_reply.split('.')[0].lower()}..."
    
#     print("\nüèÅ Debate concluded! Great discussion üëè")

# # Example usage:
# debate_session(
#     topic="Artificial intelligence should be regulated by governments.",
#     chatModel=chatModel,
#     retriever=retreiver,
#     prompt=prompt,
#     rounds=6  # about 1 minute of debate
# )
