<a href="https://colab.research.google.com/github/NAFIUROHMAN/IBM-SkillsBuild-Capstone-Project-/blob/main/Capstone_IBM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install langchain langchain-community pandas numpy matplotlib seaborn scikit-learn replicate
!pip install python-dotenv



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# LangChain imports
from langchain_community.llms import Replicate
from langchain.schema import HumanMessage, SystemMessage
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

print("✅ Libraries imported successfully!")

✅ Libraries imported successfully!


In [11]:
from google.colab import userdata
import os

def setup_replicate_token():
    """Setup Replicate API Token dengan nama secret 'API_KEY'"""

    # Method 1: Dari Colab Secrets dengan nama "API_KEY"
    try:
        REPLICATE_API_TOKEN = userdata.get('API_KEY')
        print("✅ Token loaded from Colab Secrets (API_KEY)!")
        print(f"📝 Token preview: {REPLICATE_API_TOKEN[:10]}...")
        return REPLICATE_API_TOKEN
    except Exception as e:
        print(f"❌ Token not found in Colab Secrets as 'API_KEY': {e}")

    # Method 2: Input manual
    print("\n🔑 Please enter your Replicate API Token:")
    print("1. Go to: https://replicate.com/")
    print("2. Sign in and go to API Tokens")
    print("3. Copy your token (starts with r8_...)")

    token = input("Paste your token here: ").strip()

    if token.startswith('r8_'):
        # Save to environment for future use
        os.environ["REPLICATE_API_TOKEN"] = token
        print("✅ Token saved to environment!")
        return token
    else:
        print("❌ Invalid token format. Should start with 'r8_'")
        return None

# Setup token
REPLICATE_API_TOKEN = setup_replicate_token()

if REPLICATE_API_TOKEN:
    print(f"✅ Token ready! First 10 chars: {REPLICATE_API_TOKEN[:10]}...")
else:
    print("🚨 Continuing without LLM - chatbot will use rule-based responses only")

✅ Token loaded from Colab Secrets (API_KEY)!
📝 Token preview: r8_MJkoxoF...
✅ Token ready! First 10 chars: r8_MJkoxoF...


In [12]:
def get_llm_model():
    if not REPLICATE_API_TOKEN:
        print("❌ No API token available. Using rule-based mode only.")
        return None

    try:
        llm = Replicate(
            model="ibm-granite/granite-3.3-8b-instruct",
            replicate_api_token=REPLICATE_API_TOKEN,
            model_kwargs={
                "temperature": 0.7,
                "max_length": 1024,
                "top_p": 0.9,
                "top_k": 50
            }
        )

        # Test the model dengan prompt sederhana
        print("🧪 Testing model connection...")
        test_response = llm("Hello, respond with just 'OK' if working.")
        print(f"✅ Model test response: {test_response}")

        print("✅ IBM Granite Model initialized successfully!")
        return llm

    except Exception as e:
        print(f"❌ Error initializing model: {e}")
        print("🚨 Continuing with rule-based chatbot only")
        return None

llm = get_llm_model()

🧪 Testing model connection...


  test_response = llm("Hello, respond with just 'OK' if working.")


❌ Error initializing model: ReplicateError Details:
title: Unauthenticated
status: 401
detail: You did not pass an authentication token
🚨 Continuing with rule-based chatbot only


In [15]:
from google.colab import files
import io

# Upload file
print("📁 Upload your anxiety.csv file...")
uploaded = files.upload()

# Load dataset
file_name = list(uploaded.keys())[0]
df = pd.read_csv(io.BytesIO(uploaded[file_name]), delimiter=';')

print(f"✅ Dataset loaded: {df.shape}")
print(f"📊 Sample data:")
print(df.head(2))

📁 Upload your anxiety.csv file...


Saving psychological_state_dataset.csv to psychological_state_dataset (1).csv
✅ Dataset loaded: (1000, 1)
📊 Sample data:
  ID,Time,HRV (ms),GSR (μS),EEG Power Bands,Blood Pressure (mmHg),Oxygen Saturation (%),Heart Rate (BPM),Ambient Noise (dB),Cognitive Load,Mood State,Psychological State,Respiration Rate (BPM),Skin Temp (°C),Focus Duration (s),Task Type,Age,Gender,Educational Level,Study Major
0  1,2024-01-01 00:00:00,33.03973885213793,1.0318...                                                                                                                                                                                                                                 
1  2,2024-01-01 00:00:01,49.914651237079866,1.340...                                                                                                                                                                                                                                 


In [16]:
print("🔍 DATASET EXPLORATION:")
print(f"Shape: {df.shape}")
print(f"Columns: {len(df.columns)}")

# Show column categories
print("\n📋 COLUMN CATEGORIES:")
all_columns = df.columns.tolist()

# Categorize columns
demographic_cols = [col for col in all_columns if any(x in col.lower() for x in ['age', 'gender', 'ethnic', 'employ'])]
mental_health_cols = [col for col in all_columns if any(x in col.lower() for x in ['gad', 'depress', 'anx', 'ocd', 'ptsd', 'panic', 'phobi', 'bipolar'])]
therapy_cols = [col for col in all_columns if any(x in col.lower() for x in ['therapy', 'session', 'wait', 'appt', 'treatment'])]

print(f"Demographic columns: {len(demographic_cols)}")
print(f"Mental health columns: {len(mental_health_cols)}")
print(f"Therapy-related columns: {len(therapy_cols)}")

# Show sample of mental health columns
if mental_health_cols:
    print(f"\n🧠 Sample mental health columns: {mental_health_cols[:5]}")

🔍 DATASET EXPLORATION:
Shape: (1000, 1)
Columns: 1

📋 COLUMN CATEGORIES:
Demographic columns: 1
Mental health columns: 0
Therapy-related columns: 0


In [17]:
def clean_percentage(value):
    """Clean percentage values from format like '11% (3/28)'"""
    if pd.isna(value):
        return 0.0
    elif isinstance(value, str):
        try:
            if '%' in value and '(' in value:
                # Extract percentage from "11% (3/28)"
                return float(value.split('%')[0].strip())
            elif '%' in value:
                return float(value.replace('%', '').strip())
            else:
                return float(value)
        except:
            return 0.0
    elif isinstance(value, (int, float)):
        return float(value)
    else:
        return 0.0

def extract_numeric(value):
    """Extract numeric value from various formats"""
    if pd.isna(value):
        return 0.0
    try:
        return float(value)
    except:
        return 0.0

In [18]:
print("🧹 Cleaning data for chatbot...")

# Create a cleaned version for analysis
cleaned_data = df.copy()

# Clean key columns
columns_to_clean = []
for col in df.columns:
    if any(keyword in col.lower() for keyword in ['%', 'agree', 'disagree', 'problem', 'gad', 'depress', 'wait', 'session']):
        columns_to_clean.append(col)

print(f"Cleaning {len(columns_to_clean)} columns...")

# Apply cleaning
for col in columns_to_clean:
    if col in cleaned_data.columns:
        cleaned_data[f'{col}_clean'] = cleaned_data[col].apply(clean_percentage)

print("✅ Data cleaning completed!")

🧹 Cleaning data for chatbot...
Cleaning 1 columns...
✅ Data cleaning completed!


In [31]:
class MentalHealthChatbot:
    def __init__(self, df=None):
        self.df = df
        print("✅ Chatbot initialized successfully!")
        if df is not None:
            print(f"📊 Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")

    def chat(self, user_input):
        user_input_lower = user_input.lower().strip()

        # Greeting responses
        if any(word in user_input_lower for word in ['hello', 'hi', 'hey', 'greetings']):
            return "Hello! I'm your Mental Health Data Assistant. I can help you analyze NHS mental health service data. What would you like to know?"

        # NHS information
        if 'nhs' in user_input_lower and any(word in user_input_lower for word in ['what', 'whats', "what's", 'explain']):
            return """NHS (National Health Service) is the UK's publicly funded healthcare system.
It provides free healthcare to UK residents, including comprehensive mental health services through various trusts and foundations."""

        # Dataset information
        if self.df is not None:
            if 'condition' in user_input_lower and ('what' in user_input_lower or 'show' in user_input_lower or 'list' in user_input_lower):
                condition_cols = [col for col in self.df.columns if 'condition' in col.lower() or 'disease' in col.lower()]
                if condition_cols:
                    conditions = self.df[condition_cols[0]].unique()[:10]
                    return f"Mental health conditions in our dataset:\n• " + "\n• ".join(conditions)
                else:
                    return f"No specific condition column found. Available columns: {', '.join(self.df.columns)}"

            if 'statistic' in user_input_lower or 'summary' in user_input_lower:
                stats = f"""📊 DATASET STATISTICS:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
📋 Total records: {len(self.df):,}
📊 Columns: {len(self.df.columns)}
📍 Column names: {', '.join(self.df.columns[:5])}
{'...' if len(self.df.columns) > 5 else ''}
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"""
                return stats

            if 'trust' in user_input_lower or 'institution' in user_input_lower or 'organisation' in user_input_lower:
                org_cols = [col for col in self.df.columns if any(word in col.lower() for word in ['org', 'trust', 'hospital', 'service'])]
                if org_cols:
                    orgs = self.df[org_cols[0]].unique()[:8]
                    return f"🏥 NHS Organizations in dataset:\n• " + "\n• ".join(orgs)
                else:
                    return f"No organization column found clearly. Available columns: {', '.join(self.df.columns)}"

            if 'column' in user_input_lower or ('what' in user_input_lower and 'data' in user_input_lower):
                return f"📋 Dataset columns ({len(self.df.columns)} total):\n• " + "\n• ".join(self.df.columns)

        else:
            if any(word in user_input_lower for word in ['condition', 'statistic', 'trust', 'data']):
                return "⚠️ I need a dataset to provide specific information. Please load your mental health data first with: chatbot = MentalHealthChatbot(df)"

        # Help message
        if 'help' in user_input_lower:
            return self.get_help_message()

        # Default response
        return """I can help with NHS mental health data analysis! Try asking:

🔍 SAMPLE QUESTIONS:
• "What conditions are in the dataset?"
• "Show me statistics"
• "What NHS trusts are included?"
• "What's NHS?"
• "Show columns"
• "Help" - for more options

Just type your question naturally! 😊"""

    def get_help_message(self):
        return """🤖 MENTAL HEALTH DATA ASSISTANT - HELP MENU
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

📋 AVAILABLE COMMANDS:
• "hello" - Friendly greeting
• "what's NHS?" - Information about NHS
• "show conditions" - List mental health conditions
• "show statistics" - Dataset summary
• "show trusts" - List NHS organizations
• "show columns" - Display dataset structure
• "help" - Show this menu
• "quit" - Exit chat

💡 TIP: Ask natural questions like "What data do you have?" or "Tell me about depression statistics"

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"""

# Initialize chatbot with your dataframe
chatbot = MentalHealthChatbot(df)

# Test it quickly
print("\n🧪 QUICK TEST:")
print("Q: Hello")
print("A:", chatbot.chat("hello"))
print("\nQ: What's NHS?")
print("A:", chatbot.chat("what's NHS?"))

✅ Chatbot initialized successfully!
📊 Dataset loaded: 1000 rows, 1 columns

🧪 QUICK TEST:
Q: Hello
A: Hello! I'm your Mental Health Data Assistant. I can help you analyze NHS mental health service data. What would you like to know?

Q: What's NHS?
A: NHS (National Health Service) is the UK's publicly funded healthcare system. 
It provides free healthcare to UK residents, including comprehensive mental health services through various trusts and foundations.


In [32]:
def run_interactive_chat():
    print("🤖 MENTAL HEALTH DATA CHATBOT ACTIVATED!")
    print("Type 'quit' to exit, 'help' for options\n")

    chat_history = []

    while True:
        user_input = input("👤 You: ").strip()

        if user_input.lower() in ['quit', 'exit', 'bye']:
            print("🤖 Thank you for using the Mental Health Data Assistant!")
            break

        if user_input.lower() == 'help':
            print(chatbot.get_help_message())
            continue

        if user_input:
            print("🤖 Analyzing...")
            response = chatbot.chat(user_input)
            print(f"🤖 Bot: {response}\n")

            # Save to history
            chat_history.append({"user": user_input, "bot": response})
        else:
            print("🤖 Please enter a question or type 'help' for options\n")

    return chat_history

# Ubah baris ini - hilangkan tanda #
chat_history = run_interactive_chat()

🤖 MENTAL HEALTH DATA CHATBOT ACTIVATED!
Type 'quit' to exit, 'help' for options

👤 You: helo
🤖 Analyzing...
🤖 Bot: I can help with NHS mental health data analysis! Try asking:

🔍 SAMPLE QUESTIONS:
• "What conditions are in the dataset?"
• "Show me statistics"  
• "What NHS trusts are included?"
• "What's NHS?"
• "Show columns"
• "Help" - for more options

Just type your question naturally! 😊

👤 You: show me statistics
🤖 Analyzing...
🤖 Bot: 📊 DATASET STATISTICS:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
📋 Total records: 1,000
📊 Columns: 1
📍 Column names: ID,Time,HRV (ms),GSR (μS),EEG Power Bands,Blood Pressure (mmHg),Oxygen Saturation (%),Heart Rate (BPM),Ambient Noise (dB),Cognitive Load,Mood State,Psychological State,Respiration Rate (BPM),Skin Temp (°C),Focus Duration (s),Task Type,Age,Gender,Educational Level,Study Major

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

👤 You: whats nhs?
🤖 Analyzing...
🤖 Bot: NHS (National Health Service) is the UK's publicly funded healthcare system. 
It p