# In [None]:
import pandas as pd
import numpy as np
import os

# 1. Read Raw Data
# Loads `data_file` and draws `rows_to_sample` rows from it into `df_clean`.
# If the file is missing, `df_clean` is filled with seeded random data instead
# (12 numeric columns plus a string label column).
data_file = 'mental-state.csv'
rows_to_sample = 150
random_seed = 42

try:
    # read_csv raises FileNotFoundError itself for a missing path, so no
    # separate existence check is needed; only the read sits inside the try.
    df = pd.read_csv(data_file)
except FileNotFoundError:
    print("⚠️ Warning: CSV file not found. Using randomly generated fallback data...")
    np.random.seed(random_seed)

    # Fallback: Generate simulated data structure
    sim_data = {
        f'Col_{i}': np.random.rand(rows_to_sample) * (i + 1) for i in range(12)
    }
    # Add a target column; labels are stored as strings to simulate messy labels
    sim_data['Target_State'] = np.random.randint(0, 3, rows_to_sample).astype(str)
    df_clean = pd.DataFrame(sim_data)
    print("✅ Successfully generated fallback simulated data!")
else:
    print("✅ Successfully loaded real Kaggle data file!")

    # Sample data to meet row requirement (100 < rows < 200).
    # Draw with replacement only when the file has fewer rows than requested;
    # otherwise every sampled row is distinct and no duplicates are introduced.
    needs_replacement = len(df) < rows_to_sample
    df_clean = df.sample(
        n=rows_to_sample,
        replace=needs_replacement,
        random_state=random_seed,
    ).copy()


# ---------------------------------------------------------
# 2. Rename Columns
# ---------------------------------------------------------
# Canonical column names: twelve feature columns followed by the target column.
standard_names = [
    'Neuron_Input_AF7', 'Neuron_Input_TP9', 'Neuron_Input_AF8', 'Neuron_Input_TP10',
    'Synapse_Activity_Alpha', 'Synapse_Activity_Beta', 'Synapse_Activity_Delta',
    'Metabolic_Rate_1', 'Metabolic_Rate_2', 'Ion_Channel_Flux_1', 'Ion_Channel_Flux_2',
    'Membrane_Potential', 'Bio_Computer_State'
]
column_labels = df_clean.columns.tolist()

if len(column_labels) < 13:
    # Fewer than 13 columns: rename positionally for as many columns as exist.
    # zip stops at the shorter sequence, so nothing past the last column is mapped.
    df_clean.rename(columns=dict(zip(column_labels, standard_names)), inplace=True)
else:
    # The first 12 columns are taken as features and the last column as the
    # target; everything in between is dropped before the names are assigned.
    kept_labels = column_labels[:12] + [column_labels[-1]]
    df_clean = df_clean[kept_labels].copy()
    df_clean.columns = standard_names
    

# ---------------------------------------------------------
# 3. Feature Engineering
# ---------------------------------------------------------
# Total_Input_Power: sum of the four Neuron_Input_* columns, added left to right.
input_power = df_clean['Neuron_Input_AF7']
for channel in ('Neuron_Input_TP9', 'Neuron_Input_AF8', 'Neuron_Input_TP10'):
    input_power = input_power + df_clean[channel]
df_clean['Total_Input_Power'] = input_power

# Synapse_Ratio: alpha over beta, with a small constant added to the denominator.
alpha_activity = df_clean['Synapse_Activity_Alpha']
beta_offset = df_clean['Synapse_Activity_Beta'] + 0.001
df_clean['Synapse_Ratio'] = alpha_activity / beta_offset

# ---------------------------------------------------------
# 4. Process Target Label
# ---------------------------------------------------------
def clean_label(value):
    """Collapse a raw state label into a binary flag.

    Args:
        value: Raw label. May be a number or a string; strings are stripped
            of surrounding whitespace before being read as a number, so
            ``'1'``, ``' 1 '`` and ``'1.0'`` are all treated like ``1``.

    Returns:
        ``1`` when the label is numerically equal to one, otherwise ``0``.
        Labels that cannot be read as a number (``None``, arbitrary text,
        missing-value markers) also yield ``0``.
    """
    if isinstance(value, str):
        value = value.strip()
    try:
        # Numeric comparison so that 1, 1.0, '1' and '1.0' all agree.
        return 1 if float(value) == 1.0 else 0
    except (TypeError, ValueError, OverflowError):
        # Not interpretable as a finite number: treat as "not state 1".
        return 0

target_col_name = 'Bio_Computer_State'
if target_col_name not in df_clean.columns:
    # Nothing to clean: report it and leave the frame untouched.
    print(f"⚠️ Warning: Target column '{target_col_name}' not found for cleaning.")
else:
    # Replace each raw label with the value clean_label returns for it.
    raw_labels = df_clean[target_col_name]
    df_clean[target_col_name] = raw_labels.apply(clean_label)


# ---------------------------------------------------------
# 5. Save Cleaned Data
# ---------------------------------------------------------
# Write the processed frame to CSV (without the index) and print a summary.
output_file = 'organoid_data_cleaned.csv'
df_clean.to_csv(output_file, index=False)

row_count, column_count = df_clean.shape
summary_lines = (
    "\n✅ Data processing complete!",
    f"Output saved to: {output_file}",
    f"Current rows: {row_count} (Meets 100 < rows < 200 requirement)",
    f"Current columns: {column_count} (Meets 10 < columns < 20 requirement)",
    "New features added: 'Total_Input_Power', 'Synapse_Ratio'",
)
for summary_line in summary_lines:
    print(summary_line)

import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")

# -------------------------------------------------------
# Plot 1: Distribution of States
# -------------------------------------------------------
# Bar chart of how many rows carry each Bio_Computer_State value.
state_fig, state_ax = plt.subplots(figsize=(6, 4))
sns.countplot(
    data=df_clean,
    x='Bio_Computer_State',
    hue='Bio_Computer_State',
    palette='viridis',
    legend=False,
    ax=state_ax,
)
state_ax.set_title('Bio-Computer Activity State Distribution', fontsize=15)
state_ax.set_xlabel('State (0=Inactive, 1=Active)', fontsize=12)
state_ax.set_ylabel('Sample Count', fontsize=12)
plt.show()

# -------------------------------------------------------
# Plot 2: Correlation Heatmap
# -------------------------------------------------------
# Heatmap of pairwise correlations between the numeric columns of df_clean.
plt.figure(figsize=(10, 8))

# Restrict to numeric columns before correlating: recent pandas versions raise
# on non-numeric columns instead of dropping them, and such columns can be
# present when the input file had text columns or too few columns to rename.
numeric_frame = df_clean.select_dtypes(include='number')
corr_matrix = numeric_frame.corr()

sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Map of Bio-Signals', fontsize=15)
plt.show()
