# EDA Section
---

## Data import

In [1]:
# Import data.csv
import pandas as pd
df = pd.read_csv('data.csv')

# Checking
df.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


## Library import

In [9]:
import seaborn as sns
import matplotlib.pyplot as plt

## Seperate Column types

In [12]:
categorical_cols = df.select_dtypes(include=['object', 'bool']).columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

## Chi-Square Test of Independence

In [None]:
from scipy.stats import chi2_contingency

# Type of variables
dependent_var = 'class'
independent_vars = [col for col in categorical_cols if col != dependent_var]

# Loop through each independent variable and perform Chi-Squared test
results = {}
for var in independent_vars:
    contingency_table = pd.crosstab(df[var], df[dependent_var])
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    results[var] = {'chi2': chi2, 'p-value': p, 'dof': dof}

# ANSI color codes
RED = '\033[91m'
GREEN = '\033[92m'
BOLD = '\033[1m'
RESET = '\033[0m'

significant_vars = []
non_significant_vars = []

# Header
print(f"{BOLD}=================== Chi-Squared Test Results ==================={RESET}")
print('-'*65)
print(f"{BOLD}{'Variable':<20} {'Chi2':<10} {'p-value':<10} {'dof':<5} Result{RESET}")
print('-'*65)

# Results
for var, res in results.items():
    if res['p-value'] < 0.05:
        color = GREEN
        significance = "มีความสัมพันธ์ทางสถิติ O"
        significant_vars.append(var)
    else:
        color = RED
        significance = "ไม่มีความสัมพันธ์ทางสถิติ X"
        non_significant_vars.append(var)
    
    # Color for clear visibility
    line = f"{var:<20} {res['chi2']:<10.4f} {res['p-value']:<10.4f} {res['dof']:<5} {significance}"
    print(f"{color}{line}{RESET}")

# Summary
print(f"\n{BOLD}============================ Summary ============================{RESET}")
print(f"{RED}Non-Significant Variables:{RESET} {non_significant_vars}")


-----------------------------------------------------------------
[1mVariable             Chi2       p-value    dof   Result[0m
-----------------------------------------------------------------
[92mGender               103.0369   0.0000     1     มีความสัมพันธ์ทางสถิติ O[0m
[92mPolyuria             227.8658   0.0000     1     มีความสัมพันธ์ทางสถิติ O[0m
[92mPolydipsia           216.1716   0.0000     1     มีความสัมพันธ์ทางสถิติ O[0m
[92msudden weight loss   97.2963    0.0000     1     มีความสัมพันธ์ทางสถิติ O[0m
[92mweakness             29.7679    0.0000     1     มีความสัมพันธ์ทางสถิติ O[0m
[92mPolyphagia           59.5953    0.0000     1     มีความสัมพันธ์ทางสถิติ O[0m
[92mGenital thrush       5.7921     0.0161     1     มีความสัมพันธ์ทางสถิติ O[0m
[92mvisual blurring      31.8085    0.0000     1     มีความสัมพันธ์ทางสถิติ O[0m
[91mItching              0.0462     0.8297     1     ไม่มีความสัมพันธ์ทางสถิติ X[0m
[92mIrritability         45.2083    0.0000     1     