# 01 - Exploratory Data Analysis (EDA)


In [None]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

# Locations of the input CSVs, resolved relative to the notebook's working directory.
PROJECT_ROOT = Path('..')
DATA_DIR = PROJECT_ROOT.joinpath('data')
TRAIN_PATH = DATA_DIR.joinpath('train.csv')
TEST_PATH = DATA_DIR.joinpath('test.csv')

# Read both splits (train first, then test) and preview the first training rows.
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))
train.head()


In [None]:
# Print a concise summary of the training frame (see pandas DataFrame.info).
train.info()


In [None]:
# Share of missing values per column; show the 20 columns with the most gaps.
missing_share = train.isna().mean()
missing_share.sort_values(ascending=False).head(20)


## Target Distribution


In [None]:
target_col = 'Target'

# Relative frequency of each target value, scaled to percent.
target_share_pct = train[target_col].value_counts(normalize=True) * 100
ax = target_share_pct.plot(kind='bar', figsize=(6,3), title='Target distribution (%)')
plt.show()


## Key Feature Distributions


In [None]:
# Plot the distribution of a few columns of interest, skipping any that are absent.
for col in ['confidence', 'community', 'indicator']:
    if col not in train.columns:
        continue

    series = train[col]
    # Comparing str(dtype) to 'object' misses string-extension and categorical
    # columns, which would then be sent to a KDE histogram; use the pandas
    # dtype predicates so every text-like column gets the bar chart.
    is_text_like = (
        pd.api.types.is_object_dtype(series)
        or pd.api.types.is_string_dtype(series)
        or isinstance(series.dtype, pd.CategoricalDtype)
    )

    plt.figure(figsize=(6, 3))
    if is_text_like:
        # Only the 20 most frequent values are shown.
        series.value_counts().head(20).plot(kind='bar', title=f'{col} (top 20)')
    else:
        sns.histplot(series.dropna(), kde=True)
        plt.title(col)
    plt.show()


## Text Lengths and Common Words


In [None]:
from collections import Counter

text_col = 'indicator_description'
if text_col in train.columns:
    # Replace missing descriptions with empty strings first: astype(str) alone
    # would turn NaN into the literal token 'nan', which inflates both the
    # length histogram and the word counts.
    texts = train[text_col].fillna('').astype(str)

    # Whitespace-delimited token count per row (stored on the frame as 'text_len').
    train['text_len'] = texts.str.split().map(len)
    sns.histplot(train['text_len'], bins=50)
    plt.title('Text length (tokens)')
    plt.show()

    # Lower-case, blank out every character other than a-z, 0-9 and space,
    # then tally the resulting tokens across all rows.
    cleaned = texts.str.lower().str.replace(r'[^a-z0-9 ]', ' ', regex=True)
    cnt = Counter()
    for tokens in cleaned.str.split():
        cnt.update(tokens)

    # A bare expression nested inside an `if` is not auto-displayed by the
    # notebook, so the 30 most frequent words are printed explicitly.
    top_words = pd.Series(cnt, dtype='int64').sort_values(ascending=False).head(30)
    print(top_words)
else:
    print('No text column found for EDA')
