Exploratory Data Analysis 

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import normaltest
import math

In [None]:
# Load the raw dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="data.csv")

In [None]:
# First look at the data: size, sample rows, dtypes and summary statistics.
print(df.shape)
display(df.head(5))
print(df.dtypes)
df.info()
display(df.describe(include="all").transpose())

# Number of distinct values per column.
display(df.nunique())

# Data-quality checks: missing values per column and fully duplicated rows.
print(df.isna().sum())
print("Duplicates:", df.duplicated().sum())

In [None]:
# Visualise where values are missing: one cell per (row, column) of df.
sns.heatmap(df.isna(), cbar=False).set_title("Missing Value Heatmap")
plt.show()

In [None]:
# Split the column names by dtype so later cells can loop over each group.
categorical_features = (
    df.select_dtypes(include=["object", "category"]).columns.to_list()
)
numerical_features = df.select_dtypes(include="number").columns.to_list()

In [None]:
def plot_histograms(df, cols, kde=True, bins=30, PLOTS_PER_ROW=3):
    """Draw a grid of histograms, one subplot per column in ``cols``.

    Parameters
    ----------
    df : DataFrame holding the columns to plot.
    cols : iterable of column names; each gets its own subplot.
    kde : forwarded to ``sns.histplot`` (overlay a kernel density estimate).
    bins : forwarded to ``sns.histplot`` (number of histogram bins).
    PLOTS_PER_ROW : number of subplots per grid row.

    Returns
    -------
    None. The figure is shown with ``plt.show()``. Nothing is drawn when
    ``cols`` is empty.
    """
    cols = list(cols)
    n = len(cols)
    if n == 0:
        # A zero-row grid is rejected by plt.subplots, so there is nothing to do.
        return
    rows = math.ceil(n / PLOTS_PER_ROW)

    # squeeze=False always yields a 2-D array of Axes, so flatten() also works
    # for a 1x1 grid (where the default would return a bare Axes object).
    fig, axes = plt.subplots(
        rows,
        PLOTS_PER_ROW,
        figsize=(6 * PLOTS_PER_ROW, 4 * rows),
        squeeze=False,
    )
    axes = axes.flatten()

    for ax, col in zip(axes, cols):
        sns.histplot(df[col], kde=kde, bins=bins, ax=ax)
        ax.set_title(f"Distribution of {col}")

    # Hide the unused trailing cells of the grid.
    for ax in axes[n:]:
        ax.set_visible(False)

    fig.tight_layout()
    plt.show()

In [None]:
# Draw one horizontal boxplot per numeric column to surface outliers.
# Iterates `numerical_features` (the list built from df.select_dtypes);
# the previous name `numeric_features` was never defined and raised NameError.
for col in numerical_features:
    plt.figure(figsize=(6, 2))
    sns.boxplot(x=df[col])
    plt.title(f"Boxplot of {col}")
    plt.show()

In [None]:
def plot_countplots(df, cols, top_n=10, PLOTS_PER_ROW=3):
    """Draw a grid of horizontal count plots, one subplot per column in ``cols``.

    Only the ``top_n`` most frequent values of each column are shown, ordered
    by descending frequency.

    Parameters
    ----------
    df : DataFrame holding the columns to plot.
    cols : iterable of column names; each gets its own subplot.
    top_n : number of most frequent categories to display per column.
    PLOTS_PER_ROW : number of subplots per grid row.

    Returns
    -------
    None. The figure is shown with ``plt.show()``. Nothing is drawn when
    ``cols`` is empty.
    """
    cols = list(cols)
    n = len(cols)
    if n == 0:
        # A zero-row grid is rejected by plt.subplots, so there is nothing to do.
        return
    rows = math.ceil(n / PLOTS_PER_ROW)

    # squeeze=False always yields a 2-D array of Axes, so flatten() also works
    # for a 1x1 grid (where the default would return a bare Axes object).
    fig, axes = plt.subplots(
        rows,
        PLOTS_PER_ROW,
        figsize=(6 * PLOTS_PER_ROW, 4 * rows),
        squeeze=False,
    )
    axes = axes.flatten()

    for ax, col in zip(axes, cols):
        # value_counts() sorts by frequency, so head(top_n) is the top categories.
        order = df[col].value_counts().head(top_n).index
        sns.countplot(data=df, y=col, order=order, ax=ax)
        ax.set_title(f"Top {top_n}: {col}")

    # Hide the unused trailing cells of the grid.
    for ax in axes[n:]:
        ax.set_visible(False)

    fig.tight_layout()
    plt.show()

In [None]:
# Print the frequency table (NaN included) of every categorical column.
for col in categorical_features:
    counts = df[col].value_counts(dropna=False)
    header = f"\n--- {col} ---"
    print(header)
    print(counts)

In [None]:
#normality
for col in numerical_features:
    stat, p = normaltest(df[col].dropna())
    print(f"{col}: p-value={p:.4f}")

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
corr_matrix = df[numerical_features].corr()
plt.figure(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.show()

In [None]:
# Rank the numeric columns by their correlation with the target column.
# Replace 'target' with the name of the actual (numeric) target column.
# Uses `numerical_features`; the previous `numeric_features` was undefined.
df[numerical_features].corr()['target'].sort_values(ascending=False)

In [None]:
# Scatter-plot matrix of the numeric columns with KDE curves on the diagonal.
# Uses `numerical_features`; the previous `numeric_features` was undefined.
sns.pairplot(df[numerical_features], diag_kind="kde")
plt.show()

Data Preparation

In [None]:
# Data-preparation template: select columns, fix dtypes, rename.
# NOTE(review): "col1"/"col2"/"col3" and "old_name"/"new_name" look like
# placeholders to be replaced with real column names — confirm before running.

# NOTE(review): if this is one cell, this bare expression is not the last
# statement and therefore displays nothing.
df.columns

# Keep only the columns of interest; .copy() gives an independent frame so the
# assignments below do not write into a slice of the original.
df = df[["col1", "col2", "col3"]].copy()


# Parse col1 as datetimes.
df["col1"] = pd.to_datetime(df["col1"])
# Convert col2 to numeric; errors="coerce" turns unparseable values into NaN.
df["col2"] = pd.to_numeric(df["col2"], errors="coerce")


# NOTE(review): "old_name" is not among the columns selected above, so as
# written this rename changes nothing.
df = df.rename(columns={
    "old_name": "new_name"
})