## Exploratory Data Analysis Template

by Harman Singh

### 0. Libraries and data

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns


# Tip: sns.get_dataset_names() lists every dataset name seaborn can load.

# Seaborn datasets
# Each dataset is bound to a module-level variable carrying the dataset's own
# name (anagrams, anscombe, ..., titanic), loaded in the order listed here.
globals().update(
    {
        dataset_name: sns.load_dataset(dataset_name)
        for dataset_name in (
            "anagrams",
            "anscombe",
            "attention",
            "brain_networks",
            "car_crashes",
            "diamonds",
            "dots",
            "dowjones",
            "exercise",
            "flights",
            "fmri",
            "geyser",
            "glue",
            "healthexp",
            "iris",
            "mpg",
            "penguins",
            "planets",
            "seaice",
            "taxis",
            "tips",
            "titanic",
        )
    }
)

# CSV datasets
# Read from a path relative to the current working directory.
melb_data = pd.read_csv("./datasets/melb_data.csv")

In [None]:
# Show the names of the example datasets that sns.load_dataset() accepts.
sns.get_dataset_names()

`SET DATASET`

In [None]:
# The DataFrame that every analysis cell below reads via `dataset`;
# rebind it to any of the datasets loaded above to analyse that one instead.
dataset = penguins

### 01. General dataset information

In [None]:
# Column labels of the selected dataset.
dataset.columns

In [None]:
# Concise summary: per-column dtype and non-null count, plus memory usage.
dataset.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max);
# by default pandas reports these for the numeric columns only.
dataset.describe()

### 02. Basic Analysis

The stuff I knew by heart.

#### **Missing Values**

In [None]:
# Bar chart of the number of missing values in each feature of `dataset`.
# Set the flag to False to also list the features that have nothing missing.
ONLY_SHOW_FEATURES_WITH_MISSING_VALUES = True

# Null count per column, then one row per feature sorted by that count
# (largest first) so the worst-affected features are at the top of the chart.
missing_values = dataset.isnull().sum()
missing_df = pd.DataFrame(
    {"feature": missing_values.index, "missing": missing_values.values}
).sort_values(by="missing", ascending=False)

if ONLY_SHOW_FEATURES_WITH_MISSING_VALUES:
    missing_df = missing_df[missing_df["missing"] > 0]

if missing_df.empty:
    # Nothing left to draw (e.g. a complete dataset with the filter enabled);
    # say so instead of rendering an empty chart.
    print("No missing values found.")
else:
    plt.figure(figsize=(10, 6))

    # `data=` is passed by keyword: older seaborn releases treat the first
    # positional argument as `x`, which would clash with the explicit x=.
    sns.barplot(data=missing_df, x="missing", y="feature")
    plt.title("Missing values for each feature")
    plt.xlabel("number of missing values")
    plt.tight_layout()
    plt.show()

#### **Correlation**

In [None]:
# Pairwise correlation of the numeric features, drawn as an annotated heatmap.
# `numerical_columns` is reused by the outlier, distribution and pair-plot cells.
numerical_columns = dataset.select_dtypes(include="number")
correlation_matrix = numerical_columns.corr()

plt.figure(figsize=(10, 10))
sns.heatmap(
    data=correlation_matrix,
    annot=True,       # print the coefficient inside every cell ...
    fmt=".2f",        # ... rounded to two decimals
    square=True,
    cmap="coolwarm",
)
plt.gcf().tight_layout()
plt.show()

#### **Outliers**

In [None]:
# One box plot per numeric feature, laid out on a grid, to spot outliers.
BOXPLOTS_PER_ROW = 4

num_features = len(numerical_columns.columns)
# Ceiling division: enough rows to hold every feature.
num_rows = (num_features + BOXPLOTS_PER_ROW - 1) // BOXPLOTS_PER_ROW

if num_features == 0:
    # plt.subplots() rejects a grid with zero rows, so skip plotting entirely.
    print("No numerical columns to plot.")
else:
    fig, axes = plt.subplots(
        nrows=num_rows,
        ncols=BOXPLOTS_PER_ROW,
        figsize=(15, num_rows * 5),
        squeeze=False,  # always a 2-D array of axes, whatever the grid shape
    )
    axes = axes.flatten()

    for ax, column in zip(axes, numerical_columns.columns):
        numerical_columns.boxplot(column=[column], ax=ax)
        ax.set_title(f"{column}")

    # Remove the leftover panels in the last row so no empty frames are drawn.
    for ax in axes[num_features:]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()

### 03. Advanced Analysis

The stuff I had to ask ChatGPT about.

#### **Distribution Plots**

In [None]:
# Histogram with a KDE overlay for every numeric feature, two plots per row.
num_plots = len(numerical_columns.columns)
# Ceiling division: an odd number of features still gets a row for the last one.
num_rows = (num_plots + 1) // 2

if num_plots == 0:
    # plt.subplots() rejects a grid with zero rows, so skip plotting entirely.
    print("No numerical columns to plot.")
else:
    fig, axes = plt.subplots(
        nrows=num_rows,
        ncols=2,
        figsize=(12, 4 * num_rows),
        squeeze=False,  # keep a 2-D array of axes even when there is one row
    )
    axes = axes.flatten()

    for ax, col in zip(axes, numerical_columns.columns):
        sns.histplot(numerical_columns[col], kde=True, ax=ax)
        ax.set_title(f"Distribution of {col}")

    # Remove the unused panel left over when the number of features is odd.
    for ax in axes[num_plots:]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()

#### **Categorical Data Exploration**

In [None]:
# Categorical features of the selected dataset. Besides plain `object`
# (string) columns this also picks up pandas `category` columns, so datasets
# that store their categoricals with that dtype are covered too.
categorical_columns = dataset.select_dtypes(include=["object", "category"])

In [None]:
# Count plot for every categorical feature, two plots per row, with the
# count written above each bar.
num_cols = len(categorical_columns.columns)
# Ceiling division: an odd number of features still gets a row for the last one.
num_rows = (num_cols + 1) // 2

if num_cols == 0:
    # plt.subplots() rejects a grid with zero rows, so skip plotting entirely.
    print("No categorical columns to plot.")
else:
    fig, axes = plt.subplots(
        nrows=num_rows,
        ncols=2,
        figsize=(10, 4 * num_rows),
        squeeze=False,  # keep a 2-D array of axes even when there is one row
    )
    axes = axes.flatten()

    for ax, col in zip(axes, categorical_columns.columns):
        sns.countplot(x=col, data=categorical_columns, ax=ax)
        ax.set_title(f'Count plot of {col}')

        # Label every bar with its count; heights are floats, so format them
        # without decimals.
        for p in ax.patches:
            ax.annotate(
                f'{p.get_height():.0f}',
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center',
                xytext=(0, 5),  # offset of the text from the bar
                textcoords='offset points',
            )

    # Remove the unused panel left over when the number of features is odd.
    for ax in axes[num_cols:]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()

#### **Pair Plots**

In [None]:
# Scatter-plot matrix of every pair of numeric features.
# sns.pairplot() builds its own figure, so no plt.figure() call is made here
# (one would only leave a stray blank figure in the output).
pair_grid = sns.pairplot(numerical_columns)

# Put the title on the grid's own figure; y > 1 lifts it above the top row.
pair_grid.figure.suptitle("Pair Plot of Numerical Columns", y=1.02)
plt.show()

# STOP (for now)

- Feature Relationships
    - Scatter Plots (already done)
    - Line Plots (there may not be a time series in the dataset, so it won't work on all datasets)
    - Box Plots (already done)
    - Heatmaps (already done)


- Target Variable Exploration
- Feature Engineering Opportunities
- Data Transformation
- Time Series Analysis
- Dimensionality Reduction
- Advanced Statistical Analysis
- Machine Learning Model Exploration
- Interactive Visualizations
