## 0. Configs

In [None]:
import warnings
warnings.filterwarnings("ignore")

## 1. Import libraries and data

In [None]:
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Location of the training CSV, relative to the notebook's working directory.
TRAIN_CSV_PATH = "./data/train_V2.csv"

# Read the full training set into memory and preview the first rows.
df = pd.read_csv(TRAIN_CSV_PATH)
df.head()

## 2. Survey the data

### **General dataset information**

In [None]:
# Display the column labels of the loaded DataFrame.
df.columns

In [None]:
# Print pandas' concise summary of the DataFrame (see `DataFrame.info`).
df.info()

In [None]:
# Display summary statistics for the DataFrame (see `DataFrame.describe`).
df.describe()

### **Missing values**

Create DataFrame of missing values per feature

In [None]:
# Number of NaN entries in each column, indexed by column name.
missing_values = df.isna().sum()

# Turn the counts into a two-column table ("feature", "missing") with a fresh
# integer index, ordered from the most to the fewest missing values.
missing = (
    missing_values
    .rename_axis("feature")
    .reset_index(name="missing")
    .sort_values(by="missing", ascending=False)
)

Only retain features that have at least one missing value.

In [None]:
# Drop the rows whose missing-value count is zero.
has_missing = missing["missing"].gt(0)
missing = missing.loc[has_missing]

In [None]:
# Preview the first rows of the missing-value table.
missing.head()

In [None]:
# Horizontal bar chart: one bar per feature listed in `missing`.
# The DataFrame is passed via `data=` because the positional form is only
# accepted by newer seaborn releases; the keyword form works across versions.
# Drawing on an explicit Axes keeps the title/labels tied to this figure.
fig, ax = plt.subplots(figsize=(5, 12))
sns.barplot(data=missing, x="missing", y="feature", ax=ax)
ax.set_title("Missing values for each feature")
ax.set_xlabel("Number of missing values")
ax.set_ylabel("Feature")
plt.show()

### **Correlation**

The correlation heatmap was not really useful here, so the code below is commented out.

In [None]:
# numerical_columns = df.select_dtypes(include="number")
# correlation_matrix = numerical_columns.corr()

# plt.figure(figsize=(25, 25))
# sns.heatmap(correlation_matrix, cmap="coolwarm", annot=True, fmt=".2f")
# plt.tight_layout()
# plt.show()

### **Distribution**

In [None]:
# Grid layout: two plots per row, with enough rows to hold every column
# (ceiling division of the column count by the grid width).
num_cols = 2
num_rows = (len(df.columns) + num_cols - 1) // num_cols

fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(16, 80))

# Work with a flat sequence of Axes so columns can be paired with them in order.
axes = axes.flatten()

# One histogram (with a KDE overlay) per DataFrame column.
for ax, column in zip(axes, df.columns):
    sns.histplot(df[column], kde=True, color='blue', edgecolor='black', ax=ax)
    ax.set_title(f'Distribution of {column}')

# Remove any grid cell left over when the column count does not fill the grid.
for unused_ax in axes[len(df.columns):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

### **Categorical Data Exploration**

The only categorical column is "gender", so I won't plot it separately; its value counts are listed below instead.

"gender" also appears in the full distribution plot above, but that subplot is too small to read.

In [None]:
# Keep only the object-dtype columns, then count how often each
# distinct value occurs in the "gender" column.
categorical_columns = df.select_dtypes(include=["object"])
categorical_columns.loc[:, "gender"].value_counts()