# Week 1 - Data Exploration


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Apply seaborn's default theme so every figure below shares the same styling.
sns.set_theme()

In [None]:
# Training features and their labels live in separate CSV files.
X_train = pd.read_csv("../data/train_values.csv")
y_train = pd.read_csv("../data/train_labels.csv")

# Test features and the submission format file.
X_test = pd.read_csv("../data/test_values.csv")
sample_submission = pd.read_csv("../data/submission_format.csv")

In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:
# Preview the first rows of the training labels.
y_train.head()

In [None]:
# Print column names, dtypes and non-null counts of the training features.
X_train.info()

In [None]:
# Attach the labels to the features by joining on the shared `building_id` key
# (DataFrame.merge performs an inner join by default), then summarize the result.
df = X_train.merge(y_train, on="building_id")  # noqa: PD901
df.info()

## Missing Values

Analyze missing values and find uninformative columns


In [None]:
# True if at least one cell anywhere in the merged frame is missing.
df.isna().any().any()

## Univariate Analysis

Analyze the distribution of individual features. Are there any imbalances or outliers?


In [None]:
# Relative frequency of each damage grade, drawn as a pie chart.
grade_share = df["damage_grade"].value_counts(normalize=True)
grade_share.plot.pie(autopct="%1.1f%%")
plt.title("Damage Grade Distribution")
plt.show()

In [None]:
# Split the feature columns into groups by dtype and by naming convention.
cat_cols = X_train.select_dtypes(include="object").columns
binary_cols = [name for name in X_train.columns if name.startswith("has")]

# int64 columns that are not "has*" flags are treated as numeric features.
# NOTE(review): integer identifier columns are not filtered out here - confirm
# that this is intended.
numeric_cols = [
    name
    for name in X_train.select_dtypes(include="int64").columns
    if name not in binary_cols
]

# Number of distinct values in each categorical column.
df[cat_cols].nunique()

In [None]:
# Pie chart of the value shares of every binary ("has*") column, three per row.
n = len(binary_cols)
n_plot_cols = 3
# Ceiling division: `n // 3 + 1` would allocate a spare, empty row whenever
# n is an exact multiple of 3.
n_plot_rows = -(-n // n_plot_cols)
fig = plt.figure(figsize=(20, 40))
for i, col in enumerate(binary_cols):
    ax = plt.subplot(n_plot_rows, n_plot_cols, i + 1)
    # Draw on this subplot explicitly rather than relying on the current axes.
    df[col].value_counts(normalize=True).plot.pie(autopct="%1.1f%%", ax=ax)
    ax.set_title(col)

In [None]:
# Correlation of every numeric and binary feature with the target.
# The target is read from `df` itself: `corrwith` aligns on the index, so a
# Series taken from `y_train` only lines up with the merged frame if the merge
# happened to keep y_train's row order and index.
corr_target = df[numeric_cols + binary_cols].corrwith(df["damage_grade"])
corr_target.sort_values().plot.barh()

In [None]:
# Count plot of every categorical column, three per row.

n = len(cat_cols)
n_plot_cols = 3
# Ceiling division: `n // 3 + 1` would allocate a spare, empty row whenever
# n is an exact multiple of 3.
n_plot_rows = -(-n // n_plot_cols)
fig = plt.figure(figsize=(20, 15))
for i, col in enumerate(cat_cols):
    ax = plt.subplot(n_plot_rows, n_plot_cols, i + 1)
    # Pass the subplot explicitly instead of rebinding `ax` from the return value.
    sns.countplot(data=df, x=col, ax=ax)

## Multivariate Analysis

Analyze the relationships between features. Are there any redundancies?

Analyze the relationship between each feature and the target variable. Are any features highly correlated with the target variable?


In [None]:
# Pairwise correlation between the binary ("has*") columns, shown as a heatmap.
corr = df.loc[:, binary_cols].corr()
sns.heatmap(data=corr, cmap="coolwarm")

In [None]:
# Histogram of every numeric column, all drawn on one 20x20-inch figure.
df[numeric_cols].hist(figsize=(20, 20))

In [None]:
# One box plot per numeric column, four per row. The `-1` lets pandas derive the
# number of rows from the column count, so the grid does not fail when there
# are more than 16 numeric columns (a fixed 4x4 layout only has 16 slots).
df[numeric_cols].plot(kind="box", subplots=True, layout=(-1, 4), figsize=(20, 20))