# Exploratory Data Analysis

This notebook performs exploratory data analysis (EDA) on the dataset extracted from JSON files. The goal is to visualize the data and gain insight into it before proceeding with further processing and model training.

In [None]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw sample. JSON text is UTF-8, so the encoding is pinned here
# instead of being left to the platform default (e.g. cp1252 on Windows),
# which can mis-decode or reject non-ASCII characters.
with open('../data/raw/sample.json', encoding='utf-8') as f:
    data = json.load(f)

# Flatten the parsed JSON into a table; nested keys become dotted column names
df = pd.json_normalize(data)

# Preview the first rows (kept as the last expression so the notebook renders it)
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
# NOTE: on a frame with mixed dtypes, describe() reports only the numeric
# columns by default; pass include='all' to also summarize the others.
df.describe()

In [None]:
# Draw one histogram with a KDE overlay for every float64/int64 column,
# each on its own 10x6 figure.
numerical_features = df.select_dtypes(include=['float64', 'int64']).columns
for column_name in numerical_features:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(df[column_name], bins=30, kde=True, ax=ax)
    ax.set(
        title=f'Distribution of {column_name}',
        xlabel=column_name,
        ylabel='Frequency',
    )
    plt.show()

In [None]:
# Correlation heatmap
# Correlate numeric columns only: since pandas 2.0, DataFrame.corr() no longer
# drops non-numeric columns silently and raises when the frame contains
# string columns, which JSON-derived data usually does.
corr_matrix = df.corr(numeric_only=True)

plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True)
plt.title('Correlation Heatmap')
plt.show()