# In [None]:
# Data Exploration Notebook
# Filename: data_exploration.ipynb

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Plot defaults for the notebook: apply seaborn's "whitegrid" style, then set
# matplotlib's default figure size to 10x6. The order of the two calls is kept
# as in the original.
sns.set(style="whitegrid")
plt.rc('figure', figsize=(10, 6))

# 1. Load the Dataset
# The path is a raw string so the Windows backslashes are taken literally.
# The CSV is parsed with pandas' default settings into the `data` frame that
# every later section of this notebook works on.
file_path = r"C:\Users\June Nguyen\AI_asm2\data\NewYork.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

# 2. Display the first few rows of the dataset
# head() with no argument returns the leading rows (pandas default count);
# display() renders them as a rich table in the notebook.
print("First few rows of the dataset:")
preview_rows = data.head()
display(preview_rows)

# 3. Check for missing values
# Build a boolean mask of missing cells and sum it column-wise, giving the
# number of missing entries per column.
print("\nMissing values in each column:")
missing_per_column = data.isna().sum()
display(missing_per_column)

# 4. Data Types and Summary
print("\nData types and summary statistics:")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would add a stray "None" to the output.
data.info()
# describe() returns a DataFrame of summary statistics, which display() renders.
display(data.describe())

# 5. Correlation Matrix
print("\nCorrelation matrix of numerical features:")
# Restrict the frame to numeric columns before correlating. Calling corr() on
# the whole frame raises on pandas >= 2.0 as soon as a non-numeric column is
# present (this notebook later looks for a 'preciptype' column, so text
# columns are expected). select_dtypes is the same selection used for the
# distribution plots below and works across pandas versions.
corr_matrix = data.select_dtypes(include=[np.number]).corr()
# Annotated heatmap with a diverging palette for the pairwise coefficients.
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# 6. Distribution of Numerical Features
print("\nDistribution plots of numerical features:")
# Names of all numeric columns; also reused by the pair plot section below.
num_features = data.select_dtypes(include=[np.number]).columns.tolist()
if num_features:
    # Size the subplot grid from the number of columns instead of hard-coding
    # 4x5: hist() rejects a layout with fewer cells than columns to plot.
    # Keeping at least 4 rows preserves the original 4x5 grid whenever there
    # are 20 or fewer numeric columns.
    hist_cols = 5
    hist_rows = max(4, -(-len(num_features) // hist_cols))  # ceiling division
    data[num_features].hist(bins=15, figsize=(15, 10), layout=(hist_rows, hist_cols))
    plt.suptitle('Distribution of Numerical Features', fontsize=16)
    plt.show()
else:
    # hist() raises when there is nothing numeric to plot, so report it instead.
    print("\nNo numerical features found to plot.")

# 7. Relationship Between Features
# Draw a scatter-plot matrix over the numeric columns, with a kernel density
# estimate on the diagonal. The title is placed slightly above the top edge
# (y=1.02) of the figure.
print("\nScatter plots for selected feature relationships:")
numeric_frame = data[num_features]
sns.pairplot(numeric_frame, diag_kind='kde')
plt.suptitle(
    'Pairwise Relationships between Numerical Features',
    y=1.02,
    fontsize=16,
)
plt.show()

# 8. Categorical Feature Analysis
# Plot how often each 'preciptype' value occurs; if the dataset has no such
# column, report that and skip the plot.
if 'preciptype' not in data.columns:
    print("\nNo categorical 'preciptype' feature found.")
else:
    print("\nDistribution of the 'preciptype' categorical feature:")
    sns.countplot(data=data, x='preciptype')
    plt.title('Distribution of Precipitation Types')
    plt.show()

# 9. Summary of Findings
# Only the heading is printed here; the key observations themselves are meant
# to be written up in a markdown cell or narrative description.
summary_heading = "\nSummary of initial data exploration:"
print(summary_heading)

