# Exploratory Data Analysis (EDA)
This notebook explores the cleaned dataset to uncover key insights and relationships.

In [33]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline

# Lift pandas' display limits so wide frames are shown without truncated
# columns or wrapped lines.
for option_name in ('display.max_columns', 'display.width'):
    pd.set_option(option_name, None)

## Load the Cleaned Dataset

In [None]:
# Read the processed CSV snapshot into the working DataFrame.
file_path = '../data/processed/cleaned_data_20241226_225801.csv'
df = pd.read_csv(file_path)

# Report the frame's dimensions, then preview the first rows as the cell output.
row_count, column_count = df.shape
print(f"Dataset loaded with {row_count} rows and {column_count} columns.")
df.head()

In [None]:
# List the distinct values present in the 'year_category' column.
year_category_values = df['year_category'].unique()
print("Unique year-category combinations:", year_category_values)


## Missing Values Analysis

In [None]:
# Share of null entries per column, expressed as a percentage of the row count.
null_counts = df.isnull().sum()
missing_ratio = null_counts.div(len(df)).mul(100)

# Show the columns with the most missing data first.
print("Missing Values Ratio (%):")
print(missing_ratio.sort_values(ascending=False))

## Data Distribution

In [None]:
# Histogram (30 bins) with a KDE overlay of the raw 'data_value' column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['data_value'], kde=True, bins=30, color='blue', ax=ax)
ax.set_title('Distribution of Data Value')
ax.set_xlabel('Data Value')
ax.set_ylabel('Frequency')
plt.show()

# Visualize the log-transformed distribution for better spread.
# The transform is log1p applied to the values clipped at zero, so every
# non-positive entry still maps to 0. Missing values stay NaN instead of being
# turned into 0, which keeps them from being counted as zeros in the histogram.
log_values = np.log1p(df['data_value'].clip(lower=0))

plt.figure(figsize=(10, 6))
sns.histplot(log_values, kde=True, bins=30, color='green')
plt.title('Log-Transformed Distribution of Data Value')
plt.xlabel('Log(Data Value)')
plt.ylabel('Frequency')
plt.show()

## Correlation Matrix

In [None]:
# Pairwise correlations between the float64/int64 columns, drawn as an
# annotated heatmap.
df_numeric = df.select_dtypes(include=['float64', 'int64'])
corr_matrix = df_numeric.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

## Category Analysis

In [None]:
# Mean 'data_value' per category, largest first.
category_summary = (
    df.groupby('category')['data_value']
    .mean()
    .sort_values(ascending=False)
)
print(category_summary)

# Bar chart of the category means, with each bar's value written just above it.
ax = category_summary.plot(kind='bar', figsize=(12, 6), title='Average Data Value by Category', color='skyblue')
ax.set_ylabel('Average Data Value')
plt.xticks(rotation=45)
for bar_position, mean_value in enumerate(category_summary.tolist()):
    # The label sits 0.5 data units above the top of its bar.
    ax.text(bar_position, mean_value + 0.5, f"{mean_value:.2f}", ha='center', va='bottom')
plt.show()

In [None]:
import numpy as np  # Import numpy for numerical operations

def display_high_correlations(df: pd.DataFrame, threshold: float = 0.7) -> None:
    """
    Print every pair of columns whose absolute correlation exceeds a threshold.

    Only numeric and boolean columns take part; any other columns in ``df``
    are ignored, so the function can be called on a mixed-dtype frame. Each
    unordered pair is reported once and a column is never paired with itself.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    threshold (float): Pairs are reported when |correlation| > threshold.

    Returns:
    None. The result is written to standard output.
    """
    # Select the correlatable columns up front: DataFrame.corr() on
    # pandas >= 2.0 raises on string columns instead of skipping them.
    corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()

    # Fewer than two columns means there is no pair to report; bail out before
    # the reshaping below, which assumes a two-level (column, column) index.
    if corr_matrix.shape[0] < 2:
        print("No correlations above the threshold.")
        return

    # Strict upper triangle (k=1 excludes the diagonal): one entry per pair.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)

    # Flatten the masked matrix into one row per column pair.
    corr_pairs = corr_matrix.where(upper_mask).stack().reset_index()
    corr_pairs.columns = ["Column 1", "Column 2", "Correlation"]

    # NaN correlations compare False against the threshold and drop out here.
    high_corrs = corr_pairs[corr_pairs["Correlation"].abs() > threshold]

    if high_corrs.empty:
        print("No correlations above the threshold.")
        return

    print("Columns with high correlations:")
    for first, second, value in high_corrs.itertuples(index=False, name=None):
        print(f"{first} and {second}: Correlation = {value:.2f}")
        
# Restrict the correlation scan to the float64/int64 columns.
numeric_columns = df.select_dtypes(include=['float64', 'int64'])

# Report every pair whose |correlation| exceeds 0.2 (note: 0.2 is passed
# here, not the function's 0.7 default).
display_high_correlations(df=numeric_columns, threshold=0.2)



In [None]:
# Per-year means of the point estimate and both confidence limits.
value_columns = ['data_value', 'low_confidence_limit', 'high_confidence_limit']
year_summary = df.groupby('year')[value_columns].mean()
print(year_summary)


In [None]:
# Drop 'totalpop18plus' only when it duplicates 'totalpopulation' exactly.
# The membership check keeps this cell re-runnable: once the column has been
# dropped, indexing df['totalpop18plus'] again would raise KeyError.
# NOTE(review): Series.equals appears to require matching dtypes as well as
# matching values — confirm that is the intended comparison.
if 'totalpop18plus' not in df.columns:
    print("totalpop18plus column already absent.")
elif df['totalpopulation'].equals(df['totalpop18plus']):
    df = df.drop(columns=['totalpop18plus'])
    print("totalpop18plus column dropped.")
else:
    print("totalpop18plus column not dropped.")


In [None]:
pip install geopandas

In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt

# Build one point per row from the longitude/latitude columns and attach the
# points to the data as the geometry of a GeoDataFrame.
point_geometry = gpd.points_from_xy(df['longitude'], df['latitude'])
gdf = gpd.GeoDataFrame(df, geometry=point_geometry)

# Colour each point by its 'data_value'.
gdf.plot(column='data_value', legend=True, cmap='viridis')
plt.show()


In [None]:
# Mean 'data_value' per 'statedesc' group, highest first.
state_means = df.groupby('statedesc')['data_value'].mean()
state_summary = state_means.sort_values(ascending=False)
print(state_summary)


In [None]:
# Plot only the rows whose category is 'Prevention', coloured by 'data_value'.
is_prevention = gdf['category'] == 'Prevention'
prevention_ax = gdf.loc[is_prevention].plot(column='data_value', legend=True, cmap='coolwarm')
prevention_ax.set_title("Prevention Category: Data Value by Location")
plt.show()


In [None]:
# Mean 'data_value' for each (year, category) pair, pivoted so that years are
# rows and categories are columns.
yearly_means = df.groupby(['year', 'category'])['data_value'].mean()
yearly_category = yearly_means.unstack()

# Grouped bar chart: one cluster per year, one bar per category.
trend_ax = yearly_category.plot(kind='bar', figsize=(10, 6))
trend_ax.set_title("Yearly Trends by Category")
trend_ax.set_ylabel("Average Data Value")
plt.show()
