Step 1: Import Required Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Apply seaborn's 'whitegrid' style once, up front, so that every plot drawn
# by the cells below shares the same look.
sns.set(style='whitegrid')


Step 2: Load and Merge Datasets

In [None]:
# Read the four source tables; the paths are relative to the working directory.
edu = pd.read_csv('ed_lvl_ar.csv')
pop = pd.read_csv('pop_lvl_ar.csv')
poverty = pd.read_csv('poverty_lvl_ar.csv')
unemp = pd.read_csv('unemploy_lvl_ar.csv')

# Combine the tables one join at a time on the shared 'County' key (assumed to
# exist in every file). merge defaults to an inner join, so a county is kept
# only if it appears in all four tables.
df = pd.merge(edu, pop, on='County')
df = pd.merge(df, poverty, on='County')
df = pd.merge(df, unemp, on='County')



Step 3: Inspect Dataset

In [None]:
# Quick look at the merged frame: schema, summary statistics, first rows.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly -- wrapping it in print() would add a stray "None" line.
df.info()
print(df.describe())
print(df.head())


Step 3.1: Clean the Data (Check for Nulls and Duplicates)

In [None]:
# Data-quality report. Nothing in df is modified here; the cell only measures
# and prints. All metrics are gathered first, then printed in a fixed order.
missing_values = df.isnull().sum()          # null count per column
duplicates = df.duplicated().sum()          # rows that repeat an earlier row
unique_counties = df['County'].nunique()    # distinct values in 'County'

print('Null Values in Each Column:')
print(missing_values)
print(f'Duplicates: {duplicates}')
print(f'Unique Counties: {unique_counties}')
# The same per-column null counts are shown again under a second heading.
print('Missing Values:\n', missing_values)

Step 4: Visualize Distributions (Histograms & KDE)

In [None]:
# Columns whose distributions are examined; the boxplot cell further down
# iterates over this same list.
variables = [
    'PovertyRate',
    'UnemploymentRate',
    'HighSchoolGradRate',
    'BachelorsDegreeRate',
]

# One figure per column: a 20-bin histogram with a KDE curve overlaid.
for var in variables:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(data=df, x=var, kde=True, bins=20, ax=ax)
    ax.set_title(f'Distribution of {var}')
    ax.set_xlabel(var)
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()

Step 5: Correlation Matrix and Heatmap

In [None]:
# Restrict the frame to the indicators of interest, then compute their
# pairwise correlations with DataFrame.corr's default settings.
corr_vars = df.loc[:, [
    'PovertyRate',
    'UnemploymentRate',
    'HighSchoolGradRate',
    'BachelorsDegreeRate',
    'Population',
]]
corr_matrix = corr_vars.corr()

# Annotated heatmap of the matrix; each cell shows its value to two decimals.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix of Key Indicators')
fig.tight_layout()
plt.show()

Step 6: Scatter Plots for Key Relationships

In [None]:
# Each entry describes one scatter plot:
# (x column, y column, figure title, x-axis label, y-axis label).
scatter_specs = [
    (
        'BachelorsDegreeRate',
        'PovertyRate',
        'Bachelor’s Degree Rate vs Poverty Rate',
        'Bachelor’s Degree (%)',
        'Poverty Rate (%)',
    ),
    (
        'HighSchoolGradRate',
        'UnemploymentRate',
        'High School Grad Rate vs Unemployment Rate',
        'High School Grad (%)',
        'Unemployment Rate (%)',
    ),
]

# Draw the plots in the order listed, one figure per relationship.
for x_col, y_col, plot_title, x_label, y_label in scatter_specs:
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=x_col, y=y_col, data=df)
    plt.title(plot_title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.tight_layout()
    plt.show()


Step 7: Identify Outlier Counties with Boxplots

In [None]:
# One horizontal boxplot per indicator in `variables` (defined in the
# distribution cell above); points drawn beyond the whiskers are the
# candidate outliers to look at.
for var in variables:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(data=df, x=var, ax=ax)
    ax.set_title(f'Boxplot of {var}')
    fig.tight_layout()
    plt.show()


In [None]:
# Store a log-scaled copy of Population as a new column. log1p computes
# log(1 + x), so a population of 0 maps to 0 instead of -inf.
log_population = np.log1p(df['Population'])
df['LogPopulation'] = log_population

# Histogram (with KDE overlay) of the transformed values.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(data=df, x='LogPopulation', kde=True, ax=ax)
ax.set_title('Log-Transformed Population Distribution')
ax.set_xlabel('Log(Population)')
fig.tight_layout()
plt.show()