In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.regression.mixed_linear_model import MixedLM
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Set plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 10


## LOADING AND PREPARING THE DATA

In [4]:
# 1. DATA LOADING AND PREPARATION

print("="*70)
print("OHIO LOW BIRTH WEIGHT ANALYSIS")
print("="*70)

# Load datasets
print("\n1. Loading data...")
birth_path = pd.read_csv(r"C:\Users\kehin\Downloads\OhioBirths.csv")
demographic = pd.read_csv(r"C:\Users\kehin\Downloads\OhioDemographics.csv")

OHIO LOW BIRTH WEIGHT ANALYSIS

1. Loading data...


In [5]:
# Display basic information
print(f"\nBirths dataset shape: {birth_path.shape}")
print(f"Demographic dataset shape: {demographic.shape}")
print("\nBirths columns:", birth_path.columns.tolist())
print("Demographic columns:", demographic.columns.tolist())

# Check for missing values
print("\nMissing values in births:")
print(birth_path.isnull().sum())
print("\nMissing values in demographic:")
print(demographic.isnull().sum())


Births dataset shape: (792526, 5)
Demographic dataset shape: (88, 5)

Births columns: ['county_of_birth', 'age_of_mother', 'race_of_mother', 'low_birth_weight', 'birth_id']
Demographic columns: ['Unnamed: 0', 'County', 'Population', 'Below.HS.Education.Percentage', 'Unemployment.Rate']

Missing values in births:
county_of_birth     0
age_of_mother       0
race_of_mother      0
low_birth_weight    0
birth_id            0
dtype: int64

Missing values in demographic:
Unnamed: 0                       0
County                           0
Population                       0
Below.HS.Education.Percentage    0
Unemployment.Rate                0
dtype: int64


## CLEAN DEMOGRAPHIC DATASET

In [7]:
import pandas as pd
import numpy as np

# Clean Demographic dataset
demographic = demographic.copy()

# Drop unnecessary index column
if 'Unnamed: 0' in demographic.columns:
    demographic = demographic.drop(columns=['Unnamed: 0'])

# Rename County column to match births dataset
demographic = demographic.rename(columns={'County': 'county_of_birth'})

# Ensure numeric columns are floats
numeric_cols = ['Population', 'Below.HS.Education.Percentage', 'Unemployment.Rate']
demographic[numeric_cols] = demographic[numeric_cols].apply(pd.to_numeric, errors='coerce')

## CLEAN BIRTH DATASET

In [9]:
# Clean Births dataset
birth_path = birth_path.copy()

# Convert age_of_mother to numeric, unknowns -> NaN
birth_path['age_of_mother'] = pd.to_numeric(birth_path['age_of_mother'], errors='coerce')

# Convert race_of_mother to categorical
birth_path['race_of_mother'] = birth_path['race_of_mother'].astype('category')

# Ensure low_birth_weight is numeric 0/1
birth_path['low_birth_weight'] = pd.to_numeric(birth_path['low_birth_weight'], errors='coerce')


## MERGE DATASETS

In [11]:
 # Merge datasets
merged_data = pd.merge(birth_path, demographic, on='county_of_birth', how='left')


In [12]:
# Check if any county-level data are missing after merge
missing_county_data = merged_data[numeric_cols].isna().sum()
print("Missing values in county-level columns after merge:\n", missing_county_data)


Missing values in county-level columns after merge:
 Population                       792526
Below.HS.Education.Percentage     13391
Unemployment.Rate                 13391
dtype: int64


In [13]:
# Final inspection
print("Merged dataset shape:", merged_data.shape)
print("Columns:", merged_data.columns)
print("Missing values:\n", merged_data.isna().sum())

Merged dataset shape: (792526, 8)
Columns: Index(['county_of_birth', 'age_of_mother', 'race_of_mother',
       'low_birth_weight', 'birth_id', 'Population',
       'Below.HS.Education.Percentage', 'Unemployment.Rate'],
      dtype='object')
Missing values:
 county_of_birth                       0
age_of_mother                        24
race_of_mother                        0
low_birth_weight                      0
birth_id                              0
Population                       792526
Below.HS.Education.Percentage     13391
Unemployment.Rate                 13391
dtype: int64


In [14]:
# Strip leading/trailing whitespace and make all lowercase
birth_path['county_of_birth'] = birth_path['county_of_birth'].str.strip().str.lower()
demographic['county_of_birth'] = demographic['county_of_birth'].str.strip().str.lower()

# Merge again
merged_data = pd.merge(birth_path, demographic, on='county_of_birth', how='left')

# Check missing values again
print(merged_data[numeric_cols].isna().sum())


Population                       792526
Below.HS.Education.Percentage     13391
Unemployment.Rate                 13391
dtype: int64


In [15]:
# Counties in births not in demographics
birth_counties = set(birth_path['county_of_birth'].unique())
demo_counties = set(demographic['county_of_birth'].unique())

mismatched = birth_counties - demo_counties
print("Counties in births not in demographics:", mismatched)


Counties in births not in demographics: {'non-ohio county'}


In [16]:
# Example: replace mismatches
demographic['county_of_birth'] = demographic['county_of_birth'].replace({
    'st. clair': 'saint clair',
    'mc kean': 'mckean',
    # add other fixes here
})
# Example: median imputation
merged_data['age_of_mother'] = merged_data['age_of_mother'].fillna(merged_data['age_of_mother'].median())
merged_data['Below.HS.Education.Percentage'] = merged_data['Below.HS.Education.Percentage'].fillna(merged_data['Below.HS.Education.Percentage'].median())
merged_data['Unemployment.Rate'] = merged_data['Unemployment.Rate'].fillna(merged_data['Unemployment.Rate'].median())
merged_data = pd.merge(birth_path, demographic, on='county_of_birth', how='left')
print(merged_data[['county_of_birth', 'Population', 'Below.HS.Education.Percentage', 'Unemployment.Rate']].isna().sum())

county_of_birth                       0
Population                       792526
Below.HS.Education.Percentage     13391
Unemployment.Rate                 13391
dtype: int64


In [17]:
import pandas as pd

# Step 1: Standardize county names
birth_path['county_of_birth'] = birth_path['county_of_birth'].str.strip().str.lower()
demographic['county_of_birth'] = demographic['county_of_birth'].str.strip().str.lower()

# Step 2: Merge datasets
merged_data = pd.merge(birth_path, demographic, on='county_of_birth', how='left')

# Step 3: Check for unmatched counties
birth_counties = set(birth_path['county_of_birth'].unique())
demo_counties = set(demographic['county_of_birth'].unique())
unmatched_counties = birth_counties - demo_counties
print("Counties in births not in demographics:", unmatched_counties)

# Step 4: Check missing values for county-level predictors 
missing_counts = merged_data[['Population', 'Below.HS.Education.Percentage', 'Unemployment.Rate']].isna().sum()
print("\nMissing values after merge:\n", missing_counts)

# Step 5: Impute remaining missing values (median) 
merged_data['Below.HS.Education.Percentage'] = merged_data['Below.HS.Education.Percentage'].fillna(
    merged_data['Below.HS.Education.Percentage'].median()
)
merged_data['Unemployment.Rate'] = merged_data['Unemployment.Rate'].fillna(
    merged_data['Unemployment.Rate'].median()
)

# Step 6: Verify the final dataset 
print("\nMerged dataset shape:", merged_data.shape)
print("Columns:", merged_data.columns)
print("Missing values:\n", merged_data.isna().sum())


Counties in births not in demographics: {'non-ohio county'}

Missing values after merge:
 Population                       792526
Below.HS.Education.Percentage     13391
Unemployment.Rate                 13391
dtype: int64

Merged dataset shape: (792526, 8)
Columns: Index(['county_of_birth', 'age_of_mother', 'race_of_mother',
       'low_birth_weight', 'birth_id', 'Population',
       'Below.HS.Education.Percentage', 'Unemployment.Rate'],
      dtype='object')
Missing values:
 county_of_birth                       0
age_of_mother                        24
race_of_mother                        0
low_birth_weight                      0
birth_id                              0
Population                       792526
Below.HS.Education.Percentage         0
Unemployment.Rate                     0
dtype: int64


In [18]:
# Drop or impute age_of_mother missing values
merged_data = merged_data.dropna(subset=['age_of_mother'])

# Optionally drop Population if itâ€™s not usable
merged_data = merged_data.drop(columns=['Population'])

# Convert county_of_birth to categorical for modeling
merged_data['county_of_birth'] = merged_data['county_of_birth'].astype('category')

# Convert race_of_mother to categorical if not already
merged_data['race_of_mother'] = merged_data['race_of_mother'].astype('category')

# Check final cleaned dataset
print("Final dataset shape:", merged_data.shape)
print(merged_data.head())


Final dataset shape: (792502, 7)
  county_of_birth  age_of_mother race_of_mother  low_birth_weight  birth_id  \
0    adams county           31.0          White                 1         1   
1    adams county           31.0          White                 1         2   
2    adams county           31.0          White                 1         3   
3    adams county           31.0          White                 1         4   
4    adams county           31.0          White                 1         5   

   Below.HS.Education.Percentage  Unemployment.Rate  
0                           14.8                6.5  
1                           14.8                6.5  
2                           14.8                6.5  
3                           14.8                6.5  
4                           14.8                6.5  


## DESCRIPTIVE STATISTICS

In [20]:
# DESCRIPTIVE STATISTICS

print("\n" + "="*70)
print("DESCRIPTIVE STATISTICS")
print("="*70)

# Overall LBW rate
overall_lbw = ohio_data['low_birth_weight'].mean() * 100
print(f"\nOverall Low Birth Weight Rate: {overall_lbw:.2f}%")
print(f"Total births analyzed: {len(ohio_data):,}")

# LBW by age group
print("\n--- LBW Rate by Maternal Age Group ---")
age_summary = ohio_data.groupby('age_group').agg({
    'low_birth_weight': ['count', 'mean']
}).round(4)
age_summary.columns = ['N_births', 'LBW_Rate']
age_summary['LBW_Rate_Pct'] = age_summary['LBW_Rate'] * 100
print(age_summary)

# LBW by race
print("\n--- LBW Rate by Race ---")
race_summary = ohio_data.groupby('race_of_mother').agg({
    'low_birth_weight': ['count', 'mean']
}).round(4)
race_summary.columns = ['N_births', 'LBW_Rate']
race_summary['LBW_Rate_Pct'] = race_summary['LBW_Rate'] * 100
race_summary['Percentage'] = (race_summary['N_births'] / len(ohio_data) * 100).round(1)
race_summary = race_summary.sort_values('LBW_Rate_Pct', ascending=False)
print(race_summary)

# County-level summary
print("\n--- County-Level Summary ---")
county_summary = ohio_data.groupby('county_of_birth').agg({
    'low_birth_weight': ['count', 'mean'],
    'Below.HS.Education.Percentage': 'first',
    'Unemployment.Rate': 'first',
    'Population': 'first'
}).round(4)

county_summary.columns = ['N_births', 'LBW_Rate', 'Below_HS_Pct', 
                          'Unemployment_Rate', 'Population']
county_summary['LBW_Rate_Pct'] = county_summary['LBW_Rate'] * 100
county_summary = county_summary.sort_values('LBW_Rate_Pct', ascending=False)

print(f"\nNumber of counties: {len(county_summary)}")
print(f"LBW rate range: {county_summary['LBW_Rate_Pct'].min():.2f}% to {county_summary['LBW_Rate_Pct'].max():.2f}%")
print("\nTop 5 counties with highest LBW rates:")
print(county_summary.head()[['N_births', 'LBW_Rate_Pct', 'Below_HS_Pct', 'Unemployment_Rate']])
print("\nTop 5 counties with lowest LBW rates:")
print(county_summary.tail()[['N_births', 'LBW_Rate_Pct', 'Below_HS_Pct', 'Unemployment_Rate']])



DESCRIPTIVE STATISTICS


NameError: name 'ohio_data' is not defined

In [None]:
import matplotlib.pyplot as plt

county_summary = ohio_data.groupby('county_of_birth')['predicted_lbw'].mean().sort_values(ascending=False).reset_index()

plt.figure(figsize=(15,6))
plt.bar(county_summary['county_of_birth'], county_summary['predicted_lbw'], color='salmon')
plt.xticks(rotation=90)
plt.ylabel('Predicted Low Birth Weight')
plt.title('Predicted Low Birth Weight by County in Ohio')
plt.show()


## DESCRIPTIVE PLOTS

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.countplot(data=ohio_data, x='race_of_mother', hue='low_birth_weight')
plt.title("Low Birth Weight by Race of Mother")
plt.xlabel("Race")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.legend(title="Low Birth Weight")
plt.show()


## AGE DISTRIBUTION

In [None]:
sns.histplot(data=ohio_data, x='age_of_mother', bins=20, kde=True, hue='low_birth_weight')
plt.title("Mother's Age Distribution by Low Birth Weight")
plt.xlabel("Age")
plt.ylabel("Frequency")
plt.show()


## EDUCATION/UNMPLOYMENT VS LBW (SCATTER + REGRESSION)

In [None]:
sns.lmplot(data=ohio_data, x='Below.HS.Education.Percentage', y='low_birth_weight', hue='race_of_mother', scatter_kws={'alpha':0.3})
plt.title("Low Birth Weight vs County Education Level")
plt.show()

sns.lmplot(data=ohio_data, x='Unemployment.Rate', y='low_birth_weight', hue='race_of_mother', scatter_kws={'alpha':0.3})
plt.title("Low Birth Weight vs County Unemployment Rate")
plt.show()

## COUNTY LEVEL SUMMARIES

In [None]:
county_summary = ohio_data.groupby('county_of_birth')['low_birth_weight'].mean().sort_values(ascending=False).reset_index()

plt.figure(figsize=(12,6))
sns.barplot(data=county_summary.head(15), x='county_of_birth', y='low_birth_weight', palette='viridis')
plt.xticks(rotation=45)
plt.ylabel("Proportion of Low Birth Weight")
plt.title("Top 15 Counties by Low Birth Weight Rate")
plt.show()


## FIT RANDOM INTERCEPT MODEL

In [None]:
import pandas as pd
import statsmodels.formula.api as smf
# Standardize numeric predictors
# Predicted values for each birth
ohio_data['predicted_lbw'] = full_result.fittedvalues
ohio_model_data['age_std'] = (ohio_model_data['age_of_mother'] - ohio_model_data['age_of_mother'].mean()) / ohio_model_data['age_of_mother'].std()
ohio_model_data['edu_std'] = (ohio_model_data['Below.HS.Education.Percentage'] - ohio_model_data['Below.HS.Education.Percentage'].mean()) / ohio_model_data['Below.HS.Education.Percentage'].std()
ohio_model_data['unemp_std'] = (ohio_model_data['Unemployment.Rate'] - ohio_model_data['Unemployment.Rate'].mean()) / ohio_model_data['Unemployment.Rate'].std()

# Ensure county_of_birth is categorical
ohio_model_data['county_of_birth'] = ohio_model_data['county_of_birth'].astype('category')

# Fit random intercept model
full_model = smf.mixedlm(
    "low_birth_weight ~ age_std + C(race_of_mother) + edu_std + unemp_std",
    data=ohio_model_data,
    groups=ohio_model_data["county_of_birth"]
)
full_result = full_model.fit(reml=False)

print(full_result.summary())


## THE ODD RATIOS FROM MIXED-EFFECTS MODEL

In [None]:
# Visualizations
import numpy as np
ORs = np.exp(full_result.params)
print(ORs)


In [None]:
# Ohio Births Analysis Script (Corrected)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf

# Load data
birth_path = pd.read_csv(r"C:\Users\kehin\Downloads\OhioBirths.csv")
demographic = pd.read_csv(r"C:\Users\kehin\Downloads\OhioDemographics.csv")

# Clean and merge datasets
birth_path['county_of_birth'] = birth_path['county_of_birth'].str.strip().str.lower()
demographic['County'] = demographic['County'].str.strip().str.lower()
demographic = demographic.rename(columns={'County': 'county_of_birth'})

# Merge on county
ohio_data = pd.merge(birth_path, demographic, on='county_of_birth', how='left')

# Drop non-Ohio counties if needed
ohio_data = ohio_data[~ohio_data['Population'].isna()]

# Ensure numeric columns for standardization
ohio_data['age_of_mother'] = pd.to_numeric(ohio_data['age_of_mother'], errors='coerce')
ohio_data['Below.HS.Education.Percentage'] = pd.to_numeric(
    ohio_data['Below.HS.Education.Percentage'], errors='coerce')
ohio_data['Unemployment.Rate'] = pd.to_numeric(
    ohio_data['Unemployment.Rate'], errors='coerce')

# Drop rows with missing numeric values
ohio_data = ohio_data.dropna(subset=['age_of_mother', 'Below.HS.Education.Percentage', 'Unemployment.Rate'])

# Standardize continuous predictors
ohio_data['age_std'] = (ohio_data['age_of_mother'] - ohio_data['age_of_mother'].mean()) / ohio_data['age_of_mother'].std()
ohio_data['edu_std'] = (ohio_data['Below.HS.Education.Percentage'] - ohio_data['Below.HS.Education.Percentage'].mean()) / ohio_data['Below.HS.Education.Percentage'].std()
ohio_data['unemp_std'] = (ohio_data['Unemployment.Rate'] - ohio_data['Unemployment.Rate'].mean()) / ohio_data['Unemployment.Rate'].std()

# Reset index to avoid MixedLM index errors
ohio_data = ohio_data.reset_index(drop=True)

# -------------------------
# Descriptive statistics
desc_stats = ohio_data.describe()
desc_stats.to_csv("descriptive_statistics.csv")

# Histograms
plt.figure(figsize=(8,6))
sns.histplot(ohio_data['age_of_mother'], bins=30, kde=True)
plt.title("Age of Mothers")
plt.xlabel("Age")
plt.ylabel("Count")
plt.savefig("hist_age_of_mother.png", dpi=300)
plt.close()

plt.figure(figsize=(8,6))
sns.histplot(ohio_data['Below.HS.Education.Percentage'], bins=30, kde=True)
plt.title("Below High School Education Percentage")
plt.xlabel("Percentage")
plt.ylabel("Count")
plt.savefig("hist_edu.png", dpi=300)
plt.close()

plt.figure(figsize=(8,6))
sns.histplot(ohio_data['Unemployment.Rate'], bins=30, kde=True)
plt.title("Unemployment Rate")
plt.xlabel("Percentage")
plt.ylabel("Count")
plt.savefig("hist_unemp.png", dpi=300)
plt.close()

# -------------------------
# Random effects model
full_model = smf.mixedlm(
    "low_birth_weight ~ age_std + C(race_of_mother) + edu_std + unemp_std",
    data=ohio_data,
    groups=ohio_data["county_of_birth"]
)
full_result = full_model.fit(reml=False)

# Save model summary
with open("mixedlm_summary.txt", "w") as f:
    f.write(full_result.summary().as_text())

# Odds ratios
ORs = np.exp(full_result.params)
ORs.to_csv("odds_ratios.csv")

# Predicted values for visualization
ohio_data['predicted_lbw'] = full_result.predict()

# County-level mean predicted low birth weight
county_summary = ohio_data.groupby('county_of_birth')['predicted_lbw'].mean().reset_index()
county_summary.rename(columns={'predicted_lbw': 'mean_predicted_lbw'}, inplace=True)
county_summary.to_csv("county_mean_predicted_lbw.csv")

# -------------------------
# Bar plot visualization
plt.figure(figsize=(12,6))
sns.barplot(data=county_summary.sort_values('mean_predicted_lbw', ascending=False),
            x='county_of_birth', y='mean_predicted_lbw')
plt.xticks(rotation=90)
plt.title("Predicted Mean Low Birth Weight by County")
plt.xlabel("County")
plt.ylabel("Predicted Low Birth Weight")
plt.tight_layout()
plt.savefig("predicted_lbw_county_bar.png", dpi=300)
plt.close()

print("All outputs saved: descriptive stats, plots, model summary, ORs, predicted values.")
