In [10]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score


In [11]:
# Location of the cleaned longevity dataset, relative to the working directory
file_path = '../data/Cleaned/longevity.csv'

# Read the CSV and title-case every country name in the same expression
# (e.g. 'south africa' -> 'South Africa'); the column keeps its position.
data = pd.read_csv(file_path).assign(
    **{'Country Name': lambda frame: frame['Country Name'].str.title()}
)

# Narrow the dataframe to the hand-picked feature columns plus the country
# label and the target. One name per line keeps each string literal intact
# (a literal split across two physical lines is a SyntaxError) and makes diffs
# readable. Names are matched against the raw CSV header, so the padded names
# near the end (e.g. 'Measles ', ' BMI ') must keep their spaces here.
selected_columns = [
    'Country Name',
    'Age at first marriage, female',
    'Age at first marriage, male',
    'Capital health expenditure (% of GDP)',
    'Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions (% of total)',
    "Cause of death, by injury (% of total)",
    'Cause of death, by non-communicable diseases (% of total)',
    'Community health workers (per 1,000 people)',
    'Condom use, population ages 15-24, female (% of females ages 15-24)',
    'Condom use, population ages 15-24, male (% of males ages 15-24)',
    'Consumption of iodized salt (% of households)',
    'Current health expenditure (% of GDP)',
    'Births attended by skilled health staff (% of total)',
    'Contraceptive prevalence, any method (% of all women ages 15-49)',
    'Current health expenditure per capita (current US$)',
    'Demand for family planning satisfied by any methods (% of married women with demand for family planning)',
    'Diabetes prevalence (% of population ages 20 to 79)',
    'Diarrhea treatment (% of children under 5 receiving oral rehydration and continued feeding)',
    'Exclusive breastfeeding (% of children under 6 months)',
    'Female headed households (% of households with a female head)',
    'Hospital beds (per 1,000 people)',
    'Immunization, BCG (% of one-year-old children)',
    'Immunization, DPT (% of children ages 12-23 months)',
    'Immunization, HepB3 (% of one-year-old children)',
    'Immunization, Hib3 (% of children ages 12-23 months)',
    'Immunization, Pol3 (% of one-year-old children)',
    'Immunization, measles (% of children ages 12-23 months)',
    'Immunization, measles second dose (% of children by the nationally recommended age)',
    'Domestic private health expenditure per capita (current US$)',
    'Domestic general government health expenditure per capita (current US$)',
    'Incidence of tuberculosis (per 100,000 people)',
    'Literacy rate, adult total (% of people ages 15 and above)',
    'Literacy rate, youth total (% of people ages 15-24)',
    'Malaria cases reported',
    'Mortality caused by road traffic injury (per 100,000 people)',
    'Mortality from CVD, cancer, diabetes or CRD between exact ages 30 and 70 (%)',
    'Mortality rate attributed to household and ambient air pollution (per 100,000 population)',
    'Mortality rate attributed to unintentional poisoning (per 100,000 population)',
    'Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene (per 100,000 population)',
    # 'Net migration',
    'Newborns protected against tetanus (%)',
    'Number of people who are undernourished',
    'Nurses and midwives (per 1,000 people)',
    'People practicing open defecation (% of population)',
    'People using at least basic drinking water services (% of population)',
    'People using at least basic sanitation services (% of population)',
    'People using safely managed drinking water services (% of population)',
    'People using safely managed sanitation services (% of population)',
    'People with basic handwashing facilities including soap and water (% of population)',
    'Physicians (per 1,000 people)',
    'Poverty headcount ratio at national poverty line (% of population)',
    'Prevalence of HIV, total (% of population ages 15-49)',
    'Prevalence of current tobacco use (% of adults)',
    'Prevalence of hypertension (% of adults ages 30-79)',
    'Prevalence of overweight (% of adults)',
    'Prevalence of undernourishment (% of population)',
    'Public spending on education, total (% of GDP)',
    'Prevalence of underweight, weight for age (% of children under 5)',
    'Prevalence of anemia among women of reproductive age (% of women ages 15-49)',
    'Risk of impoverishing expenditure for surgical care (% of people at risk)',
    # 'Rural population', 'Urban population',
    'School enrollment, primary (% gross)',
    'Total alcohol consumption per capita (liters of pure alcohol, projected estimates, 15+ years of age)',
    'Specialist surgical workforce (per 100,000 population)',
    'Suicide mortality rate (per 100,000 population)',
    'Teenage mothers (% of women ages 15-19 who have had children or are currently pregnant)',
    'Unemployment, total (% of total labor force)',
    'Unmet need for contraception (% of married women ages 15-49)',
    'Urban poverty headcount ratio at national poverty lines (% of urban population)',
    'Rural poverty headcount ratio at national poverty lines (% of rural population)',
    'Vitamin A supplementation coverage rate (% of children ages 6-59 months)',
    'power_distance',
    'individualism',
    'motivation',
    'uncertainty_avoidance',
    'long_term_orientation',
    'indulgence',
    'Hepatitis B',
    'Measles ',
    ' BMI ',
    'Diphtheria ',
    ' HIV/AIDS',
    'GDP',
    'Polio',
    'Schooling',
    'Alcohol',
    'Life expectancy at birth, total (years)',
]

# Keep only the selected columns, in the order listed above
data = data[selected_columns]

# Some headers carry stray padding (e.g. 'Measles ', ' BMI '); trim leading and
# trailing whitespace from every column name.
data.columns = [column_name.strip() for column_name in data.columns]

# Show the first few rows to confirm the cleaned column names
data.head()

# Optional: persist the narrowed dataframe (left disabled).
# NOTE(review): this path spells the folder 'Data' while the load step reads
# from 'data' — confirm which one exists before enabling.
# data.to_csv('../Data/Cleaned/longevity_reduced.csv', index=False)



FileNotFoundError: [Errno 2] No such file or directory: '../data/Cleaned/longevity.csv'

In [60]:
# Column the model learns to predict
target = 'Life expectancy at birth, total (years)'

# Response vector, and the feature matrix with the target and the
# 'Country Name' label column removed (the label is not a model input)
y = data[target]
X = data.drop(columns=['Country Name', target])

# Hold out 20% of the rows; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build a 100-tree random forest and fit it on the training rows
# (fit() returns the estimator itself, so the call can be chained)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Function to predict life expectancy and report the 3 features that deviate most for a given country
def predict_life_expectancy(country_name):
    """Predict life expectancy for one country and summarise what sets it apart.

    Parameters
    ----------
    country_name : str
        Value matched exactly against the 'Country Name' column of ``data``.

    Returns
    -------
    tuple
        ``(prediction, deviations, top_z_scores, top_rf_features)`` where

        * ``prediction`` is the first element of ``rf_model.predict`` for the
          country's row,
        * ``deviations`` is a DataFrame indexed by feature name holding the
          three largest absolute deviations from the dataset mean
          ('Deviation') and whether the country is 'higher' or 'lower' than
          the mean ('Direction'),
        * ``top_z_scores`` is the result of
          ``find_top_z_score_features(country_name)``,
        * ``top_rf_features`` is the result of ``rf_feature_importance()``.

    Raises
    ------
    ValueError
        If no row of ``data`` matches ``country_name``.
    """
    # Find the row(s) in the dataframe that correspond to the given country
    matches = data[data['Country Name'] == country_name]
    if matches.empty:
        # Fail early with a readable message rather than passing an empty
        # feature frame on to the model.
        raise ValueError(f"No rows found for country {country_name!r} in 'Country Name'")

    # Keep exactly one record (the first match) as a 1-row DataFrame so the
    # prediction and the deviations below always describe the same row, even
    # if the country appears more than once.
    country_data = matches.iloc[[0]].drop(columns=[target, 'Country Name'])

    # Predict life expectancy using the trained model
    life_expectancy = rf_model.predict(country_data)

    # Signed deviation of each feature from the dataset mean. These are raw
    # differences in each feature's own units, so large-magnitude features
    # dominate the ranking; the z-score helper gives the scale-free view.
    deviations = country_data.iloc[0] - X.mean()

    # Three largest deviations by absolute size
    top_abs = deviations.abs().sort_values(ascending=False).head(3)
    deviations_with_direction = top_abs.to_frame(name='Deviation')

    # Only strictly positive deviations count as 'higher'; zero and NaN fall
    # through to 'lower'.
    deviations_with_direction['Direction'] = deviations.loc[top_abs.index].map(
        lambda value: 'higher' if value > 0 else 'lower'
    )

    top_z_score_features = find_top_z_score_features(country_name)
    top_rf_features = rf_feature_importance()

    return life_expectancy[0], deviations_with_direction, top_z_score_features, top_rf_features

# Function to find the features with the highest Z-scores for a given country
def find_top_z_score_features(country_name, top_n=3):
    """Return the features on which a country scores highest relative to the dataset.

    Parameters
    ----------
    country_name : str
        Value matched exactly against the 'Country Name' column of ``data``.
    top_n : int, optional
        Number of features to return (default 3).

    Returns
    -------
    pandas.Series
        Z-scores indexed by feature name, sorted from highest to lowest and
        truncated to ``top_n`` entries. Only the largest *positive* z-scores
        are reported; strongly negative ones are not included.

    Raises
    ------
    ValueError
        If no row of ``data`` matches ``country_name``.
    """
    # Isolate the row(s) for the given country
    matches = data[data['Country Name'] == country_name]
    if matches.empty:
        raise ValueError(f"No rows found for country {country_name!r} in 'Country Name'")

    # Keep the first match as a 1-row DataFrame of feature values
    country_row = matches.iloc[[0]].drop(columns=[target, 'Country Name'])

    # Mean and standard deviation of every feature over the whole dataset
    # (the selected country is included in these statistics)
    mean_values = X.mean()
    std_dev_values = X.std()

    # Z-scores for the country's features; take the single row as a Series
    # explicitly instead of relying on squeeze(), which changes return type
    # with the frame's shape.
    z_scores = ((country_row - mean_values) / std_dev_values).iloc[0]

    # Highest z-scores first
    top_z_scores = z_scores.sort_values(ascending=False).head(top_n)

    return top_z_scores
# Defined as live code (not wrapped in a string literal) because
# predict_life_expectancy() calls it; otherwise a fresh kernel raises NameError.
def rf_feature_importance():
    """Return the three most important features of the fitted random forest.

    The importances are read from ``rf_model.feature_importances_`` and
    describe the model as a whole; they are not specific to any one country.

    Returns
    -------
    tuple
        ``(top_features, top_importances, deviation)`` where ``top_features``
        holds the column labels of ``X`` for the three largest importances in
        ascending order of importance, ``top_importances`` the matching
        importance values, and ``deviation`` how far each of those values lies
        above the mean importance across all features.
    """
    importances = rf_model.feature_importances_
    indices = np.argsort(importances)[-3:]  # Indices of the top 3 features, ascending
    top_features = X.columns[indices]
    top_importances = importances[indices]
    # Compare against the mean importance over all features
    mean_importances = importances.mean()
    # How much higher (or lower) each top importance is than that mean
    deviation = top_importances - mean_importances
    return top_features, top_importances, deviation
# Example run for a single country; change `example_country` to try another one
example_country = 'China'
(
    predicted_life_expectancy,
    top_features,
    top_z_score_features,
    top_rf_features,
) = predict_life_expectancy(example_country)

# Headline prediction, then the table of the largest raw deviations
print(f'Predicted life expectancy for {example_country}: {predicted_life_expectancy:.2f} years')
print(f'Top 3 features: {top_features}')

# Blank separator line before the z-score section
print()

# One "feature: z-score" line per entry
print('Top Z-score features:')
for feature, z_score in top_z_score_features.items():
    print(f'{feature}: {z_score:.2f}')

print()

Predicted life expectancy for China: 72.90 years
Top 3 features:                                                        Deviation Direction
Number of people who are undernourished             6.417223e+07    higher
Immunization, Hib3 (% of children ages 12-23 mo...  4.445474e+06    higher
Community health workers (per 1,000 people)         3.723616e+06     lower

Top Z-score features:
Measles: 10.54
long_term_orientation: 3.18
Immunization, Hib3 (% of children ages 12-23 months): 2.70

