In [3]:
import json
import os
import pickle

import joblib
import mysql.connector
import numpy as np
import pandas as pd
import requests
from fuzzywuzzy import process
from mysql.connector import Error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

## Connect with MySQL to pull longevity data

In [2]:
# Connection settings are read from the environment so real credentials never
# have to be committed with the notebook. The fallbacks mirror the previous
# local-development values; override them via the LONGEVITY_DB_* variables.
db_config = {
    "host": os.environ.get("LONGEVITY_DB_HOST", "localhost"),
    "database": os.environ.get("LONGEVITY_DB_NAME", "longevity"),
    "user": os.environ.get("LONGEVITY_DB_USER", "root"),
    "password": os.environ.get("LONGEVITY_DB_PASSWORD", "root"),
}

# create a mysql database connection
connection = None
try:
    connection = mysql.connector.connect(**db_config)
    if connection.is_connected():
        db_Info = connection.get_server_info()
        print("Connected to MySQL database... MySQL Server version on ", db_Info)
        cursor = connection.cursor()
        try:
            cursor.execute("select database();")
            record = cursor.fetchone()
        finally:
            # Release the cursor even if the sanity query fails
            cursor.close()
        print("Your connected to - ", record)

    # read data from mysql database
    data = pd.read_sql('SELECT * FROM longevity.LONGEVITY', con=connection)
except Error as e:
    print("Error while connecting to MySQL", e)
    # Re-raise: continuing without a connection would only fail later with a
    # confusing NameError on `connection` / `data`.
    raise
finally:
    # Always release the connection, including when the read itself fails
    if connection is not None and connection.is_connected():
        connection.close()

data.head()

Connected to MySQL database... MySQL Server version on  8.0.21
Your connected to -  ('longevity',)


  data = pd.read_sql('SELECT * FROM longevity.LONGEVITY', con=connection)


Unnamed: 0,Country Name,"Age at first marriage, female","Age at first marriage, male",Capital health expenditure (% of GDP),Death due to communicable disease and nutrition conditions,Death due to injury,"Cause of death, by non-communicable diseases (% of total)","Community health workers (per 1,000 people)","Condom use, population ages 15-24, male (% of males ages 15-24)",Consumption of iodized salt (% of households),...,Hepatitis B,Measles,BMI,Diphtheria,HIV/AIDS,GDP,Polio,Schooling,Alcohol,"Life expectancy at birth, total (years)"
0,Afghanistan,21.4,24.7,0.2441649,49.118515,11.865492,39.015993,5053479.0,5053479.0,46.566667,...,64.5625,2362.25,15.51875,52.3125,0.1,340.015425,48.375,8.2125,0.014375,59.439
1,Africa Eastern And Southern,5053479.0,5053479.0,5053479.0,59.703069,9.171409,31.125521,5053479.0,34.60712,62.264491,...,80.862429,2331.640525,38.325809,82.193116,1.67715,7550.512712,82.196424,12.040561,4.604644,56.532946
2,Africa Western And Central,5053479.0,5053479.0,5053479.0,66.095155,7.494896,26.409949,0.1440893,38.39535,78.0864,...,80.862429,2331.640525,38.325809,82.193116,1.67715,7550.512712,82.196424,12.040561,4.604644,53.366361
3,Albania,24.06667,28.14,0.1452148,4.774622,6.005545,89.219834,5053479.0,49.8,88.833333,...,98.0,53.375,49.06875,98.0625,0.1,2119.726679,98.125,12.1375,4.84875,77.196062
4,Algeria,29.125,32.925,0.01103354,16.714915,9.92249,73.362596,5053479.0,5053479.0,74.9,...,78.740211,1943.875,48.74375,91.875,0.1,2847.853392,91.75,12.7125,0.669678,72.913625


## Scale Numeric data

In [13]:
# Read the reduced dataset and discard the columns that are excluded from modelling
file_path = '../data/Cleaned/longevity_reduced.csv'
data = pd.read_csv(file_path).drop(
    columns=["Measles", "Polio", "Literacy rate, youth total"]
)

# Everything except the country label and the target column gets rescaled
non_feature_columns = ('Country Name', 'Life expectancy at birth, total (years)')
columns_to_normalize = [col for col in data.columns if col not in non_feature_columns]

# Fit a MinMaxScaler (default settings) on the selected columns and write the
# rescaled values back into the same columns of `data`
scaler = MinMaxScaler()
data[columns_to_normalize] = scaler.fit_transform(data[columns_to_normalize])

## Train a random forest model

In [22]:

# Work on a copy so model preparation never mutates the scaled frame
data_copy = data.copy()
target = 'Life expectancy at birth, total (years)'

# 'Country Name' is an identifier rather than a predictor, so it is dropped
# from the feature matrix together with the target column
X = data_copy.drop(columns=[target, 'Country Name'])
y = data_copy[target]

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

# Initialize the Random Forest Regressor and fit it on the training rows only
rf_model = RandomForestRegressor(n_estimators=100, random_state=0)
rf_model.fit(X_train, y_train)

# Evaluate on the held-out rows so the model's quality is actually visible
# (previously the predictions were computed and then discarded)
preds = rf_model.predict(X_test)
print("Test R^2:", rf_model.score(X_test, y_test))
print("Test MAE:", np.mean(np.abs(preds - y_test)))

## Use Random Forest feature importances and Z-scores to identify the most influential features

In [25]:

def predict_life_expectancy(country_name):
    """Return a country's recorded life expectancy and its most notable features.

    Despite the name, no model prediction is made here: the life expectancy is
    the value stored in the dataset for that country.

    Parameters
    ----------
    country_name : str
        Value to match exactly against the 'Country Name' column.

    Returns
    -------
    tuple
        ``(life_expectancy, top_rf_features, top_z_score_features)`` where
        ``top_rf_features`` comes from ``rf_feature_importance`` and
        ``top_z_score_features`` from ``find_top_z_score_features``.

    Raises
    ------
    IndexError
        If no row matches ``country_name``. The exception type is the one the
        bare ``.values[0]`` lookup raised before; only the message is clearer.
    """
    # Use the actual (recorded) value for life expectancy
    recorded = data.loc[data['Country Name'] == country_name, target]
    if len(recorded) == 0:
        raise IndexError("Country {!r} not found in dataset".format(country_name))
    life_expectancy = recorded.values[0]

    # Feature columns of the country's row; used to recover feature names
    country_data = data_copy[data_copy['Country Name'] == country_name].drop(columns=[target, 'Country Name'])

    top_z_score_features = find_top_z_score_features(country_name)
    top_rf_features = rf_feature_importance(country_data)

    return life_expectancy, top_rf_features, top_z_score_features

def find_top_z_score_features(country_name, top_n=25):
    """Rank a country's features by how far they deviate from the dataset mean.

    Each feature value is converted to a Z-score using the column means and
    standard deviations of the notebook-level feature matrix ``X``.

    Parameters
    ----------
    country_name : str
        Value to match exactly against the 'Country Name' column of ``data``.
    top_n : int, default 25
        Number of features to return.

    Returns
    -------
    pandas.Series
        Signed Z-scores indexed by feature name, sorted by absolute value in
        descending order and truncated to ``top_n`` entries.

    Raises
    ------
    IndexError
        If no row matches ``country_name``.
    """
    # Isolate the feature columns for the given country
    country_rows = data[data['Country Name'] == country_name].drop(columns=[target, 'Country Name'])
    if len(country_rows) == 0:
        raise IndexError("Country {!r} not found in dataset".format(country_name))

    # Z-score of every feature relative to the whole dataset. Only the first
    # matching row is used: .squeeze() would hand back a DataFrame for
    # duplicate matches (or a scalar for a single feature column).
    z_scores = ((country_rows.iloc[[0]] - X.mean()) / X.std()).iloc[0]

    # Sort by magnitude while keeping each score's original sign
    ranked_z_scores = z_scores.sort_values(key=lambda scores: scores.abs(), ascending=False)

    return ranked_z_scores.head(top_n)

def rf_feature_importance(country_data, top_n=5, model=None):
    """Return the names of the forest's most important features.

    The importances are read from the fitted model as a whole, so the result
    does not depend on the values in ``country_data``; the frame is only used
    to translate importance positions back into column names.

    Parameters
    ----------
    country_data : pandas.DataFrame
        Frame whose columns are the model's features (assumed to be in the
        same order the model was trained on -- verify against the caller).
    top_n : int, default 5
        Number of feature names to return.
    model : object, optional
        Anything exposing ``feature_importances_``. Defaults to the
        notebook-level ``rf_model``.

    Returns
    -------
    list
        Up to ``top_n`` feature names in ascending order of importance
        (the most important feature is last).
    """
    # A slice of [-0:] would return every feature, so handle it explicitly
    if top_n <= 0:
        return []
    if model is None:
        model = rf_model

    importances = np.asarray(model.feature_importances_)
    # argsort is ascending, so the last `top_n` positions are the largest importances
    indices = np.argsort(importances)[-top_n:]
    return country_data.columns[indices].tolist()

## Run Model to get feature importances

In [31]:
# Country to analyse (replace with user input, e.g. 'China' or 'United States')
example_country = "India"

# Normalise capitalisation BEFORE the lookup so that e.g. 'united states'
# is matched against the title-cased names used in the dataset
example_country = example_country.title()

if example_country not in data['Country Name'].values:
    # Stop here: running the lookup for an unknown country would only raise
    print('Country not found in dataset')
else:
    life_expectancy, top_rf_features, top_z_score_features = predict_life_expectancy(example_country)

    # print life expectancy
    print('Life Expectancy for {}:'.format(example_country), life_expectancy)
    print()

    # print top random forest features on different lines
    print('Top {} Random Forest features:'.format(len(top_rf_features)))
    print()
    for feature in top_rf_features:
        print(feature)
    print()
    print()

    # The heading reports the real number of rows instead of a hard-coded count
    print('Top {} Z-score features:'.format(len(top_z_score_features)))
    display(top_z_score_features)

Life Expectancy for India: 66.0110625

Top 5 Random Forest features:

Basic drinking water services
School enrollment
Incidence of tuberculosis
Basic sanitation services
People practicing open defecation


Top 10 Z-score features:


Hepatitis B resistance                    -3.498595
Physicians                                 2.652181
Number of undernourished people            2.017245
BMI                                       -1.942703
Adversion to uncertainty                  -1.903572
Condom use, population ages 15-24         -1.773254
Demand for family planning                -1.686507
Female headed households                  -1.670177
Community health workers                  -1.670172
Vitamin A supplementation coverage rate   -1.578171
Positivity                                -1.537818
Teenage mothers                           -1.521927
Lack of managed drinking water services    1.245727
Diphtheria resistance                     -1.189190
Low Poverty Frequency                     -1.135008
Handwashing facilities                    -1.135003
Individualism                             -1.109126
Diarrhea treatment                        -1.068117
Consumption of iodized salt               -1.044204
Newborns pro

## Saving trained model as a joblib file

In [20]:
model_filename = "../models/rf_model.joblib"
# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing (e.g. on a fresh checkout)
os.makedirs(os.path.dirname(model_filename), exist_ok=True)
joblib.dump(rf_model, model_filename)

['../models/rf_model.joblib']

In [14]:
url = "https://cdn.jsdelivr.net/npm/world-atlas@2/countries-50m.json"
# Get json from url; the timeout keeps the cell from hanging on a stalled
# connection and raise_for_status turns HTTP errors into an exception instead
# of letting an error page reach r.json()
r = requests.get(url, timeout=30)
r.raise_for_status()
# Convert to dict
d = r.json()

print(d.keys())



dict_keys(['type', 'objects', 'arcs', 'bbox', 'transform'])


## Matching map country names to dataset country names for the WebApp

In [15]:
# Country names as spelled in the fetched world-atlas JSON
countries = [
    geometry["properties"]["name"]
    for geometry in d["objects"]["countries"]["geometries"]
]
sorted_countries = sorted(countries)

# Country names as spelled in the dataframe, as a plain list
df_countries = data["Country Name"].unique().tolist()
sorted_df_countries = sorted(df_countries)

# Fuzzy-match every atlas name against the dataframe names. Names whose best
# match scores below 94 are printed for manual review and mapped to None.
conversion_dict = {}
z = 0  # running count of atlas names without a confident match
for c in sorted_countries:
    match, score = process.extractOne(c, df_countries)

    if score < 94:
        print(f"{c} ---> {match}")
        conversion_dict[c] = None
        z += 1
    else:
        conversion_dict[c] = match
print(z)
print(conversion_dict)

# List the confident matches whose spelling differs between the two sources
for key, value in conversion_dict.items():
    if value and value != key:
        print(f"Key: {key}, Value: {value}")

# Write to a json file
# with open('../Data/country_conversion.json', 'w') as f:
#     json.dump(conversion_dict, f)

Anguilla ---> Angola
Antarctica ---> Qatar
Antigua and Barb. ---> Antigua And Barbuda
Ashmore and Cartier Is. ---> Fragile And Conflict Affected Situations
Bahamas ---> Bahamas, The
Bosnia and Herz. ---> Africa Eastern And Southern
Br. Indian Ocean Ter. ---> India
British Virgin Is. ---> British Virgin Islands
Brunei ---> Brunei Darussalam
Cayman Is. ---> Cayman Islands
Central African Rep. ---> Central African Republic
Congo ---> Congo, Dem. Rep.
Cook Is. ---> Solomon Islands
Curaçao ---> Curacao
Dominican Rep. ---> Dominica
Egypt ---> Egypt, Arab Rep.
Eq. Guinea ---> Guinea
Faeroe Is. ---> Faroe Islands
Falkland Is. ---> Poland
Fr. Polynesia ---> French Polynesia
Fr. S. Antarctic Lands ---> Iceland
Gambia ---> Gambia, The
Guernsey ---> Germany
Heard I. and McDonald Is. ---> Fragile And Conflict Affected Situations
Hong Kong ---> Hong Kong Sar, China
Indian Ocean Ter. ---> India
Iran ---> Iran, Islamic Rep.
Jersey ---> Oecd Members
Kyrgyzstan ---> Kazakhstan
Laos ---> Barbados
Macao -