In [3]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import mysql.connector
from mysql.connector import Error
import joblib
import pickle

In [2]:
# create a mysql database connection
try:
    connection = mysql.connector.connect(host='localhost',
                                         database='longevity',
                                         user='root',
                                         password='root')
    if connection.is_connected():
        db_Info = connection.get_server_info()
        print("Connected to MySQL database... MySQL Server version on ", db_Info)
        cursor = connection.cursor()
        cursor.execute("select database();")
        record = cursor.fetchone()
        print("Your connected to - ", record)
except Error as e:
    print("Error while connecting to MySQL", e)
'''
# load data from csv file into mysql database
file_path = '../data/Cleaned/longevity_reduced.csv'
df = pd.read_csv(file_path)
engine = create_engine('mysql+mysqlconnector://root:root@localhost:3306/longevity', echo=False)
df.to_sql(name='LONGEVITY', con=engine, if_exists = 'append', index=False)
'''

# read data from mysql database
data = pd.read_sql('SELECT * FROM longevity.LONGEVITY', con=connection)

connection.close()
data.head()

Connected to MySQL database... MySQL Server version on  8.0.21
Your connected to -  ('longevity',)


  data = pd.read_sql('SELECT * FROM longevity.LONGEVITY', con=connection)


Unnamed: 0,Country Name,"Age at first marriage, female","Age at first marriage, male",Capital health expenditure (% of GDP),Death due to communicable disease and nutrition conditions,Death due to injury,"Cause of death, by non-communicable diseases (% of total)","Community health workers (per 1,000 people)","Condom use, population ages 15-24, male (% of males ages 15-24)",Consumption of iodized salt (% of households),...,Hepatitis B,Measles,BMI,Diphtheria,HIV/AIDS,GDP,Polio,Schooling,Alcohol,"Life expectancy at birth, total (years)"
0,Afghanistan,21.4,24.7,0.2441649,49.118515,11.865492,39.015993,5053479.0,5053479.0,46.566667,...,64.5625,2362.25,15.51875,52.3125,0.1,340.015425,48.375,8.2125,0.014375,59.439
1,Africa Eastern And Southern,5053479.0,5053479.0,5053479.0,59.703069,9.171409,31.125521,5053479.0,34.60712,62.264491,...,80.862429,2331.640525,38.325809,82.193116,1.67715,7550.512712,82.196424,12.040561,4.604644,56.532946
2,Africa Western And Central,5053479.0,5053479.0,5053479.0,66.095155,7.494896,26.409949,0.1440893,38.39535,78.0864,...,80.862429,2331.640525,38.325809,82.193116,1.67715,7550.512712,82.196424,12.040561,4.604644,53.366361
3,Albania,24.06667,28.14,0.1452148,4.774622,6.005545,89.219834,5053479.0,49.8,88.833333,...,98.0,53.375,49.06875,98.0625,0.1,2119.726679,98.125,12.1375,4.84875,77.196062
4,Algeria,29.125,32.925,0.01103354,16.714915,9.92249,73.362596,5053479.0,5053479.0,74.9,...,78.740211,1943.875,48.74375,91.875,0.1,2847.853392,91.75,12.7125,0.669678,72.913625


In [12]:
# Load the data
file_path = '../data/Cleaned/longevity_reduced.csv'
data = pd.read_csv(file_path)
data.drop(columns=["Measles", "Polio", "Literacy rate, youth total"], inplace=True)

# Select columns to normalize
columns_to_normalize = [col for col in data.columns if col not in ['Country Name', 'Life expectancy at birth, total (years)']]

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Normalize the selected columns
data[columns_to_normalize] = scaler.fit_transform(data[columns_to_normalize])

# Exclude the 'Country Name' column which is not needed for the model
target = 'Life expectancy at birth, total (years)'
X = data.drop(columns=[target, 'Country Name'])
y = data[target]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

# Initialize the Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=0)

# Fit the model on the training data
rf_model.fit(X_train, y_train)
preds = rf_model.predict(X_test)

# save as pickled model without using joblib
with open('../models/rf_model.pkl', 'wb') as file:
    pickle.dump(rf_model, file)

# Function to predict life expectancy and top 3 influencing features for a given country
def predict_life_expectancy(country_name):
    # Find the row in the dataframe that corresponds to the given country
    country_data = data[data['Country Name'] == country_name].drop(columns=[target, 'Country Name'])
    
    # Predict life expectancy using the trained model
    life_expectancy = rf_model.predict(country_data)
    
    top_z_score_features = find_top_z_score_features(country_name)
    top_rf_features = rf_feature_importance(country_data)
    
    return top_rf_features, top_z_score_features

# Function to find the top features based on Z-scores alone for the United States
def find_top_z_score_features(country_name):
    # Isolate the row for the given country
    country_row = data[data['Country Name'] == country_name].drop(columns=[target, 'Country Name'])
    
    # Calculate the mean and std deviation for the features
    mean_values = X.mean()
    std_dev_values = X.std()
    
    # Calculate the Z-scores for the country's features
    z_scores = (country_row - mean_values) / std_dev_values
    z_scores = z_scores.squeeze() 

    # Make new series for z-score sign
    z_scores_signed = z_scores.copy()
    z_scores_signed[z_scores_signed > 0] = 1
    z_scores_signed[z_scores_signed < 0] = -1

    # Sort by absolute Z-score value
    sorted_z_scores = z_scores.abs().sort_values(ascending=False)
    
    # Multiply the sign by the absolute value to get a series with the sorted magnitudes
    for index, value in sorted_z_scores.items():
        sorted_z_scores[index] = value * z_scores_signed[index]

    top_z_scores = sorted_z_scores.head(25)

    return top_z_scores

def rf_feature_importance(country_data):
    # Get feature importances specific to the country's data
    importances = rf_model.feature_importances_
    indices = np.argsort(importances)[-5:]  # Get indices of top 3 features
    top_features = country_data.columns[indices].tolist()  # Get names of top 3 features
    # return if the deviation is higher or lower than the mean
    return top_features

# Example: Predict for a given country (replace 'Afghanistan' with user input)
example_country = 'China'
example_country = "United States"
example_country = "India"

# if country is not in dataset, return error message
if example_country not in data['Country Name'].values:
    print('Country not found in dataset')
    # return error message

# perform string formatting to ensure first letter of each word is capitalized
example_country = example_country.title()

top_rf_features, top_z_score_features = predict_life_expectancy(example_country)
# top features random forest features
print('Top 5 Random Forest features:')
# print top rf features on different lines
for feature in top_rf_features:
    print(feature)
print()

# print top z score features on different lines

print()

print('Top 10 Z-score features:')

display(top_z_score_features)
print()

NameError: name 'pickle' is not defined

In [20]:
model_filename = "../models/rf_model.joblib"
joblib.dump(rf_model, model_filename)

['../models/rf_model.joblib']

In [5]:
import json
import requests
url = "https://cdn.jsdelivr.net/npm/world-atlas@2/countries-50m.json"
# Get json from url
r = requests.get(url)
# Convert to dict
d = r.json()

print(d.keys())

dict_keys(['type', 'objects', 'arcs', 'bbox', 'transform'])


In [11]:
from fuzzywuzzy import process
countries = []
for country in d["objects"]["countries"]["geometries"]:
    name = country["properties"]["name"]
    countries.append(name)

sorted_countries = sorted(countries)

# Get countries from df
df_countries = data["Country Name"].unique()

# Convert to list
df_countries = df_countries.tolist()
sorted_df_countries = sorted(df_countries)


conversion_dict = {}
z = 0
for c in sorted_countries:
    match, score = process.extractOne(c, df_countries)
    
    if score >= 94:
        conversion_dict[c] = match
    else:
        print(f"{c} ---> {match}")
        conversion_dict[c] = None
        z += 1
print(z)
print(conversion_dict)

for key in conversion_dict.keys():
    if conversion_dict[key] and conversion_dict[key] != key:
        print(f"Key: {key}, Value: {conversion_dict[key]}")

# Write to a json file
# with open('../Data/country_conversion.json', 'w') as f:
#     json.dump(conversion_dict, f)

Defaulting to user installation because normal site-packages is not writeable
[0mCollecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
[0mInstalling collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
[0mNote: you may need to restart the kernel to use updated packages.




NameError: name 'd' is not defined

In [31]:
for c in sorted_df_countries:
    print(c)


Afghanistan
Africa Eastern And Southern
Africa Western And Central
Albania
Algeria
American Samoa
Andorra
Angola
Antigua And Barbuda
Arab World
Argentina
Armenia
Aruba
Australia
Austria
Azerbaijan
Bahamas, The
Bahrain
Bangladesh
Barbados
Belarus
Belgium
Belize
Benin
Bermuda
Bhutan
Bolivia
Bosnia And Herzegovina
Botswana
Brazil
British Virgin Islands
Brunei Darussalam
Bulgaria
Burkina Faso
Burundi
Cabo Verde
Cambodia
Cameroon
Canada
Caribbean Small States
Cayman Islands
Central African Republic
Central Europe And The Baltics
Chad
Channel Islands
Chile
China
Colombia
Comoros
Congo, Dem. Rep.
Congo, Rep.
Costa Rica
Cote D'Ivoire
Croatia
Cuba
Curacao
Cyprus
Czechia
Denmark
Djibouti
Dominica
Dominican Republic
Early-Demographic Dividend
East Asia & Pacific
East Asia & Pacific (Excluding High Income)
East Asia & Pacific (Ida & Ibrd Countries)
Ecuador
Egypt, Arab Rep.
El Salvador
Equatorial Guinea
Eritrea
Estonia
Eswatini
Ethiopia
Euro Area
Europe & Central Asia
Europe & Central Asia (Excludi