In [24]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
import sqlalchemy
from sqlalchemy import create_engine
import mysql.connector
from mysql.connector import Error
from mysql.connector import errorcode

In [25]:
# create a mysql database connection
try:
    connection = mysql.connector.connect(host='localhost',
                                         database='longevity',
                                         user='root',
                                         password='root')
    if connection.is_connected():
        db_Info = connection.get_server_info()
        print("Connected to MySQL database... MySQL Server version on ", db_Info)
        cursor = connection.cursor()
        cursor.execute("select database();")
        record = cursor.fetchone()
        print("Your connected to - ", record)
except Error as e:
    print("Error while connecting to MySQL", e)
'''
# load data from csv file into mysql database
file_path = '../data/Cleaned/longevity_reduced.csv'
df = pd.read_csv(file_path)
engine = create_engine('mysql+mysqlconnector://root:root@localhost:3306/longevity', echo=False)
df.to_sql(name='LONGEVITY', con=engine, if_exists = 'append', index=False)
'''

# read data from mysql database
data = pd.read_sql('SELECT * FROM longevity.LONGEVITY', con=connection)

connection.close()
data.head()

Connected to MySQL database... MySQL Server version on  8.0.21
Your connected to -  ('longevity',)


  data = pd.read_sql('SELECT * FROM longevity.LONGEVITY', con=connection)


Unnamed: 0,Country Name,"Age at first marriage, female","Age at first marriage, male",Capital health expenditure (% of GDP),Death due to communicable disease and nutrition conditions,Death due to injury,"Cause of death, by non-communicable diseases (% of total)","Community health workers (per 1,000 people)","Condom use, population ages 15-24, male (% of males ages 15-24)",Consumption of iodized salt (% of households),...,Hepatitis B,Measles,BMI,Diphtheria,HIV/AIDS,GDP,Polio,Schooling,Alcohol,"Life expectancy at birth, total (years)"
0,Afghanistan,21.4,24.7,0.2441649,49.118515,11.865492,39.015993,5053479.0,5053479.0,46.566667,...,64.5625,2362.25,15.51875,52.3125,0.1,340.015425,48.375,8.2125,0.014375,59.439
1,Africa Eastern And Southern,5053479.0,5053479.0,5053479.0,59.703069,9.171409,31.125521,5053479.0,34.60712,62.264491,...,80.862429,2331.640525,38.325809,82.193116,1.67715,7550.512712,82.196424,12.040561,4.604644,56.532946
2,Africa Western And Central,5053479.0,5053479.0,5053479.0,66.095155,7.494896,26.409949,0.1440893,38.39535,78.0864,...,80.862429,2331.640525,38.325809,82.193116,1.67715,7550.512712,82.196424,12.040561,4.604644,53.366361
3,Albania,24.06667,28.14,0.1452148,4.774622,6.005545,89.219834,5053479.0,49.8,88.833333,...,98.0,53.375,49.06875,98.0625,0.1,2119.726679,98.125,12.1375,4.84875,77.196062
4,Algeria,29.125,32.925,0.01103354,16.714915,9.92249,73.362596,5053479.0,5053479.0,74.9,...,78.740211,1943.875,48.74375,91.875,0.1,2847.853392,91.75,12.7125,0.669678,72.913625


In [27]:
# Load the data
file_path = '../data/Cleaned/longevity_reduced.csv'
data = pd.read_csv(file_path)

# Exclude the 'Country Name' column which is not needed for the model
target = 'Life expectancy at birth, total (years)'
X = data.drop(columns=[target, 'Country Name'])
y = data[target]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

# Initialize the Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=0)

# Fit the model on the training data
rf_model.fit(X_train, y_train)
preds = rf_model.predict(X_test)

# Function to predict life expectancy and top 3 influencing features for a given country
def predict_life_expectancy(country_name):
    # Find the row in the dataframe that corresponds to the given country
    country_data = data[data['Country Name'] == country_name].drop(columns=[target, 'Country Name'])
    
    # Predict life expectancy using the trained model
    life_expectancy = rf_model.predict(country_data)
    
    # Calculate the deviation of the country's feature values from the dataset mean
    deviations = country_data.squeeze() - X.mean()
    print(deviations)
    # Determine if the deviation is higher or lower than the mean
    deviation_direction = deviations.apply(lambda x: 'higher' if x > 0 else 'lower')

    # Combine the absolute deviation and its direction
    deviations_with_direction = deviations.abs().sort_values(ascending=False).head(5)
    deviations_with_direction = deviations_with_direction.to_frame(name='Deviation')
    deviations_with_direction['Direction'] = deviation_direction.loc[deviations_with_direction.index]

    top_z_score_features = find_top_z_score_features(country_name)
    
    return life_expectancy[0], deviations_with_direction, top_z_score_features

# Function to find the top features based on Z-scores alone for the United States
def find_top_z_score_features(country_name):
    # Isolate the row for the given country
    country_row = data[data['Country Name'] == country_name].drop(columns=[target, 'Country Name'])
    
    # Calculate the mean and std deviation for the features in the dataset excluding the selected country
    mean_values = X.mean()
    std_dev_values = X.std()
    
    # Calculate the Z-scores for the country's features
    z_scores = (country_row - mean_values) / std_dev_values
    z_scores = z_scores.squeeze()  # Convert to Series for easier manipulation
    
    # Sort the z-scores to find the features with the highest deviation from the mean
    top_z_scores = z_scores.sort_values(ascending=False).head(5)

    return top_z_scores
'''
def rf_feature_importance():
    # Get feature importances specific to the country's data
    importances = rf_model.feature_importances_
    indices = np.argsort(importances)[-3:]  # Get indices of top 3 features
    top_features = X.columns[indices]
    top_importances = importances[indices]
    # compare to mean feature importances to see if they are higher or lower
    mean_importances = rf_model.feature_importances_.mean()
    # return how much higher or lower the feature importances are (deviation)
    deviation = top_importances - mean_importances  
    return top_features, top_importances, deviation
'''
# Example: Predict for a given country (replace 'Afghanistan' with user input)
example_country = 'India'

# if country is not in dataset, return error message
if example_country not in data['Country Name'].values:
    print('Country not found in dataset')
    # return error message

# perform string formatting to ensure first letter of each word is capitalized
example_country = example_country.title()

predicted_life_expectancy, top_features, top_z_score_features = predict_life_expectancy(example_country)
# top features ranked in ascending order of importance
print(f'Predicted life expectancy for {example_country}: {predicted_life_expectancy:.2f} years')
print(f'Top 5 features: {top_features}')
# print top z score features on different lines

print()

print('Top 5 Z-score features:')
for feature, z_score in top_z_score_features.items():
    print(f'{feature}: {z_score:.2f}')

print()
#predicted_life_expectancy, top_features

     Age at first marriage, female  Age at first marriage, male  \
111                  -1.120883e+06                -1.272865e+06   
377                  -1.120883e+06                -1.272865e+06   

     Capital health expenditure (% of GDP)  \
111                          -1.899804e+06   
377                          -1.899804e+06   

     Death due to communicable disease and nutrition conditions  \
111                                      -740908.93324            
377                                      -740908.93324            

     Death due to injury  \
111       -740921.611301   
377       -740921.611301   

     Cause of death, by non-communicable diseases (% of total)  \
111                                     -740925.719705           
377                                     -740925.719705           

     Community health workers (per 1,000 people)  \
111                                -3.723616e+06   
377                                -3.723616e+06   

     Condom use,

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().