In [17]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import mysql.connector
from mysql.connector import Error
import joblib

In [10]:
# create a mysql database connection
try:
    connection = mysql.connector.connect(host='localhost',
                                         database='longevity',
                                         user='root',
                                         password='root')
    if connection.is_connected():
        db_Info = connection.get_server_info()
        print("Connected to MySQL database... MySQL Server version on ", db_Info)
        cursor = connection.cursor()
        cursor.execute("select database();")
        record = cursor.fetchone()
        print("Your connected to - ", record)
except Error as e:
    print("Error while connecting to MySQL", e)
'''
# load data from csv file into mysql database
file_path = '../data/Cleaned/longevity_reduced.csv'
df = pd.read_csv(file_path)
engine = create_engine('mysql+mysqlconnector://root:root@localhost:3306/longevity', echo=False)
df.to_sql(name='LONGEVITY', con=engine, if_exists = 'append', index=False)
'''

# read data from mysql database
data = pd.read_sql('SELECT * FROM longevity.LONGEVITY', con=connection)

connection.close()
data.head()

Error while connecting to MySQL 1045 (28000): Access denied for user 'root'@'localhost' (using password: YES)


NameError: name 'connection' is not defined

In [22]:
# Load the data
file_path = '../data/Cleaned/longevity_reduced.csv'
data = pd.read_csv(file_path)

# Select columns to normalize
columns_to_normalize = [col for col in data.columns if col not in ['Country Name', 'Life expectancy at birth, total (years)']]

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Normalize the selected columns
data[columns_to_normalize] = scaler.fit_transform(data[columns_to_normalize])

# Exclude the 'Country Name' column which is not needed for the model
target = 'Life expectancy at birth, total (years)'
X = data.drop(columns=[target, 'Country Name'])
y = data[target]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

# Initialize the Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=0)

# Fit the model on the training data
rf_model.fit(X_train, y_train)
preds = rf_model.predict(X_test)

# Function to predict life expectancy and top 3 influencing features for a given country
def predict_life_expectancy(country_name):
    # Find the row in the dataframe that corresponds to the given country
    country_data = data[data['Country Name'] == country_name].drop(columns=[target, 'Country Name'])
    
    # Predict life expectancy using the trained model
    life_expectancy = rf_model.predict(country_data)
    
    # Calculate the deviation of the country's feature values from the dataset mean
    deviations = country_data.squeeze() - X.mean()

    # Determine if the deviation is higher or lower than the mean
    deviation_direction = deviations.apply(lambda x: 'higher' if x > 0 else 'lower')

    # Combine the absolute deviation and its direction
    deviations_with_direction = deviations.abs().sort_values(ascending=False).head(5)
    deviations_with_direction = deviations_with_direction.to_frame(name='Deviation')
    deviations_with_direction['Direction'] = deviation_direction.loc[deviations_with_direction.index]

    top_z_score_features = find_top_z_score_features(country_name)
    
    return life_expectancy[0], deviations_with_direction, top_z_score_features

# Function to find the top features based on Z-scores alone for the United States
def find_top_z_score_features(country_name):
    # Isolate the row for the given country
    country_row = data[data['Country Name'] == country_name].drop(columns=[target, 'Country Name'])
    
    # Calculate the mean and std deviation for the features in the dataset excluding the selected country
    mean_values = X.mean()
    std_dev_values = X.std()
    
    # Calculate the Z-scores for the country's features
    z_scores = (country_row - mean_values) / std_dev_values
    z_scores = z_scores.squeeze()  # Convert to Series for easier manipulation
    
    # Sort the z-scores to find the features with the highest deviation from the mean
    top_z_scores = z_scores.sort_values(ascending=False).head(5)

    return top_z_scores
'''
def rf_feature_importance():
    # Get feature importances specific to the country's data
    importances = rf_model.feature_importances_
    indices = np.argsort(importances)[-3:]  # Get indices of top 3 features
    top_features = X.columns[indices]
    top_importances = importances[indices]
    # compare to mean feature importances to see if they are higher or lower
    mean_importances = rf_model.feature_importances_.mean()
    # return how much higher or lower the feature importances are (deviation)
    deviation = top_importances - mean_importances  
    return top_features, top_importances, deviation
'''
# Example: Predict for a given country (replace 'Afghanistan' with user input)
example_country = 'India'

# if country is not in dataset, return error message
if example_country not in data['Country Name'].values:
    print('Country not found in dataset')
    # return error message

# perform string formatting to ensure first letter of each word is capitalized
example_country = example_country.title()

predicted_life_expectancy, top_features, top_z_score_features = predict_life_expectancy(example_country)
# top features ranked in ascending order of importance
print(f'Predicted life expectancy for {example_country}: {predicted_life_expectancy:.2f} years')
print(f'Top 5 features: {top_features}')
# print top z score features on different lines

print()

print('Top 5 Z-score features:')
for feature, z_score in top_z_score_features.items():
    print(f'{feature}: {z_score:.2f}')

print()

Predicted life expectancy for India: 65.93 years
Top 5 features:                                                     Deviation Direction
Physicians                                           0.875940    higher
Condom use, population ages 15-24, male (% of m...   0.759399     lower
Demand for family planning                           0.740592     lower
Community health workers (per 1,000 people)          0.736842     lower
Female headed households (% of households with ...   0.736841     lower

Top 5 Z-score features:
Measles: 7.32
Physicians: 2.65
Number of undernourished people: 2.02
People using safely managed drinking water services: 1.25
Lack of culture and tradition: 0.97



In [16]:
model_filename = "../models/rf_model.joblib"
joblib.dump(rf_model, model_filename)

['../models/rf_model.joblib']