In [5]:
##Key tasks for the project:
# Task 0: Import necessary libraries and load the dataset.
# Task 1: Explore the dataset, identify missing values, and determine the percentage of missing data for each feature.
# Task 2: Decide on the appropriate method for handling missing data (imputation, removal, or using algorithms that handle missing values natively) and justify your choice.
# Task 3: Implement the chosen method and evaluate its impact on the dataset.
# Task 4: Explore the dataset and identify potential features.
# Task 5: Implement feature engineering techniques to create new features.
# Task 6: Evaluate the impact of the new features on model performance.
# Task 7: Explore the dataset and identify key variables for visualization.
# Task 8: Use libraries like Matplotlib, Seaborn, and Plotly to create various types of visualizations (e.g., scatter plots, heatmaps, 3D plots).
# Task 9: Interpret the visualizations to uncover patterns and insights in the data.
# Task 10: Data Splitting, Training of models
# Task 11: Cross Validation and Evaluation of the model
# Task 12: Conclusions, Actionable insights, Recommendation and summary

In [6]:
#Import Necessary Libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sb
from mpl_toolkits.mplot3d import Axes3D
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_absolute_error, r2_score
from sklearn.model_selection import KFold, cross_val_score, cross_validate, train_test_split
# xgboost is a separate package that must be installed in the kernel environment;
# it is imported last so that a missing install does not prevent the imports above from running.
from xgboost import XGBRegressor
  


ModuleNotFoundError: No module named 'xgboost'

Task 1: Explore the dataset, identify missing values, and determine the percentage of missing data for each feature.

In [None]:
# Load the dataset and normalise its header right away.
# Column labels are stripped of leading/trailing whitespace at load time so that
# every later cell can address columns by their clean names, regardless of the
# order in which the cells are executed.
df = pd.read_csv('Life_Expectancy_Data.csv')
df.columns = df.columns.str.strip()

df.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns)
df.shape 

In [None]:
# Per-column dtype and non-null count, plus overall memory usage
df.info()

In [None]:
# Store the two label columns with the memory-efficient 'category' dtype
categorical_cols = ['Country', 'Status']
for col_name in categorical_cols:
    df[col_name] = df[col_name].astype('category')

# Confirm the conversion took effect
print("Updated Data Types:")
print(df.dtypes[categorical_cols])




In [None]:
# Preview the first rows of the frame after the dtype conversion
df.head()

In [None]:
# Normalise the header: drop whitespace padding around every column label
df.columns = [label.strip() for label in df.columns]

In [None]:
# Summary statistics of the numeric columns, one row per column
df.describe().transpose()

In [None]:
# Flag rows that exactly repeat an earlier row, then count the flags
is_repeat = df.duplicated()
duplicates = is_repeat.sum()
print(f"\nNumber of duplicate rows: {duplicates}")

In [None]:
# Per-column count of missing cells and the share of rows they represent
n_rows = len(df)
missing_values = df.isna().sum()
missing_percent = missing_values / n_rows * 100

# Put both measures side by side, worst-affected columns first
missing_table = pd.concat(
    {'Missing Values': missing_values, '% Missing': missing_percent.round(2)},
    axis=1,
).sort_values(by='Missing Values', ascending=False)

print(" Missing Values Summary:")
display(missing_table)

Task 2: Decide on the appropriate method for handling missing data 
 


In [None]:
# After examining the dataset, there are no missing values in any of the features.
# Therefore, no imputation, removal, or special handling is required.

Task 3: Implement the chosen method and evaluate its impact on the dataset

In [None]:
# Since no missing data was found, no changes were made to the dataset.
# The structure and distribution of the data remain unchanged

Task 4: Explore the dataset and identify potential features.

In [None]:
# Encode 'Status' as a numeric flag: Developing -> 0, Developed -> 1.
# 'Status' was cast to the 'category' dtype earlier, and mapping a categorical
# one-to-one yields another categorical, which the numeric-only correlation and
# heatmap cells would silently leave out. Mapping the plain object values
# instead produces a genuinely numeric column (NaN where a label is unmapped).
status_to_code = {'Developing': 0, 'Developed': 1}
df['Status_Code'] = df['Status'].astype(object).map(status_to_code)


In [None]:
# Rank every numeric feature by its correlation with the target column
numeric_corr = df.corr(numeric_only=True)
correlation = numeric_corr['Life expectancy'].sort_values(ascending=False)
print("Correlation with Life Expectancy:\n")
print(correlation)



In [None]:
# Pairwise correlations between all numeric columns, drawn as an annotated heatmap
numeric_only_df = df.select_dtypes(include=[np.number])
fig, ax = plt.subplots(figsize=(16, 12))
sb.heatmap(numeric_only_df.corr(), annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Remove redundant features that are highly correlated with another feature (|correlation| > 0.8):
#   - 'under-five deaths'      : redundant with 'infant deaths'
#   - 'thinness 5-9 years'     : redundant with 'thinness  1-19 years'
#   - 'percentage expenditure' : redundant with 'GDP'
# NOTE(review): the GDP relationship was originally described as negative —
# confirm the sign against the correlation heatmap.
redundant_cols = ['under-five deaths', 'thinness 5-9 years', 'percentage expenditure']

# Non-mutating drop with errors='ignore' so the cell can be re-run safely:
# an in-place drop raises KeyError on the second run because the columns are already gone.
df = df.drop(columns=redundant_cols, errors='ignore')

# Kept as the last expression of the cell so the resulting shape is actually displayed
df.shape

In [None]:
# Measure the skewness of every int64/float64 column
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
skewness = df.loc[:, numerical_cols].skew()
print("\nSkewness of numerical columns:", skewness, sep="\n")

In [None]:
# Log-transform the heavily skewed numeric columns.
# NOTE(review): `numerical_cols` holds every int64/float64 column, so the target
# column would be transformed too if its skewness ever crossed the threshold —
# confirm that is acceptable.
SKEW_THRESHOLD = 5  # |skewness| at or above this counts as "highly skewed"

high_skew = skewness[skewness.abs() >= SKEW_THRESHOLD].index
for col in high_skew:
    # log1p is undefined at or below -1; transforming such a column would
    # silently inject NaN / -inf, so it is left untouched and reported instead.
    if (df[col] <= -1).any():
        print(f"Skipped log transformation for {col} (contains values <= -1)")
        continue
    df[col] = np.log1p(df[col])
    print(f"Applied log transformation to {col}")

# Recompute so the effect of the transformation is visible
skewness = df[numerical_cols].skew()
print("\nSkewness after transformation:")
print(skewness)

Task 5: Implement feature engineering techniques to create new features.

In [None]:
# Engineered features.
# NOTE(review): the skewness step log-transforms, in place, any column with
# |skew| >= 5. If 'GDP', 'Population', 'Total expenditure' or 'Measles' were
# among them, the ratios and averages below are computed on logged values —
# confirm that this is intended.


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator element-wise with unusable results imputed.

    Division by zero produces +/-inf; those are converted to NaN first so they
    do not distort the mean, then every NaN is replaced by the mean of the
    remaining valid ratios.
    """
    ratio = (numerator / denominator).replace([np.inf, -np.inf], np.nan)
    return ratio.fillna(ratio.mean())


# GDP per capita
df['GDP_per_capita'] = _safe_ratio(df['GDP'], df['Population'])

# Vaccination composite feature: row-wise mean of the four listed columns
# NOTE(review): confirm 'Measles' is a coverage percentage like the other three;
# if it is a case count, this average mixes units.
df['Vaccination'] = df[['Hepatitis B', 'Measles', 'Polio', 'Diphtheria']].mean(axis=1)

# BMI categories.
# right=False makes every bin closed on the left, so the cut-offs 18.5, 25 and 30
# fall into the higher category and a BMI of exactly 0 stays in the first bin
# instead of becoming NaN.
bins = [0, 18.5, 25, 30, np.inf]
labels = ['Underweight', 'Normal', 'Overweight', 'Obese']
df['BMI Category'] = pd.cut(df['BMI'], bins=bins, labels=labels, right=False)

# Total expenditure per capita (same inf/NaN handling as GDP per capita)
df['Total_expenditure_per_capita'] = _safe_ratio(df['Total expenditure'], df['Population'])

# Is_Developed flag: 1 for 'Developed', 0 for anything else.
# A vectorised comparison is used instead of .apply so the result is a plain
# integer column even though 'Status' has the 'category' dtype.
df['Is_Developed'] = (df['Status'] == 'Developed').astype(int)

df[['GDP_per_capita', 'Vaccination', 'BMI Category', 'Total_expenditure_per_capita', 'Is_Developed']].head()



In [None]:
# Correlation heatmap over the current numeric columns of the frame
corr_matrix = df.select_dtypes(include=[np.number]).corr()
plt.figure(figsize=(16, 12))
heatmap_ax = sb.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
heatmap_ax.set_title("Feature Correlation Heatmap")
plt.show()

 Task 6: Evaluate the impact of the new features on model performance.

In [None]:

# Target variable
y = df['Life expectancy']

# Baseline feature set: original columns plus the development flag
base_columns = ['Is_Developed', 'Adult Mortality', 'infant deaths',
                'Alcohol', 'Hepatitis B', 'Measles', 'BMI',
                'Polio', 'Total expenditure', 'Diphtheria',
                'HIV/AIDS', 'GDP', 'Population', 'thinness  1-19 years',
                'Income composition of resources', 'Schooling']
X_base = df[base_columns]

# Candidate feature set: engineered columns take the place of their raw inputs
new_columns = ['Adult Mortality', 'infant deaths', 'Alcohol',
               'HIV/AIDS', 'thinness  1-19 years', 'Income composition of resources', 'Schooling',
               'GDP_per_capita', 'Vaccination', 'BMI Category',
               'Total_expenditure_per_capita', 'Is_Developed']
# One-hot encode the categorical columns, dropping the first level of each
X_new = pd.get_dummies(df[new_columns], drop_first=True)

# Split the row labels once so both feature sets share identical train/test rows
train_idx, test_idx = train_test_split(df.index, test_size=0.2, random_state=42)

Xb_train, Xb_test = X_base.loc[train_idx], X_base.loc[test_idx]
Xn_train, Xn_test = X_new.loc[train_idx], X_new.loc[test_idx]
yb_train, yb_test = y.loc[train_idx], y.loc[test_idx]


def _holdout_scores(estimator, X_train, X_test, y_train, y_test):
    """Fit `estimator` on the training rows and score it on the test rows.

    Returns a tuple (predictions, R², MAE) for the test rows.
    """
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)
    return predictions, r2_score(y_test, predictions), mean_absolute_error(y_test, predictions)


# The same estimator object is refitted for each feature set
model = LinearRegression()
y_pred_base, r2_base, mae_base = _holdout_scores(model, Xb_train, Xb_test, yb_train, yb_test)
y_pred_new, r2_new, mae_new = _holdout_scores(model, Xn_train, Xn_test, yb_train, yb_test)

# Compare results
print(
    "Model Performance Comparison:",
    f"R² (Base Model): {r2_base:.3f}",
    f"MAE (Base Model): {mae_base:.3f}",
    "",
    f"R² (With Engineered Features): {r2_new:.3f}",
    f"MAE (With Engineered Features): {mae_new:.3f}",
    sep="\n",
)


Task 7 & 8: Identification and Visualization of Key Variables Using Matplotlib, Seaborn, and Plotly

In [None]:
numeric_df = df.select_dtypes(include=['float64', 'int64'])

# The 13 strongest absolute correlations with the target. The target itself is
# among them (|r| = 1), which leaves its 12 most correlated features.
target_corr = numeric_df.corr()['Life expectancy'].abs()
corr_features = target_corr.sort_values(ascending=False).head(13).index

fig, ax = plt.subplots(figsize=(12, 8))
sb.heatmap(numeric_df[corr_features].corr(), annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap (Top Features Related to Life Expectancy)", fontsize=14)
plt.show()


In [None]:
# Scatter plots of the target against three selected features, one panel each
top_features = ['Schooling', 'Income composition of resources', 'Adult Mortality']

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for panel, feature in zip(axes, top_features):
    # Points coloured by development status, with a red regression line over all rows
    sb.scatterplot(data=df, x=feature, y='Life expectancy', hue='Status', alpha=0.7, ax=panel)
    sb.regplot(x=df[feature], y=df['Life expectancy'], scatter=False, color='red', ax=panel)
    panel.set_title(f'Life Expectancy vs {feature}')
    panel.set_xlabel(feature)
    panel.set_ylabel('Life Expectancy')

fig.tight_layout()
plt.show()



In [None]:
# Box plot: spread of life expectancy within each development status
fig, ax = plt.subplots(figsize=(10, 6))
sb.boxplot(data=df, x='Status', y='Life expectancy', ax=ax)
ax.set_title("Life Expectancy by Development Status", fontsize=14)
plt.show()


In [None]:
# Bar plot: average life expectancy for each value of 'Status'
plt.figure(figsize=(6, 5))
bar_ax = sb.barplot(data=df, x='Status', y='Life expectancy')
bar_ax.set(
    title='Average Life Expectancy by Development Status',
    xlabel='Development Status',
    ylabel='Life Expectancy',
)
plt.tight_layout()
plt.show()





In [None]:
# Life expectancy over the years, one line per development status.
# `errorbar=None` switches off the confidence band; it is the current spelling
# of the `ci=None` argument, which seaborn deprecated in v0.12.
plt.figure(figsize=(10, 6))
sb.lineplot(x='Year', y='Life expectancy', data=df, hue='Status', errorbar=None)
plt.title("Life Expectancy Trend Over Years")
plt.show()

In [None]:
# 3D Plot – Schooling vs Income Composition of Resources vs Adult Mortality (Using Plotly)
# Points are coloured by 'Status' and sized by the 'Vaccination' composite.
# The title names the three columns that are actually placed on the axes.
fig = px.scatter_3d(df, x='Schooling', y='Income composition of resources', z='Adult Mortality',
                    color='Status', size='Vaccination', opacity=0.7,
                    title="3D Scatter: Schooling vs Income Composition of Resources vs Adult Mortality")
fig.show()





In [None]:
# Histogram (30 bins) of the target with a kernel-density curve on top
hist_ax = sb.histplot(data=df, x='Life expectancy', kde=True, bins=30)
hist_ax.set_title("Distribution of Life Expectancy")
plt.show()


In [None]:
# Violin plot: life expectancy per BMI category, each violin split by 'Status'
fig, ax = plt.subplots(figsize=(10, 6))
sb.violinplot(data=df, x='BMI Category', y='Life expectancy', hue='Status', split=True, ax=ax)
ax.set_title("Life Expectancy Distribution by BMI Category")
plt.show()

Task 10: Data Splitting, Training of models

In [None]:

# Model inputs: the engineered feature set, with categorical columns one-hot
# encoded (first level of each dropped)
feature_columns = ['Adult Mortality', 'infant deaths', 'Alcohol',
                   'HIV/AIDS', 'thinness  1-19 years', 'Income composition of resources', 'Schooling',
                   'GDP_per_capita', 'Vaccination', 'BMI Category',
                   'Total_expenditure_per_capita', 'Is_Developed']
X = pd.get_dummies(df[feature_columns], drop_first=True)

# Target variable
y = df['Life expectancy']

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest: fit, predict the held-out rows, score
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)
r2_rf, mae_rf = r2_score(y_test, rf_preds), mean_absolute_error(y_test, rf_preds)

# XGBoost: same procedure on the same split
xgb_model = XGBRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)
r2_xgb, mae_xgb = r2_score(y_test, xgb_preds), mean_absolute_error(y_test, xgb_preds)

#  Compare Results
print("Model Performance Comparison :")
print(f"Random Forest R²: {r2_rf:.3f} | MAE: {mae_rf:.2f}")
print(f"XGBoost       R²: {r2_xgb:.3f} | MAE: {mae_xgb:.2f}")


Task 11: Cross Validation and Evaluation of the model

In [None]:

# Target variable
y = df['Life expectancy']

# Engineered feature set, with categorical columns one-hot encoded
X = df[['Adult Mortality', 'infant deaths', 'Alcohol',
            'HIV/AIDS', 'thinness  1-19 years', 'Income composition of resources', 'Schooling',
            'GDP_per_capita', 'Vaccination', 'BMI Category', 
            'Total_expenditure_per_capita', 'Is_Developed']]
X = pd.get_dummies(X, drop_first=True)

#  Define the model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Set up K-Fold Cross-Validation (5 folds)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Score both metrics in ONE cross-validation pass. Calling cross_val_score once
# per metric refits the forest on every fold for each call (10 fits in total);
# cross_validate evaluates all scorers on the same 5 fitted models and yields
# the same per-fold scores.
cv_results = cross_validate(
    model, X, y, cv=kf,
    scoring={'r2': 'r2', 'mae': make_scorer(mean_absolute_error)},
)
r2_scores = cv_results['test_r2']    # per-fold R²
mae_scores = cv_results['test_mae']  # per-fold MAE (positive: lower is better)

# Print cross-validation results
print("Cross-Validation Performance:")
print(f"R² Scores: {r2_scores}")
print(f"MAE Scores: {mae_scores}")
print("\nAverage Performance:")
print(f" Mean R² Score: {np.mean(r2_scores):.3f}")
print(f" Mean MAE: {np.mean(mae_scores):.2f}")
