In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import os

df = pd.read_csv('Life Expectancy Data.csv')

print(df.shape)
print("\n")
print(df.head())
print("\n")
print(df.info())
print("\n")
print(df.describe())
print("\n")
print(df.isnull().sum())
print("\n")
df_clean = df.drop(['Country', 'Year'], axis=1).copy()

df_clean['Status'] = df_clean['Status'].map({'Developed': 1, 'Developing': 0})

for column in df_clean.columns:
    if df_clean[column].isnull().sum() > 0:
        df_clean[column] = df_clean[column].fillna(df_clean[column].median())

print("\n")
print("Očiščeni podaci: ")
print("\n")
print(df_clean.isnull().sum())
print("\n")
print(df_clean.info())



(2938, 22)


       Country  Year      Status  Life expectancy   Adult Mortality  \
0  Afghanistan  2015  Developing              65.0            263.0   
1  Afghanistan  2014  Developing              59.9            271.0   
2  Afghanistan  2013  Developing              59.9            268.0   
3  Afghanistan  2012  Developing              59.5            272.0   
4  Afghanistan  2011  Developing              59.2            275.0   

   infant deaths  Alcohol  percentage expenditure  Hepatitis B  Measles   ...  \
0             62     0.01               71.279624         65.0      1154  ...   
1             64     0.01               73.523582         62.0       492  ...   
2             66     0.01               73.219243         64.0       430  ...   
3             69     0.01               78.184215         67.0      2787  ...   
4             71     0.01                7.097109         68.0      3013  ...   

   Polio  Total expenditure  Diphtheria    HIV/AIDS         GDP  Populati

In [2]:
plt.figure(figsize=(20, 16))
correlation_matrix = df_clean.corr()
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True)
plt.title('Korelacijska matrica značajki')
plt.tight_layout()
plt.savefig('correlation_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

life_exp_corr = correlation_matrix['Life expectancy '].sort_values(ascending=False)
print("\nKorelacija sa životnim vijekom: ")
print(life_exp_corr)

important_features = life_exp_corr[abs(life_exp_corr) > 0.5].index.tolist()
important_features.remove('Life expectancy ')
print(f"\nOdabrane važne značajke ({len(important_features)}): ")
print(important_features)

In [None]:
X = df_clean[important_features]
y = df_clean['Life expectancy ']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"\nVeličina trening seta: {X_train.shape}")
print(f"Veličina test seta: {X_test.shape}")

models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42, max_depth=5)
}

results = {}

print("\n" + "="*80)
print("USPOREDBA MODELA")
print("="*80)

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    
    y_pred_train = model.predict(X_train_scaled)
    y_pred_test = model.predict(X_test_scaled)
    
    train_mae = mean_absolute_error(y_train, y_pred_train)
    test_mae = mean_absolute_error(y_test, y_pred_test)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    train_r2 = r2_score(y_train, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)
    
    results[name] = {
        'Train MAE': train_mae,
        'Test MAE': test_mae,
        'Train RMSE': train_rmse,
        'Test RMSE': test_rmse,
        'Train R²': train_r2,
        'Test R²': test_r2
    }
    
    print(f"\n{name}:")
    print(f"  Train MAE: {train_mae:.4f} | Test MAE: {test_mae:.4f}")
    print(f"  Train RMSE: {train_rmse:.4f} | Test RMSE: {test_rmse:.4f}")
    print(f"  Train R²: {train_r2:.4f} | Test R²: {test_r2:.4f}")

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

metrics = ['Test MAE', 'Test RMSE', 'Test R²']
for idx, metric in enumerate(metrics):
    values = [results[model][metric] for model in results.keys()]
    axes[idx].bar(results.keys(), values, color=['#3498db', '#e74c3c', '#2ecc71'])
    axes[idx].set_title(f'{metric} Comparison', fontsize=14, fontweight='bold')
    axes[idx].set_ylabel(metric)
    axes[idx].tick_params(axis='x', rotation=45)
    
    for i, v in enumerate(values):
        axes[idx].text(i, v, f'{v:.3f}', ha='center', va='bottom')

plt.tight_layout()
plt.savefig('model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()


Random Forest:
  Train MAE: 0.8228 | Test MAE: 1.3067
  Train RMSE: 1.1770 | Test RMSE: 1.8296
  Train R²: 0.9848 | Test R²: 0.9614

Gradient Boosting:
  Train MAE: 1.0470 | Test MAE: 1.4086
  Train RMSE: 1.4523 | Test RMSE: 1.9461
  Train R²: 0.9769 | Test R²: 0.9563


In [None]:

best_model = models['Random Forest']

feature_importance = pd.DataFrame({
    'feature': important_features,
    'importance': best_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nVažnost značajki:")
print(feature_importance)

plt.figure(figsize=(10, 6))
plt.barh(feature_importance['feature'], feature_importance['importance'], color='#3498db')
plt.xlabel('Importance')
plt.title('Feature Importance - Random Forest Model')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

os.makedirs("models", exist_ok=True)

joblib.dump(best_model, "models/life_expectancy_model.pkl")
joblib.dump(scaler, "models/scaler.pkl")
joblib.dump(important_features, "models/features.pkl")

print("\nOdabrane značajke za API:")
print(important_features)