In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


# Load the raw dataset. The path is relative, so the CSV is expected to sit in
# the notebook's working directory.
DATA_PATH = 'Life Expectancy Data.csv'
df = pd.read_csv(DATA_PATH)

# Structural overview: shape, first rows, column dtypes and summary statistics.
print("Dimenzije dataseta:", df.shape)
print("\nPrvih 5 redova:")
print(df.head())
print("\nInformacije o kolonama:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
df.info()
print("\nStatistička analiza:")
print(df.describe())

# Missing-value audit: per-column counts, then the share of empty cells over
# the whole table (df.size == number of rows * number of columns).
missing_per_column = df.isnull().sum()
print("\nNedostajuće vrijednosti po kolonama:")
print(missing_per_column)
missing_share_pct = missing_per_column.sum() / df.size * 100
print(f"\nPostotak nedostajućih vrijednosti: {missing_share_pct:.2f}%")

In [None]:

# Distribution of the target variable: histogram (left) and box plot (right).
# Both panels are drawn from the same NaN-free series so they describe the
# same observations. Passing NaN to hist() is also version-dependent (it can
# warn or fail on older matplotlib/NumPy), so the values are dropped up front.
life_exp_values = df['Life expectancy '].dropna()

fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 5))

ax_hist.hist(life_exp_values, bins=30, edgecolor='black', alpha=0.7)
ax_hist.set_xlabel('Life Expectancy (years)')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Distribution of Life Expectancy')

ax_box.boxplot(life_exp_values)
ax_box.set_ylabel('Life Expectancy (years)')
ax_box.set_title('Box Plot of Life Expectancy')

fig.tight_layout()
plt.show()

# Correlation of every numeric column with the target, followed by a heatmap
# of the features most strongly related to it.
TOP_N_FEATURES = 16  # how many columns (target included) to print and plot

numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
correlation_matrix = df[numeric_cols].corr()

# Rank by |r|, not by signed r: a plain descending sort followed by head()
# would silently drop the strongest *negative* correlations, even though the
# output is labelled "strongest". The signed values are kept for display.
target_corr = correlation_matrix['Life expectancy ']
strength_order = target_corr.abs().sort_values(ascending=False).index
life_exp_corr = target_corr.reindex(strength_order)
print("\nNajjače korelacije s Life Expectancy:")
print(life_exp_corr.head(TOP_N_FEATURES))

top_features = life_exp_corr.head(TOP_N_FEATURES).index

# Create the figure right before drawing so the size clearly belongs to the
# heatmap.
plt.figure(figsize=(16, 12))
sns.heatmap(df[top_features].corr(), annot=True, fmt='.2f', cmap='coolwarm', center=0)
plt.title('Correlation Matrix - Top Features')
plt.tight_layout()
plt.show()

In [None]:

# Preprocessing: drop identifier columns, encode Status, remove sparse columns,
# impute the remaining gaps and split into features X and target y.
# df_clean is rebuilt from df each time, so re-running this cell is idempotent.
df_clean = df.drop(['Country', 'Year'], axis=1, errors='ignore')

# Binary-encode development status. Labels other than the two listed map to
# NaN and are then handled by the imputation step below.
df_clean['Status'] = df_clean['Status'].map({'Developed': 1, 'Developing': 0})

# Drop columns whose share of missing values exceeds the threshold.
threshold = 0.4
missing_percentages = df_clean.isnull().sum() / len(df_clean)
cols_to_drop = missing_percentages[missing_percentages > threshold].index.tolist()
# The percentage in the message is derived from `threshold` so the two cannot
# drift apart (0.4 renders as "40%").
print(f"Kolone s > {threshold:.0%} nedostajućih vrijednosti (izbačene): {cols_to_drop}")
df_clean = df_clean.drop(cols_to_drop, axis=1)

# Median-impute every column that still has gaps. The result is assigned back
# instead of calling fillna(inplace=True) on df_clean[col]: that chained
# in-place form raises a FutureWarning on pandas 2.x and does not modify
# df_clean at all under Copy-on-Write.
# NOTE(review): this also imputes the target column and uses medians computed
# on the full dataset before any train/test split -- confirm this is intended.
for col in df_clean.columns:
    if df_clean[col].isnull().any():
        df_clean[col] = df_clean[col].fillna(df_clean[col].median())

print(f"\nPreostale nedostajuće vrijednosti: {df_clean.isnull().sum().sum()}")

# Separate predictors from the target.
X = df_clean.drop('Life expectancy ', axis=1)
y = df_clean['Life expectancy ']

print(f"\nBroj značajki: {X.shape[1]}")
print(f"Značajke: {list(X.columns)}")