In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

# Azure ML workspace coordinates. These are placeholders and must be replaced
# with real values before the client can reach a workspace.
subscription_id = "YOUR_SUBSCRIPTION_ID"
resource_group = "YOUR_RESOURCE_GROUP"
workspace = "YOUR_WORKSPACE_NAME"

# Build the workspace client; DefaultAzureCredential picks the credential source.
# Keyword arguments make explicit which identifier goes where.
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace,
)

# Printed once the client object has been constructed.
print(f"Povezano na workspace: {workspace}")


# Load the raw life-expectancy dataset from the working directory.
df = pd.read_csv('Life Expectancy Data.csv')

print(f"Dimenzije dataseta: {df.shape}")
print(f"\nPrvih 5 redova:")
# A bare `df.head()` in the middle of a cell is evaluated and discarded, so the
# preview announced above was never shown; print it explicitly.
print(df.head())

# Summary statistics and column dtypes for a first look at the data.
print(df.describe())

print(df.dtypes)

# Per-column count and percentage of missing values; only columns that actually
# have gaps are listed, worst first.
missing_data = df.isnull().sum()
missing_percent = 100 * missing_data / len(df)
missing_table = pd.concat([missing_data, missing_percent], axis=1,
                          keys=['Broj', 'Postotak'])
print(missing_table[missing_table['Broj'] > 0].sort_values('Postotak', ascending=False))







# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): X_imputed and y (and imputer, used below) are created in the
# data-cleaning cell that appears later in this notebook, so that cell has to
# run before this code works on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(
    X_imputed, y, test_size=0.2, random_state=42
)

print(f"\n=== PODJELA PODATAKA ===")
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# Fit the scaler on the training features only, then apply the same
# transformation to the test features. Results are wrapped back into
# DataFrames so column names and row indices are preserved.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index
)

# NOTE(review): the CSVs below contain the unscaled features; X_train_scaled and
# X_test_scaled are not used anywhere in the visible code. Presumably consumers
# apply the saved scaler themselves -- confirm.
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

# Neither DataFrame.to_csv nor joblib.dump creates missing parent directories,
# so make sure the output folder exists before writing into it.
import os
os.makedirs('./data', exist_ok=True)

train_data.to_csv('./data/life_expectancy_train.csv', index=False)
test_data.to_csv('./data/life_expectancy_test.csv', index=False)

# Persist the fitted preprocessing objects next to the data.
import joblib
joblib.dump(scaler, './data/scaler.pkl')
joblib.dump(imputer, './data/imputer.pkl')

print("\n✓ Podaci uspješno spremljeni lokalno")


# Describe the two locally saved CSV files as versioned Azure ML data assets.
train_data_asset = Data(
    name="life-expectancy-train",
    version="v1",
    description="Processed training data for life expectancy prediction",
    path="./data/life_expectancy_train.csv",
    type=AssetTypes.URI_FILE,
)
test_data_asset = Data(
    name="life-expectancy-test",
    version="v1",
    description="Processed test data for life expectancy prediction",
    path="./data/life_expectancy_test.csv",
    type=AssetTypes.URI_FILE,
)

# Register each asset in turn. Any failure is reported and does not stop the
# notebook, so a problem with one asset does not prevent the other from being
# attempted.
upload_plan = (
    (
        train_data_asset,
        "✓ Training data uploaded to Azure ML",
        "Training data already exists or error: ",
    ),
    (
        test_data_asset,
        "✓ Test data uploaded to Azure ML",
        "Test data already exists or error: ",
    ),
)
for asset, ok_message, failure_prefix in upload_plan:
    try:
        ml_client.data.create_or_update(asset)
        print(ok_message)
    except Exception as e:
        print(f"{failure_prefix}{e}")

# List the features that go into the model, numbered from 1.
feature_names = X_train.columns.tolist()

print("\n=== KONAČNE ZNAČAJKE ZA MODEL ===")
print(f"Broj značajki: {len(X_train.columns)}")
print("\nPopis značajki:")
for i, col in enumerate(feature_names, 1):
    print(f"{i:2d}. {col}")

# Record how the dataset was prepared so downstream steps can reproduce it.
# NOTE(review): missing_threshold is assigned in the data-cleaning cell later in
# this notebook; that cell has to run first on a fresh kernel.
metadata = dict([
    ('feature_names', feature_names),
    ('target_name', 'Life expectancy'),
    ('n_features', len(feature_names)),
    ('n_train_samples', len(X_train)),
    ('n_test_samples', len(X_test)),
    ('missing_threshold', missing_threshold),
    ('test_size', 0.2),
    ('random_state', 42),
])

import json
with open('./data/metadata.json', 'w') as metadata_file:
    metadata_file.write(json.dumps(metadata, indent=4))

print("\n✓ Metadata spremljena")

# Closing banner framed by two 60-character rules.
banner_rule = "=" * 60
print("\n" + banner_rule)
print("PRIPREMA PODATAKA ZAVRŠENA")
print(banner_rule)

ModuleNotFoundError: No module named 'seaborn'

In [None]:
# Distribution of the target variable: histogram and box plot side by side.
# The raw column name carries a trailing space, hence 'Life expectancy '.
life_expectancy = df['Life expectancy '].dropna()

fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 5))

ax_hist.hist(life_expectancy, bins=50, edgecolor='black', alpha=0.7)
ax_hist.set_xlabel('Životni vijek (godine)')
ax_hist.set_ylabel('Frekvencija')
ax_hist.set_title('Distribucija Životnog Vijeka')
ax_hist.grid(True, alpha=0.3)

ax_box.boxplot(life_expectancy, vert=True)
ax_box.set_ylabel('Životni vijek (godine)')
ax_box.set_title('Box Plot - Životni Vijek')
ax_box.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig('life_expectancy_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

# Life expectancy grouped by country status.
# DataFrame.boxplot(by=...) opens its own figure when no axes are supplied, so
# the earlier plt.figure(...) call left an empty extra figure behind. Creating
# the axes up front and passing them in keeps everything on a single figure.
fig, ax = plt.subplots(figsize=(10, 6))
df.boxplot(column='Life expectancy ', by='Status', ax=ax)
fig.suptitle('')  # drop the automatic "grouped by" super-title added by pandas
ax.set_title('Životni Vijek prema Statusu')
ax.set_xlabel('Status Zemlje')
ax.set_ylabel('Životni Vijek (godine)')
fig.savefig('life_expectancy_by_status.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Pairwise correlations between all numeric columns of the raw frame.
numeric_cols = df.select_dtypes(include=[np.number]).columns.to_list()
correlation_matrix = df[numeric_cols].corr()

# Rank columns by absolute correlation with the target (raw column name has a
# trailing space) and show the 15 strongest entries.
life_exp_corr = (
    correlation_matrix['Life expectancy ']
    .abs()
    .sort_values(ascending=False)
)
print(life_exp_corr.head(15))

# Heatmap over the 16 top-ranked columns; the target is among them because its
# correlation with itself is 1.0.
top_features = life_exp_corr.head(16).index
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    df[top_features].corr(),
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    ax=ax,
)
ax.set_title('Korelacijska Matrica')
fig.tight_layout()
fig.savefig('correlation_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:

# Build the cleaned frame from the raw one in a single chain: drop the country
# identifier, encode Status as 1/0, and strip stray whitespace from column names.
df_clean = (
    df.drop(columns=['Country'])
    .assign(Status=lambda frame: frame['Status'].map({'Developed': 1, 'Developing': 0}))
    .rename(columns=str.strip)
)

# Keep only columns whose share of missing values is below the threshold (in percent).
missing_threshold = 40
null_counts = df_clean.isnull().sum()
missing_pct = 100 * null_counts / len(df_clean)
columns_to_keep = missing_pct.loc[missing_pct < missing_threshold].index
df_clean = df_clean.loc[:, columns_to_keep]

print(f"\nZadržano {len(columns_to_keep)} od {len(df.columns)} stupaca")
print(f"Uklonjeni stupci s >{missing_threshold}% nedostajućih podataka:")
print(missing_pct[missing_pct >= missing_threshold].sort_values(ascending=False))

# Rows without a target value cannot be used for supervised learning.
df_clean = df_clean[df_clean['Life expectancy'].notna()]

print(f"\nDimenzije nakon čišćenja: {df_clean.shape}")


# Separate the target from the features.
target_column = 'Life expectancy'
y = df_clean[target_column]
X = df_clean.drop(columns=[target_column])

# Fill remaining gaps in the features with each column's median, keeping the
# original column names and row index.
# NOTE(review): the imputer is fit on the full feature matrix here, before any
# train/test split visible in this cell -- confirm that is intended.
imputer = SimpleImputer(strategy='median')
imputed_values = imputer.fit_transform(X)
X_imputed = pd.DataFrame(imputed_values, columns=X.columns, index=X.index)


# Remaining missing values in the features and in the target.
print(f"X: {X_imputed.isnull().sum().sum()}")
print(f"y: {y.isnull().sum()}")
