In [None]:
# 1) Data Import
from google.colab import files
import io, pandas as pd, numpy as np
# Open the Colab upload widget. The returned mapping is keyed by uploaded
# filename; each value is that file's content, which is wrapped in BytesIO below.
uploaded = files.upload()

# Only the first uploaded file is used; it is parsed as CSV straight from memory.
fname = list(uploaded)[0]
raw_bytes = uploaded[fname]
df = pd.read_csv(io.BytesIO(raw_bytes))

# Quick sanity check: (rows, columns) followed by a preview of the first rows.
print(df.shape)
df.head()

In [None]:
# 2) Data Cleaning
# Remove exact duplicate rows (in place, so later cells keep using `df`).
df.drop_duplicates(inplace=True)

numeric_cols = df.select_dtypes(include=[np.number]).columns

# Impute missing numeric values with the column mean.
# The result is assigned back to the frame rather than calling
# `df[col].fillna(..., inplace=True)`: that form mutates a temporary Series,
# emits a FutureWarning on pandas 2.x, and is a silent no-op when
# Copy-on-Write is enabled, which would leave the NaNs in place.
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].mean())

# Replace values outside the 1.5*IQR fences with the column mean.
# NOTE: the mean is computed before replacement, so it still includes the
# outliers themselves; this matches the original behaviour.
for col in numeric_cols:
    Q1, Q3 = df[col].quantile(0.25), df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    mask = (df[col] < lower) | (df[col] > upper)
    if mask.any():
        col_mean = df[col].mean()
        # The mean is generally non-integral. Writing it into an integer column
        # is an incompatible-dtype setitem (deprecated, later an error), so
        # upcast explicitly first.
        if not pd.api.types.is_float_dtype(df[col]):
            df[col] = df[col].astype(float)
        df.loc[mask, col] = col_mean

df.info()

In [None]:
# 3) EDA - distributions & correlation
import matplotlib.pyplot as plt, seaborn as sns
# Histogram of the target column (note the trailing space in the column name).
fig, ax = plt.subplots()
ax.hist(df['Life expectancy '].dropna(), bins=30)
ax.set_title('Life Expectancy')
plt.show()

# Correlation heatmap over numeric columns only. Any object (string) columns
# still in the frame at this point make a bare `df.corr()` raise ValueError on
# pandas >= 2.0, so the frame is restricted to numeric dtypes explicitly
# (this form also works on older pandas).
numeric_df = df.select_dtypes(include=[np.number])
fig, ax = plt.subplots()
sns.heatmap(numeric_df.corr(), cmap='coolwarm', ax=ax)
plt.show()

In [None]:
# 4) Preprocessing - encode & scale
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Name of the regression target (the trailing space is part of the column name).
target_col = 'Life expectancy '

# Label-encode every object (string) column.
# A separate encoder is kept per column: re-fitting one shared LabelEncoder
# would discard every mapping except the last column's, making the codes
# impossible to inverse-transform later.
cat_cols = df.select_dtypes(include=['object']).columns
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# Standardise the numeric feature columns (this includes the label-encoded
# columns, which are numeric by now). The target is deliberately left out:
# scaling it would put every downstream error metric (e.g. RMSE) in
# standard-deviation units instead of the target's original units.
# NOTE: the scaler is fit on all rows, i.e. before any train/test split, so
# held-out rows contribute to the scaling statistics.
scaler = StandardScaler()
num_cols = df.select_dtypes(include=[np.number]).columns.drop(target_col, errors='ignore')
if len(num_cols):
    df[num_cols] = scaler.fit_transform(df[num_cols])

df.head()

In [None]:
# 5) Model Training - Regression models
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
# Features are every column except the target; the target column name has a
# trailing space.
X = df.drop(columns=['Life expectancy '])
y = df['Life expectancy ']

# 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate regressors, all seeded identically.
models = {
    'RF': RandomForestRegressor(n_estimators=200, random_state=42),
    'ET': ExtraTreesRegressor(n_estimators=200, random_state=42),
    'GB': GradientBoostingRegressor(random_state=42),
    'XGB': XGBRegressor(random_state=42, objective='reg:squarederror'),
}

# Fit each model on the training split and report hold-out RMSE and R2.
for name, model in models.items():
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    # `mean_squared_error(..., squared=False)` was deprecated in scikit-learn
    # 1.4 and removed in 1.6 (TypeError). Taking the square root of the MSE
    # gives the same RMSE on every version.
    rmse = float(np.sqrt(mean_squared_error(y_test, pred)))
    print(name, "RMSE", rmse, "R2", r2_score(y_test, pred))

In [None]:
# 6) Cross Validation on XGBoost
from sklearn.model_selection import cross_val_score
# Unfitted XGBoost regressor with a fixed seed and squared-error objective.
xgb = XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
)

# Score the estimator with 5-fold cross-validation on the full X / y,
# using R2 as the metric; `scores` holds one value per fold.
scores = cross_val_score(
    estimator=xgb,
    X=X,
    y=y,
    scoring='r2',
    cv=5,
)

# Per-fold scores, then their average.
print("CV R2 scores:",scores)
print("Mean R2:",scores.mean())