In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor  # Regressor for continuous target
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

# Read the dataset and trim stray whitespace from every header name
df = pd.read_csv('Life Expectancy Data.csv').rename(columns=str.strip)

# Quick overview: dimensions first, then per-column dtypes and null counts
print(f"Data shape: {df.shape}")
for heading, summary in (
    ("Data types:", df.dtypes),
    ("Missing values:", df.isnull().sum()),
):
    print(f"\n{heading}")
    print(summary)

Data shape: (2938, 22)

Data types:
Country                             object
Year                                 int64
Status                              object
Life expectancy                    float64
Adult Mortality                    float64
infant deaths                        int64
Alcohol                            float64
percentage expenditure             float64
Hepatitis B                        float64
Measles                              int64
BMI                                float64
under-five deaths                    int64
Polio                              float64
Total expenditure                  float64
Diphtheria                         float64
HIV/AIDS                           float64
GDP                                float64
Population                         float64
thinness  1-19 years               float64
thinness 5-9 years                 float64
Income composition of resources    float64
Schooling                          float64
dtype: object

Mis

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Load data and trim stray whitespace from the header names
df = pd.read_csv('Life Expectancy Data.csv')
df.columns = df.columns.str.strip()

# Separate target
target = 'Life expectancy'
y = df[target]
X = df.drop(target, axis=1)

# ============================================
# MINIMAL PROCESSING (Required for model to run)
# ============================================

# Drop rows where target is missing (a model cannot be fitted or scored on them)
mask = y.notna()
X = X[mask]
y = y[mask]

# Handle categorical columns (required - model can't use text).
# The baseline simply drops the two text columns. X_minimal keeps its NaNs;
# only the split frames below are imputed.
X_minimal = X.drop(['Country', 'Status'], axis=1)

# Split BEFORE imputing, so that no statistic used to fill the training data
# is computed from rows that end up in the test set (data leakage).
X_train, X_test, y_train, y_test = train_test_split(
    X_minimal, y, test_size=0.2, random_state=42
)

# Fill missing values with the TRAINING medians (required - model can't use NaN).
# The same training medians are applied to the test rows.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# ============================================
# EXPERIMENT 1: Without Extra Preprocessing
# ============================================
# Baseline: a random forest fitted directly on the minimally processed split
model_raw = RandomForestRegressor(random_state=42).fit(X_train, y_train)
y_pred_raw = model_raw.predict(X_test)

# Held-out metrics: RMSE is the square root of the mean squared error
mse_raw = mean_squared_error(y_test, y_pred_raw)
rmse_raw = np.sqrt(mse_raw)
r2_raw = r2_score(y_test, y_pred_raw)

print(
    "WITHOUT extra preprocessing:",
    f"  R² Score: {r2_raw:.4f}",
    f"  RMSE:     {rmse_raw:.4f}",
    sep="\n",
)

# ============================================
# EXPERIMENT 2: With Full Preprocessing
# ============================================
X_processed = X.copy()

# Encode categorical columns instead of dropping.
# Mapping labels to integers uses no statistics of the data, so doing it
# before the split does not leak information from the test rows.
le_status = LabelEncoder()
X_processed['Status'] = le_status.fit_transform(X_processed['Status'])

# Drop Country (too many unique values to encode usefully here)
X_processed = X_processed.drop('Country', axis=1)

# Split first, using the same random state and test size as Experiment 1,
# so the imputer and scaler below never see the test rows while fitting.
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
    X_processed, y, test_size=0.2, random_state=42
)

# Fill missing values: medians are learned from the training rows only
imputer = SimpleImputer(strategy='median')
X_train_p = imputer.fit_transform(X_train_p)
X_test_p = imputer.transform(X_test_p)

# Scale features: mean/std are learned from the training rows only
scaler = StandardScaler()
X_train_p = scaler.fit_transform(X_train_p)
X_test_p = scaler.transform(X_test_p)

model_processed = RandomForestRegressor(random_state=42)
model_processed.fit(X_train_p, y_train_p)
y_pred_processed = model_processed.predict(X_test_p)

r2_processed = r2_score(y_test_p, y_pred_processed)
rmse_processed = np.sqrt(mean_squared_error(y_test_p, y_pred_processed))
print("\nWITH full preprocessing:")
print(f"  R² Score: {r2_processed:.4f}")
print(f"  RMSE:     {rmse_processed:.4f}")

# ============================================
# COMPARISON
# ============================================
# Both deltas are oriented so that a positive number favours the
# fully preprocessed model (higher R², lower RMSE).
r2_gain = r2_processed - r2_raw
rmse_gain = rmse_raw - rmse_processed

banner = "=" * 50
print(f"\n{banner}")
print("COMPARISON SUMMARY")
print(banner)
print(f"R² improvement:   {r2_gain:.4f}")
print(f"RMSE improvement: {rmse_gain:.4f}")

ValueError: could not convert string to float: 'Cabo Verde'

In [4]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Work on copies so the frames produced by the train/test split stay untouched
X_train_processed = X_train.copy()
X_test_processed = X_test.copy()

# Impute with column means learned on the training rows, then reuse them on test
imputer = SimpleImputer(strategy='mean')
X_train_processed = imputer.fit_transform(X_train_processed)
X_test_processed = imputer.transform(X_test_processed)

# Standardise with mean/std learned on the training rows, then reuse them on test
scaler = StandardScaler()
X_train_processed = scaler.fit_transform(X_train_processed)
X_test_processed = scaler.transform(X_test_processed)

# The target ('Life expectancy') is a continuous float column, so this must be
# a regressor. RandomForestRegressor and r2_score come from the notebook's
# import cell; the classifier and accuracy_score used here previously are not
# imported in the cells visible in this notebook.
model_processed = RandomForestRegressor(random_state=42)
model_processed.fit(X_train_processed, y_train)

# Score on the held-out rows with R², the regression counterpart of accuracy
y_pred_processed = model_processed.predict(X_test_processed)
r2_with_preprocessing = r2_score(y_test, y_pred_processed)
# Legacy alias: later cells may still read `accuracy_processed`; it holds R².
accuracy_processed = r2_with_preprocessing
print(f"R² WITH preprocessing: {r2_with_preprocessing:.4f}")

NameError: name 'X_train' is not defined

In [None]:

# Summarise the two regression experiments by their held-out R² scores.
# r2_raw and r2_processed are produced by the experiment cell; the accuracy
# variables read here previously are not defined in the cells visible in this
# notebook, and accuracy is not a regression metric.
print("\n" + "="*50)
print("EXPERIMENT RESULTS")
print("="*50)
print(f"Without Preprocessing R²: {r2_raw:.4f}")
print(f"With Preprocessing R²:    {r2_processed:.4f}")
print(f"Improvement:              {r2_processed - r2_raw:.4f}")

['Country', 'Year', 'Status', 'Life expectancy ', 'Adult Mortality', 'infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B', 'Measles ', ' BMI ', 'under-five deaths ', 'Polio', 'Total expenditure', 'Diphtheria ', ' HIV/AIDS', 'GDP', 'Population', ' thinness  1-19 years', ' thinness 5-9 years', 'Income composition of resources', 'Schooling']
