In [6]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input


# Read the machine-learning-ready world dataset from disk
csv_path = '/Users/javi/Desktop/Ironhack/Entregas_de_github/Final-Project-at-Ironhack/Data/Cleaned/World_machine_learning_ready.csv'
df = pd.read_csv(csv_path)

# The population column is the regression target; all remaining columns are predictors
target = df['World_Total Population']
features = df.drop(columns=['World_Total Population'])

# Hold out 20% of the rows for testing (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    random_state=42,
    test_size=0.2,
)

# Candidate feature transformers, keyed by their class name
scalers = {
    scaler_cls.__name__: scaler_cls()
    for scaler_cls in (StandardScaler, MinMaxScaler, Normalizer)
}

# Model training and evaluation function
def train_and_evaluate(scaler, X_train, X_test, y_train, y_test):
    """Fit a fixed set of regressors on scaled features and score them on the test set.

    The scaler is fitted on ``X_train`` (``fit_transform`` is called on the
    object passed in, so it is fitted in place) and then applied to ``X_test``.
    Seven models are trained on the scaled training data: linear regression,
    random forest (tuned with a small 3-fold grid search), gradient boosting,
    XGBoost, RBF-kernel SVR, k-nearest neighbours and a two-hidden-layer
    dense neural network.

    Args:
        scaler: Transformer exposing ``fit_transform`` and ``transform``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training target values.
        y_test: Test target values.

    Returns:
        dict mapping each model name to ``{'MSE': ..., 'R2': ...}`` computed
        from the model's predictions on the scaled test features.
    """
    train_matrix = scaler.fit_transform(X_train)
    test_matrix = scaler.transform(X_test)
    n_features = train_matrix.shape[1]

    # Classical estimators first; the network is appended last so that the
    # evaluation order (and therefore the result order) stays the same.
    candidates = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(random_state=42),
        'Gradient Boosting': GradientBoostingRegressor(random_state=42),
        'XGBoost': XGBRegressor(random_state=42),
        'SVR': SVR(kernel='rbf'),
        'KNN': KNeighborsRegressor(),
    }
    candidates['Neural Network'] = Sequential([
        Input(shape=(n_features,)),
        Dense(64, activation='relu'),
        Dense(64, activation='relu'),
        Dense(1, activation='linear'),
    ])

    scores = {}
    for label, estimator in candidates.items():
        if label == 'Neural Network':
            # NOTE(review): the target is fed to the network unscaled; if its
            # magnitude is large, 50 epochs may not be enough to converge - confirm.
            estimator.compile(optimizer='adam', loss='mean_squared_error')
            estimator.fit(
                train_matrix,
                y_train,
                epochs=50,
                batch_size=32,
                validation_split=0.2,
                verbose=0,
            )
            # Keras returns a column of predictions; flatten it to 1-D for the metrics
            predictions = estimator.predict(test_matrix).flatten()
        elif label == 'Random Forest':
            # Small grid search; the best estimator found is used for prediction
            search = GridSearchCV(
                estimator=estimator,
                param_grid={
                    'n_estimators': [50, 100],
                    'max_features': ['sqrt'],
                    'max_depth': [10, None],
                },
                cv=3,
                n_jobs=-1,
                verbose=0,
            )
            search.fit(train_matrix, y_train)
            predictions = search.best_estimator_.predict(test_matrix)
        else:
            estimator.fit(train_matrix, y_train)
            predictions = estimator.predict(test_matrix)

        scores[label] = {
            'MSE': mean_squared_error(y_test, predictions),
            'R2': r2_score(y_test, predictions),
        }

    return scores

# Run the full model comparison once per scaler and print one metrics line per model
for scaler_name in scalers:
    scaler = scalers[scaler_name]
    print(f"\nResults for {scaler_name}:")
    results = train_and_evaluate(scaler, X_train, X_test, y_train, y_test)
    for model_name in results:
        metrics = results[model_name]
        print(f"{model_name} - MSE: {metrics['MSE']:.2f}, R2: {metrics['R2']:.4f}")

# Refit a standalone neural network on MinMax-scaled features for an example plot
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Same 64-64-1 dense architecture that train_and_evaluate uses
nn_layers = [
    Input(shape=(X_train_scaled.shape[1],)),
    Dense(64, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='linear'),
]
model = Sequential(nn_layers)
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(
    X_train_scaled,
    y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    verbose=0,
)
y_pred_nn = model.predict(X_test_scaled).flatten()

# Plot actual and predicted values against their position in the test set
sample_positions = range(len(y_test))
plt.figure(figsize=(7, 6))
plt.plot(sample_positions, y_test, label='Actual Population')
plt.plot(sample_positions, y_pred_nn, linestyle='--', label='Predicted Population (Neural Network)')
plt.title('Neural Network with MinMaxScaler: Actual vs. Predicted Total Population')
plt.xlabel('Index')
plt.ylabel('Total Population')
plt.legend()
plt.show()


Results for StandardScaler:
