In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load the dataset
refugee_df = pd.read_csv('Resources/demographic_ml_df.csv', dtype={'country_origin': 'category', 'country_asylum': 'category'})

# Select rows for countries that report demographic data
r_df = refugee_df.loc[(refugee_df['Male total'] != 0) & (refugee_df['Female total'] != 0)].copy()
r_df.drop(['Male total', 'Unnamed: 0', 'Female total'], axis=1, inplace=True)
r_df.rename(columns={'total': 'total_refugees'}, inplace=True)

# Define the features and target variable
X = r_df.drop(['total_refugees'], axis=1)
y = r_df['total_refugees']

# Define the columns to encode
cat_features = ['country_origin', 'country_asylum']

# Create a ColumnTransformer object to apply the encoder
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), cat_features)
    ])

# Apply the preprocessor to the data
X_processed = preprocessor.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.3, random_state=np.random.RandomState(42))

# Define the parameter grid to search over
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 4, 6]
}

# Create the GridSearchCV object
rf_reg = RandomForestRegressor()
grid_search = GridSearchCV(estimator=rf_reg, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and the best MSE score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best hyperparameters:", best_params)
print("Best MSE score:", -best_score)

# Create a random forest regressor with the best hyperparameters
rf_reg = RandomForestRegressor(n_estimators=best_params['n_estimators'], 
                               max_depth=best_params['max_depth'], 
                               min_samples_split=best_params['min_samples_split'],
                               n_jobs=-1)

# Train the regressor on the training data
rf_reg.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = rf_reg.predict(X_test)

# Evaluate the performance of the model using the mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Evaluate the performance of the model using R-squared
r2 = r2_score(y_test, y_pred)
print("R-squared:", r2)

Best hyperparameters: {'max_depth': 30, 'min_samples_split': 6, 'n_estimators': 300}
Best MSE score: 3507003676.6075525
Mean Squared Error: 3341032910.648945
R-squared: 0.6848456916470325


In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

# Define the models and their names
models = [
    ('Linear Regression', LinearRegression()),
    ('Support Vector Regression', SVR()),
    ('Decision Tree Regression', DecisionTreeRegressor(random_state=42)),
    ('K-Nearest Neighbors Regression', KNeighborsRegressor()),
    ('Random Forest Regression', RandomForestRegressor(n_estimators=best_params['n_estimators'],
                                                        max_depth=best_params['max_depth'],
                                                        min_samples_split=best_params['min_samples_split'],
                                                        n_jobs=-1,
                                                        random_state=42)),
    ('Random Forest Regressor', RandomForestRegressor(n_estimators=best_params['n_estimators'], 
                                                        max_depth=best_params['max_depth'], 
                                                        min_samples_split=best_params['min_samples_split'],
                                                        n_jobs=-1))
]

# Train and evaluate each model
for name, model in models:
    # Fit the model to the training data
    model.fit(X_train, y_train)

    # Make predictions on the testing data
    y_pred = model.predict(X_test)

    # Calculate the evaluation metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"{name}:")
    print(f"  Mean Squared Error: {mse}")
    print(f"  Root Mean Squared Error: {rmse}")
    print(f"  Mean Absolute Error: {mae}")
    print(f"  R-squared: {r2}\n")

Linear Regression:
  Mean Squared Error: 8843359266.87819
  Root Mean Squared Error: 94039.13688926642
  Mean Absolute Error: 24330.542127939378
  R-squared: 0.16582001799902513

Support Vector Regression:
  Mean Squared Error: 10785823570.695892
  Root Mean Squared Error: 103854.81967966577
  Mean Absolute Error: 13652.85549039879
  R-squared: -0.017409543200086475

Decision Tree Regression:
  Mean Squared Error: 3272402569.4109035
  Root Mean Squared Error: 57204.9173534138
  Mean Absolute Error: 7746.986950326762
  R-squared: 0.6913194823289392

K-Nearest Neighbors Regression:
  Mean Squared Error: 3711193366.956696
  Root Mean Squared Error: 60919.56473052558
  Mean Absolute Error: 8580.764855072464
  R-squared: 0.6499290458949172

Random Forest Regression:
  Mean Squared Error: 3311646351.7604966
  Root Mean Squared Error: 57546.905666251914
  Mean Absolute Error: 8883.265671463692
  R-squared: 0.6876176789003887

Random Forest Regressor:
  Mean Squared Error: 3239317870.0185237
 