In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [47]:
data = pd.read_csv("./world_pop_mig_186_countries_scaled.csv")

data.head()

Unnamed: 0,country,year,population,netMigration,population_in_millions,population scaled,netMigration scaled,population_in_millions scaled
0,Afghanistan,2023,42239854,-65846,42,0.029565,0.658143,0.029412
1,Afghanistan,2022,41128771,-65846,41,0.028787,0.658143,0.028711
2,Afghanistan,2021,40099462,-183672,40,0.028067,0.646407,0.028011
3,Afghanistan,2020,38972230,166821,38,0.027278,0.681317,0.026611
4,Afghanistan,2019,37769499,-8082,37,0.026436,0.663896,0.02591


# 3.1

In [58]:
import pandas as pd
import matplotlib.pyplot as plt

# Filter data for the USA
usa_data = data
# Define features and target variable
X = usa_data[['year', 'population scaled']]
y = usa_data['netMigration scaled']

# Split scaled data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# 3.2

# Modelle vergleichen

In [64]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score

models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42, n_estimators=100, max_depth=10),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "Support Vector Regressor": SVR(),
    "KNN Regressor": KNeighborsRegressor(n_neighbors=5),
    "Neural Network": MLPRegressor(random_state=42, max_iter=500)
}

results = []

# evaluate each of the models
for name, model in models.items():
    model.fit(X_train, y_train.values.ravel())
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    score = model.score(X_train, y_train)
    results.append((name, mse, r2, round(score,3)))

results_df = pd.DataFrame(results, columns=["Model", "Mean Squared Error", "R² Score", "Score"])
results_df.sort_values(by="R² Score", ascending=False, inplace=True)

print(results_df)


                      Model  Mean Squared Error  R² Score  Score
1             Random Forest            0.000116  0.442980  0.579
2         Gradient Boosting            0.000124  0.400603  0.507
0         Linear Regression            0.000204  0.016420  0.006
5            Neural Network            0.000248 -0.197035 -0.150
4             KNN Regressor            0.000252 -0.214843  0.136
3  Support Vector Regressor            0.000357 -0.721037 -0.555


Ich habe mich für den Random Forest Algorithmus entschieden, da er mit einem Mean Squared Error (MSE) von 0.000116, einem R²-Wert von 0.442980 und einem Score von 0.579 die besten Ergebnisse liefert. Diese Metriken zeigen, dass Random Forest die Datenstruktur effektiv modelliert und eine hohe Vorhersagegenauigkeit bietet. Alternativen wie Gradient Boosting oder lineare Regression erreichen zwar respektable Werte, schneiden jedoch schlechter ab. Random Forest ist zudem robust gegenüber Überanpassung und eignet sich gut für nicht-lineare Zusammenhänge. Das Modell wurde auf den Trainingsdaten trainiert und evaluiert, um optimale Ergebnisse zu gewährleisten.

# Model berechnen 

In [60]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Initialize the model
model = RandomForestRegressor(random_state=42, n_estimators=100, max_depth=10)

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Display results
print(f"Mean Squared Error (MSE): {mse:.6f}")
print(f"R² Score: {r2:.6f}")




Mean Squared Error (MSE): 0.000116
R² Score: 0.442980


# Save Model 

In [61]:
import joblib

# Save the model
joblib.dump(model, 'UsMigration_random_forest_model.joblib')

best_model = joblib.load('UsMigration_random_forest_model.joblib')

print(best_model)


RandomForestRegressor(max_depth=10, random_state=42)


# 3.3

# Prediction 

In [63]:
# Generate future data for years from 2024 to 2034
future_years = list(range(2024, 2035))  # Includes 2034

# Create population scaled values similar to 0.234430 with slight growth
future_population_scaled = [
    0.235000, 0.236500, 0.238000, 0.239500, 0.241000,
    0.242500, 0.244000, 0.245500, 0.247000, 0.248500, 25.250000
]  # 11 values corresponding to 2024–2034

# Create a DataFrame for future data
future_data = pd.DataFrame({
    "year": future_years,
    "population scaled": future_population_scaled
})

# Use the trained model to predict net migration for future years
future_predictions = best_model.predict(future_data)

# Combine results into a DataFrame
future_results = pd.DataFrame({
    "Year": future_years,
    "Population Scaled": future_population_scaled,
    "Predicted Net Migration Scaled": future_predictions
})

# Display the predictions
future_results





Unnamed: 0,Year,Population Scaled,Predicted Net Migration Scaled
0,2024,0.235,0.749156
1,2025,0.2365,0.749156
2,2026,0.238,0.749156
3,2027,0.2395,0.749156
4,2028,0.241,0.749156
5,2029,0.2425,0.749156
6,2030,0.244,0.749156
7,2031,0.2455,0.749156
8,2032,0.247,0.749156
9,2033,0.2485,0.749156
