# **Creating Modeling**

## **Setting Up the Modeling Notebook**

### **Loading Libraries**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

### **Loading the Feature-Engineered Dataset**

In [2]:
df = pd.read_csv("/workspaces/Global-Population-Growth-EDA-and-Prediction/data/processed/final_features.csv")

## **Split the Data**

### **Defining Features and Target**

In [3]:
X = df.drop(columns=['Population (2024)'])
y = df['Population (2024)']

### **Train-Test Split**

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_test.to_csv('/workspaces/Global-Population-Growth-EDA-and-Prediction/data/test/X_test.csv', index=False)
y_test.to_csv('/workspaces/Global-Population-Growth-EDA-and-Prediction/data/test/y_test.csv', index=False)

## **Model Development**

### **Baseline Model - Linear Regression**

In [5]:
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

y_pred_train_lr = lr_model.predict(X_train)
y_pred_test_lr = lr_model.predict(X_test)


### **Random Forest Regressor**

In [6]:
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

y_pred_train_rf = rf_model.predict(X_train)
y_pred_test_rf = rf_model.predict(X_test)


## **Model Evaluation**

### **Evaluating Performance**

In [7]:
def evaluate_model(y_train, y_train_pred, y_test, y_test_pred):
    print("Training Performance:")
    print(f"MAE: {mean_absolute_error(y_train, y_train_pred)}")
    print(f"MSE: {mean_squared_error(y_train, y_train_pred)}")
    print(f"R2 Score: {r2_score(y_train, y_train_pred)}\n")

    print("Testing Performance:")
    print(f"MAE: {mean_absolute_error(y_test, y_test_pred)}")
    print(f"MSE: {mean_squared_error(y_test, y_test_pred)}")
    print(f"R2 Score: {r2_score(y_test, y_test_pred)}")

print("Linear Regression Model:")
evaluate_model(y_train, y_pred_train_lr, y_test, y_pred_test_lr)

print("\nRandom Forest Model:")
evaluate_model(y_train, y_pred_train_rf, y_test, y_pred_test_rf)


Linear Regression Model:
Training Performance:
MAE: 4.8592160555637775e-17
MSE: 4.5929187362756227e-33
R2 Score: 1.0

Testing Performance:
MAE: 5.039754817230306e-17
MSE: 5.4850178819922065e-33
R2 Score: 1.0

Random Forest Model:
Training Performance:
MAE: 0.0006760766747345232
MSE: 4.492477621338281e-05
R2 Score: 0.9958033119632693

Testing Performance:
MAE: 0.0010678067327660567
MSE: 1.4632377497146501e-05
R2 Score: 0.9986250042509114


## **Hyperparameter Tuning**

### **Grid Search for Random Forest**

In [8]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

best_rf_model = grid_search.best_estimator_

y_pred_train_best_rf = best_rf_model.predict(X_train)
y_pred_test_best_rf = best_rf_model.predict(X_test)
evaluate_model(y_train, y_pred_train_best_rf, y_test, y_pred_test_best_rf)


Fitting 5 folds for each of 81 candidates, totalling 405 fits


[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.6s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.8s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.5s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total tim

  _data = np.array(data, dtype=dtype, copy=copy,


## **Saving the Best Model**

In [9]:
joblib.dump(best_rf_model, '/workspaces/Global-Population-Growth-EDA-and-Prediction/models/population_growth_model.pkl')

['/workspaces/Global-Population-Growth-EDA-and-Prediction/models/population_growth_model.pkl']