# Importing libraries and classes

In [5]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor as SKLearnRandomForestRegressor
from sklearn.tree import DecisionTreeRegressor as SKLearnDecisionTreeRegressor

from DecisionTreeRegressor import DecisionTreeRegressor
from RandomForestRegressor import RandomForestRegressor
from LinearRegression import LogisticRegression

# Reading and splitting the data

In [6]:
df = pd.read_csv('./NSO_Population_Sex_dataset/NSO_POPULATION_DATA_CLEANED.csv')

feature_cols = ['District', 'Sex', 'Year', 'Population_Growth_Rate','Average_Population']
X = pd.get_dummies(df[feature_cols], columns=['District', 'Sex'])
y = df["Population"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# SKLearn Decision Trees

In [7]:
SKLearn_Decision_Tree_Regressor = SKLearnDecisionTreeRegressor(max_depth=100, min_samples_split=2, min_samples_leaf=5)

SKLearn_Decision_Tree_Regressor.fit(X_train, y_train)

SKLearn_Decision_Tree_Regressor_Prediction = SKLearn_Decision_Tree_Regressor.predict(X_test)

SKLearn_Decision_Tree_Regressor_Prediction_MAE = mean_absolute_error(y_test, SKLearn_Decision_Tree_Regressor_Prediction)
SKLearn_Decision_Tree_Regressor_Prediction_MSE = mean_squared_error(y_test, SKLearn_Decision_Tree_Regressor_Prediction)

print(f"SKLearn Decision Tree Regressor Mean Absolute Error: {SKLearn_Decision_Tree_Regressor_Prediction_MAE}")
print(f"SKLearn Decision Tree Regressor Mean Squared Error: {SKLearn_Decision_Tree_Regressor_Prediction_MSE}")

SKLearn_Decision_Tree_Regressor_Results_DF = pd.DataFrame({'Actual': y_test, 'Predicted': SKLearn_Decision_Tree_Regressor_Prediction})
print(f"\nSKLearn Decision Tree Predictions:\n{SKLearn_Decision_Tree_Regressor_Results_DF}")

SKLearn Decision Tree Regressor Mean Absolute Error: 0.015074905544077323
SKLearn Decision Tree Regressor Mean Squared Error: 0.0007723816286669939

SKLearn Decision Tree Predictions:
       Actual  Predicted
25   0.331167   0.333404
91   0.244705   0.247635
29   0.345026   0.357097
132  0.209815   0.216194
111  0.222412   0.204437
57   0.687152   0.687170
152  0.237668   0.247635
10   0.333920   0.333404
186  0.008192   0.007939
118  0.188260   0.186869
100  0.180819   0.186869
179  0.002243   0.001880
147  0.199997   0.201668
28   0.340245   0.357097
110  0.220062   0.202423
153  0.254724   0.296744
172  0.014773   0.020779
183  0.004230   0.007939
120  0.192624   0.186869
35   0.589261   0.592101
51   0.608144   0.613334
81   0.196801   0.187675
94   0.267079   0.285572
75   0.273888   0.247635
169  0.008958   0.007939
159  0.397456   0.285572
15   0.378399   0.357097
79   0.325540   0.285572
59   0.752918   0.687170
142  0.365064   0.346216
129  0.185560   0.187675
32   0.585246   

# Custom Decision Trees

In [8]:
Custom_Decision_Tree_Regressor = DecisionTreeRegressor(max_depth=100, min_samples_split=2)

Custom_Decision_Tree_Regressor.fit(X_train.values, y_train.values)

Custom_Decision_Tree_Regressor_Prediction = Custom_Decision_Tree_Regressor.predict(X_test.values)

Custom_Decision_Tree_Regressor_Prediction_MAE = mean_absolute_error(y_test, Custom_Decision_Tree_Regressor_Prediction)
Custom_Decision_Tree_Regressor_Prediction_MSE = mean_squared_error(y_test, Custom_Decision_Tree_Regressor_Prediction)

print(f"Custom Decision Tree Regressor Mean Absolute Error: {Custom_Decision_Tree_Regressor_Prediction_MAE}")
print(f"Custom Decision Tree Regressor Mean Squared Error: {Custom_Decision_Tree_Regressor_Prediction_MSE}")

Custom_Decision_Tree_Regressor_Results_DF = pd.DataFrame({'Actual': y_test, 'Predicted': Custom_Decision_Tree_Regressor_Prediction})
print(f"\nCustom Decision Tree Predictions:\n{Custom_Decision_Tree_Regressor_Results_DF}")

Custom Decision Tree Regressor Mean Absolute Error: 0.010683617202465998
Custom Decision Tree Regressor Mean Squared Error: 0.00030303990767434607

Custom Decision Tree Predictions:
       Actual  Predicted
25   0.331167   0.330334
91   0.244705   0.250547
29   0.345026   0.357865
132  0.209815   0.208270
111  0.222412   0.212111
57   0.687152   0.689139
152  0.237668   0.251796
10   0.333920   0.333557
186  0.008192   0.010368
118  0.188260   0.182283
100  0.180819   0.190878
179  0.002243   0.003344
147  0.199997   0.201193
28   0.340245   0.344811
110  0.220062   0.211856
153  0.254724   0.273432
172  0.014773   0.014115
183  0.004230   0.004593
120  0.192624   0.186957
35   0.589261   0.591961
51   0.608144   0.611689
81   0.196801   0.198278
94   0.267079   0.268852
75   0.273888   0.250547
169  0.008958   0.007279
159  0.397456   0.364016
15   0.378399   0.351231
79   0.325540   0.268852
59   0.752918   0.797599
142  0.365064   0.324062
129  0.185560   0.183734
32   0.585246   0.

# Random Forest Regressor using SKLearn
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

In [9]:
SKLearn_Random_Forest_Regressor = SKLearnRandomForestRegressor() # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

SKLearn_Random_Forest_Regressor.fit(X_train, y_train)

SKLearn_Random_Forest_Regressor_Prediction = SKLearn_Random_Forest_Regressor.predict(X_test)

SKLearn_Random_Forest_Regressor_Prediction_MAE = mean_absolute_error(y_test, SKLearn_Random_Forest_Regressor_Prediction)
SKLearn_Random_Forest_Regressor_Prediction_MSE = mean_squared_error(y_test, SKLearn_Random_Forest_Regressor_Prediction)

print(f"SKLearn Random Forest Regressor Mean Absolute Error: {SKLearn_Random_Forest_Regressor_Prediction_MAE}")
print(f"SKLearn Random Forest Regressor Mean Squared Error: {SKLearn_Random_Forest_Regressor_Prediction_MSE}")

SKLearn_Random_Forest_Regressor_Results_DF = pd.DataFrame({'Actual': y_test, 'Predicted': SKLearn_Random_Forest_Regressor_Prediction})
print(f"\nSKLearn Random Forest Predictions:\n{SKLearn_Random_Forest_Regressor_Results_DF}")


SKLearn Random Forest Regressor Mean Absolute Error: 0.011561182088438005
SKLearn Random Forest Regressor Mean Squared Error: 0.00041511734407771045

SKLearn Random Forest Predictions:
       Actual  Predicted
25   0.331167   0.330517
91   0.244705   0.234708
29   0.345026   0.350541
132  0.209815   0.222417
111  0.222412   0.217856
57   0.687152   0.684622
152  0.237668   0.249920
10   0.333920   0.332081
186  0.008192   0.007307
118  0.188260   0.184959
100  0.180819   0.183844
179  0.002243   0.002687
147  0.199997   0.215170
28   0.340245   0.341379
110  0.220062   0.231648
153  0.254724   0.278995
172  0.014773   0.013088
183  0.004230   0.004562
120  0.192624   0.188577
35   0.589261   0.592455
51   0.608144   0.606721
81   0.196801   0.195109
94   0.267079   0.257479
75   0.273888   0.252843
169  0.008958   0.007401
159  0.397456   0.314309
15   0.378399   0.354706
79   0.325540   0.271607
59   0.752918   0.725123
142  0.365064   0.322269
129  0.185560   0.188776
32   0.585246  

# Custom Random Forest Regressor using SKLearn Decision Trees

In [10]:
SKLearn_Decision_Trees_Random_Forest_Regressor = RandomForestRegressor(n_estimators=100, max_depth=None, custom=False)
SKLearn_Decision_Trees_Random_Forest_Regressor.fit(X_train, y_train)

SKLearn_Decision_Trees_Random_Forest_Regressor_Prediction = SKLearn_Decision_Trees_Random_Forest_Regressor.predict(X_test)

SKLearn_Decision_Trees_Random_Forest_Regressor_MAE = mean_absolute_error(y_test, SKLearn_Decision_Trees_Random_Forest_Regressor_Prediction)
SKLearn_Decision_Trees_Random_Forest_Regressor_MSE = mean_squared_error(y_test, SKLearn_Decision_Trees_Random_Forest_Regressor_Prediction)

print(f"Custom Random Forest Regressor using SKLearn Decision Trees Mean Absolute Error: {SKLearn_Decision_Trees_Random_Forest_Regressor_MAE}")
print(f"Custom Random Forest Regressor using SKLearn Decision Trees Mean Squared Error: {SKLearn_Decision_Trees_Random_Forest_Regressor_MSE}")

SKLearn_Decision_Trees_Random_Forest_Regressor_Results_DF = pd.DataFrame({'Actual': y_test, 'Predicted': SKLearn_Decision_Trees_Random_Forest_Regressor_Prediction})
print(f"\nCustom Random Forest Regressor using SKLearn Decision Trees Predictions:\n{SKLearn_Decision_Trees_Random_Forest_Regressor_Results_DF}")

Custom Random Forest Regressor using SKLearn Decision Trees Mean Absolute Error: 0.011887324254066571
Custom Random Forest Regressor using SKLearn Decision Trees Mean Squared Error: 0.00045178211931527507

Custom Random Forest Regressor using SKLearn Decision Trees Predictions:
       Actual  Predicted
25   0.331167   0.330640
91   0.244705   0.228680
29   0.345026   0.350439
132  0.209815   0.217807
111  0.222412   0.218112
57   0.687152   0.678699
152  0.237668   0.245143
10   0.333920   0.332551
186  0.008192   0.007542
118  0.188260   0.185510
100  0.180819   0.184094
179  0.002243   0.002706
147  0.199997   0.212972
28   0.340245   0.341412
110  0.220062   0.228633
153  0.254724   0.278856
172  0.014773   0.013264
183  0.004230   0.004667
120  0.192624   0.189242
35   0.589261   0.592773
51   0.608144   0.607381
81   0.196801   0.193776
94   0.267079   0.260507
75   0.273888   0.251818
169  0.008958   0.007623
159  0.397456   0.311696
15   0.378399   0.353515
79   0.325540   0.271

# Custom Random Forest Regressor

In [11]:
Custom_Random_Forest_Regressor = RandomForestRegressor(n_estimators=100, max_depth=100)
Custom_Random_Forest_Regressor.fit(X_train, y_train)

Custom_Random_Forest_Regressor_Prediction = Custom_Random_Forest_Regressor.predict(X_test)

Custom_Random_Forest_Regressor_MAE = mean_absolute_error(y_test, Custom_Random_Forest_Regressor_Prediction)
Custom_Random_Forest_Regressor_MSE = mean_squared_error(y_test, Custom_Random_Forest_Regressor_Prediction)

print(f"Custom Random Forest Regressor Mean Absolute Error: {Custom_Random_Forest_Regressor_MAE}")
print(f"Custom Random Forest Regressor Mean Squared Error: {Custom_Random_Forest_Regressor_MSE}")

Custom_Random_Forest_Regressor_Results_DF = pd.DataFrame({'Actual': y_test, 'Predicted': Custom_Random_Forest_Regressor_Prediction})
print(f"\nCustom Random Forest Regressor Predictions:\n{Custom_Random_Forest_Regressor_Results_DF}")

Custom Random Forest Regressor Mean Absolute Error: 0.01342030365922069
Custom Random Forest Regressor Mean Squared Error: 0.0005364967934451953

Custom Random Forest Regressor Predictions:
       Actual  Predicted
25   0.331167   0.332674
91   0.244705   0.238875
29   0.345026   0.356216
132  0.209815   0.219975
111  0.222412   0.226139
57   0.687152   0.708387
152  0.237668   0.248525
10   0.333920   0.335184
186  0.008192   0.010790
118  0.188260   0.186035
100  0.180819   0.185931
179  0.002243   0.002964
147  0.199997   0.217174
28   0.340245   0.348003
110  0.220062   0.239627
153  0.254724   0.271072
172  0.014773   0.016647
183  0.004230   0.005244
120  0.192624   0.191030
35   0.589261   0.596983
51   0.608144   0.610822
81   0.196801   0.193985
94   0.267079   0.276404
75   0.273888   0.263171
169  0.008958   0.008484
159  0.397456   0.314122
15   0.378399   0.355661
79   0.325540   0.275057
59   0.752918   0.812312
142  0.365064   0.339967
129  0.185560   0.190542
32   0.585

# Custom Linear Regression

In [12]:
# Normalisation of data 

X_train_normalized = (X_train - np.mean(X_train, axis=0)) / np.std(X_train, axis=0)
X_test_normalized = (X_test - np.mean(X_train, axis=0)) / np.std(X_train, axis=0)

In [13]:
Custom_Linear_Regression = LogisticRegression(learning_rate=0.01, num_iterations=1000)
Custom_Linear_Regression.fit(X_train_normalized, y_train)

Custom_Linear_Regression_Prediction = Custom_Linear_Regression.predict(X_test_normalized)

Custom_Linear_Regression_MAE = mean_absolute_error(y_test, Custom_Linear_Regression_Prediction)
Custom_Linear_Regression_MSE = mean_squared_error(y_test, Custom_Linear_Regression_Prediction)

print(f"Custom Linear Regression Mean Absolute Error: {Custom_Linear_Regression_MAE}")
print(f"Custom Linear Regression Mean Squared Error: {Custom_Linear_Regression_MSE}")


Custom_Linear_Regression_Results_DF = pd.DataFrame({'Actual': y_test, 'Predicted': Custom_Linear_Regression_Prediction})
print(f"\nCustom Linear Regression Predictions:\n{Custom_Linear_Regression_Results_DF}")

Custom Linear Regression Mean Absolute Error: 0.056078718428310546
Custom Linear Regression Mean Squared Error: 0.004059016874168292

Custom Linear Regression Predictions:
       Actual  Predicted
25   0.331167   0.374329
91   0.244705   0.289954
29   0.345026   0.413777
132  0.209815   0.248640
111  0.222412   0.281818
57   0.687152   0.752435
152  0.237668   0.288227
10   0.333920   0.392354
186  0.008192   0.090457
118  0.188260   0.206585
100  0.180819   0.198774
179  0.002243   0.059506
147  0.199997   0.247434
28   0.340245   0.403794
110  0.220062   0.272658
153  0.254724   0.296223
172  0.014773   0.094066
183  0.004230   0.081948
120  0.192624   0.220560
35   0.589261   0.711462
51   0.608144   0.704173
81   0.196801   0.212137
94   0.267079   0.315896
75   0.273888   0.297043
169  0.008958   0.084341
159  0.397456   0.352852
15   0.378399   0.443305
79   0.325540   0.333490
59   0.752918   0.767772
142  0.365064   0.333629
129  0.185560   0.226270
32   0.585246   0.685303
38 