In [1]:
#Importing needed libraries:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import seaborn as sns
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by pandas and
# scikit-learn -- consider narrowing the filter once the notebook is stable.
warnings.simplefilter("ignore")

# Use seaborn's "whitegrid" style for all plots drawn after this point.
sns.set(style="whitegrid")

In [2]:
# Load table PEA03 as CSV straight from the CSO PxStat REST API.
# The request needs network access every time the cell is run.
PEA03_CSV_URL = 'https://ws.cso.ie/public/api.restful/PxStat.Data.Cube_API.ReadDataset/PEA03/CSV/1.0/en'
df = pd.read_csv(PEA03_CSV_URL)

In [3]:
df

Unnamed: 0,STATISTIC,STATISTIC Label,TLIST(A1),Year,C02076V03371,Age Group,C02199V02655,Sex,C02542V03077,Inward or Outward Flow,UNIT,VALUE
0,PEA03,Estimated Migration (Persons in April),1987,1987,-,All ages,-,Both sexes,1,Net migration,Thousand,-23.0
1,PEA03,Estimated Migration (Persons in April),1987,1987,-,All ages,-,Both sexes,4,Emigrants: All destinations,Thousand,40.2
2,PEA03,Estimated Migration (Persons in April),1987,1987,-,All ages,-,Both sexes,5,Immigrants: All origins,Thousand,17.2
3,PEA03,Estimated Migration (Persons in April),1987,1987,-,All ages,1,Male,1,Net migration,Thousand,
4,PEA03,Estimated Migration (Persons in April),1987,1987,-,All ages,1,Male,4,Emigrants: All destinations,Thousand,
...,...,...,...,...,...,...,...,...,...,...,...,...
1993,PEA03,Estimated Migration (Persons in April),2023,2023,575,65 years and over,1,Male,4,Emigrants: All destinations,Thousand,0.4
1994,PEA03,Estimated Migration (Persons in April),2023,2023,575,65 years and over,1,Male,5,Immigrants: All origins,Thousand,1.5
1995,PEA03,Estimated Migration (Persons in April),2023,2023,575,65 years and over,2,Female,1,Net migration,Thousand,1.8
1996,PEA03,Estimated Migration (Persons in April),2023,2023,575,65 years and over,2,Female,4,Emigrants: All destinations,Thousand,0.6


In [4]:
# Remove the identifier/code columns, the unit column and 'Age Group';
# only Year, Sex, 'Inward or Outward Flow' and VALUE are kept.
df_col_drop = [
    'STATISTIC',
    'STATISTIC Label',
    'TLIST(A1)',
    'C02076V03371',
    'Age Group',
    'C02199V02655',
    'C02542V03077',
    'UNIT',
]
# Reassign instead of mutating in place; the resulting frame is the same.
df = df.drop(columns=df_col_drop)

In [5]:
df

Unnamed: 0,Year,Sex,Inward or Outward Flow,VALUE
0,1987,Both sexes,Net migration,-23.0
1,1987,Both sexes,Emigrants: All destinations,40.2
2,1987,Both sexes,Immigrants: All origins,17.2
3,1987,Male,Net migration,
4,1987,Male,Emigrants: All destinations,
...,...,...,...,...
1993,2023,Male,Emigrants: All destinations,0.4
1994,2023,Male,Immigrants: All origins,1.5
1995,2023,Female,Net migration,1.8
1996,2023,Female,Emigrants: All destinations,0.6


In [6]:
# Keep only rows that are male, total emigration, and from 2002 onwards
# (the same as dropping every row that fails any one of these conditions).
# NOTE(review): 'Age Group' was dropped earlier, so the several rows kept
# per year can no longer be told apart -- confirm this is intended.
is_male = df['Sex'] == 'Male'
is_emigration = df['Inward or Outward Flow'] == 'Emigrants: All destinations'
from_2002 = df['Year'] >= 2002
df = df[is_male & is_emigration & from_2002]

In [7]:
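# # Rescaling VALUE (disabled)
# # NOTE(review): the UNIT column reads 'Thousand', so converting VALUE to
# # persons would be a factor of 1000, not 100000 -- confirm before enabling.
# df['VALUE'] = df['VALUE'] *100000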

In [8]:
df

Unnamed: 0,Year,Sex,Inward or Outward Flow,VALUE
814,2002,Male,Emigrants: All destinations,12.7
823,2002,Male,Emigrants: All destinations,1.2
832,2002,Male,Emigrants: All destinations,9.0
841,2002,Male,Emigrants: All destinations,1.7
850,2002,Male,Emigrants: All destinations,0.0
...,...,...,...,...
1957,2023,Male,Emigrants: All destinations,2.9
1966,2023,Male,Emigrants: All destinations,7.5
1975,2023,Male,Emigrants: All destinations,17.2
1984,2023,Male,Emigrants: All destinations,3.0


In [9]:
# Independent copy of the rows for the year 2002, used for inspection.
df2002 = df.loc[df['Year'].eq(2002)].copy()

In [10]:
df2002

Unnamed: 0,Year,Sex,Inward or Outward Flow,VALUE
814,2002,Male,Emigrants: All destinations,12.7
823,2002,Male,Emigrants: All destinations,1.2
832,2002,Male,Emigrants: All destinations,9.0
841,2002,Male,Emigrants: All destinations,1.7
850,2002,Male,Emigrants: All destinations,0.0
859,2002,Male,Emigrants: All destinations,0.8


In [11]:
# 'Sex' holds a single value ('Male') after the row filter, so it carries
# no information for the model and is removed.
df_col_drop = ['Sex']
# Reassign instead of mutating the filtered frame in place.
df = df.drop(columns=df_col_drop)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 132 entries, 814 to 1993
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Year                    132 non-null    int64  
 1   Inward or Outward Flow  132 non-null    object 
 2   VALUE                   132 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 4.1+ KB


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error


# One-hot encode the categorical columns.
# NOTE(review): the earlier row filter keeps a single value of
# 'Inward or Outward Flow', so its dummy column is constant; the year
# dummies are effectively the only features -- confirm this is intended.
df = pd.get_dummies(df, columns=['Year','Inward or Outward Flow'])

# Split the data into features and target variable
X = df.drop('VALUE', axis=1)
y = df['VALUE']

# Split the data into training and testing sets.
# NOTE(review): test_size=0.80 holds out 80% of the rows and trains on only
# 20% -- confirm this is not meant to be 0.20.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.80, random_state=100)

# Fit the models. The forest is seeded so its score is reproducible across
# runs (it was unseeded before, so every run printed a different RMSE).
rf_model = RandomForestRegressor(random_state=100)
lr_model = LinearRegression()
ridge_model = Ridge()

rf_model.fit(X_train, y_train)
lr_model.fit(X_train, y_train)
ridge_model.fit(X_train, y_train)

# Evaluate the models on the held-out rows
rf_pred = rf_model.predict(X_test)
lr_pred = lr_model.predict(X_test)
ridge_pred = ridge_model.predict(X_test)

# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated and removed in recent scikit-learn
# releases, whereas this form works on every version.
print("Random Forest Regressor RMSE:", np.sqrt(mean_squared_error(y_test, rf_pred)))
print("Linear Regression RMSE:", np.sqrt(mean_squared_error(y_test, lr_pred)))
print("Ridge Regression RMSE:", np.sqrt(mean_squared_error(y_test, ridge_pred)))


Random Forest Regressor RMSE: 13.57462126508062
Linear Regression RMSE: 14.918552078644602
Ridge Regression RMSE: 12.071402787587509


In [14]:
# # Create a heatmap
# plt.figure(figsize=(8, 6))
# sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", annot_kws={"size": 10}, xticklabels=corr_matrix.columns, yticklabels=corr_matrix.columns)
# plt.title("Correlation Heatmap")
# plt.show()

In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# Standardise the features for the linear models. The scaler is fitted on the
# training rows only and then applied to the test rows. The random forest is
# trained on the unscaled features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter search for the Random Forest Regressor.
# - The forest is seeded so the search result is reproducible.
# - The search optimises RMSE, the metric reported below, instead of the
#   default R^2 score.
param_grid = {'n_estimators': [100, 200, 300], 'max_depth': [5, 10, 15]}
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=100),
    param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',
)
grid_search.fit(X_train, y_train)

# With the default refit=True, GridSearchCV has already refitted the best
# configuration on the whole training set, so no further fit is needed.
best_rf_model = grid_search.best_estimator_

# The linear models are not tuned; they are refitted on the scaled features.
lr_model.fit(X_train_scaled, y_train)
ridge_model.fit(X_train_scaled, y_train)

# Evaluate the models on the held-out rows
best_rf_pred = best_rf_model.predict(X_test)
lr_pred = lr_model.predict(X_test_scaled)
ridge_pred = ridge_model.predict(X_test_scaled)

# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated and removed in recent scikit-learn
# releases, whereas this form works on every version.
print("Tuned Random Forest Regressor RMSE:", np.sqrt(mean_squared_error(y_test, best_rf_pred)))
print("Linear Regression RMSE:", np.sqrt(mean_squared_error(y_test, lr_pred)))
print("Ridge Regression RMSE:", np.sqrt(mean_squared_error(y_test, ridge_pred)))


Tuned Random Forest Regressor RMSE: 13.38421551007842
Linear Regression RMSE: 14.91752954917916
Ridge Regression RMSE: 14.67278568224769
