In [1]:
#Importing needed libraries:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import seaborn as sns
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import warnings
warnings.filterwarnings("ignore")
sns.set(style="whitegrid")#setting grid for all plots

In [2]:
# Loading the dataset:
df = pd.read_csv('https://ws.cso.ie/public/api.restful/PxStat.Data.Cube_API.ReadDataset/PEA11/CSV/1.0/en')

In [3]:
df

Unnamed: 0,STATISTIC,STATISTIC Label,TLIST(A1),Year,C02076V03371,Single Year of Age,C02199V02655,Sex,UNIT,VALUE
0,PEA11,Population estimates from 1926,1926,1926,-,All ages,-,Both sexes,Number,2971992
1,PEA11,Population estimates from 1926,1926,1926,-,All ages,1,Male,Number,1506889
2,PEA11,Population estimates from 1926,1926,1926,-,All ages,2,Female,Number,1465103
3,PEA11,Population estimates from 1926,1926,1926,200,Under 1 year,-,Both sexes,Number,55391
4,PEA11,Population estimates from 1926,1926,1926,200,Under 1 year,1,Male,Number,28084
...,...,...,...,...,...,...,...,...,...,...
11812,PEA11,Population estimates from 1926,2023,2023,098,98 years,1,Male,Number,197
11813,PEA11,Population estimates from 1926,2023,2023,098,98 years,2,Female,Number,632
11814,PEA11,Population estimates from 1926,2023,2023,646,99 years and over,-,Both sexes,Number,1600
11815,PEA11,Population estimates from 1926,2023,2023,646,99 years and over,1,Male,Number,352


In [4]:
# Dropping the 'STATISTIC Label' column it's irrelevant for the analysis
df_col_drop = ['STATISTIC','STATISTIC Label','TLIST(A1)','C02076V03371','C02199V02655','UNIT']
df.drop(df_col_drop ,axis=1, inplace=True) 

In [5]:
df

Unnamed: 0,Year,Single Year of Age,Sex,VALUE
0,1926,All ages,Both sexes,2971992
1,1926,All ages,Male,1506889
2,1926,All ages,Female,1465103
3,1926,Under 1 year,Both sexes,55391
4,1926,Under 1 year,Male,28084
...,...,...,...,...
11812,2023,98 years,Male,197
11813,2023,98 years,Female,632
11814,2023,99 years and over,Both sexes,1600
11815,2023,99 years and over,Male,352


In [6]:
# Renaming the columns
df = df.rename(columns={'Single Year of Age': 'Age_Group','VALUE':'Population'})

In [7]:
df

Unnamed: 0,Year,Age_Group,Sex,Population
0,1926,All ages,Both sexes,2971992
1,1926,All ages,Male,1506889
2,1926,All ages,Female,1465103
3,1926,Under 1 year,Both sexes,55391
4,1926,Under 1 year,Male,28084
...,...,...,...,...
11812,2023,98 years,Male,197
11813,2023,98 years,Female,632
11814,2023,99 years and over,Both sexes,1600
11815,2023,99 years and over,Male,352


In [8]:
# #Normalising Population
# df['VALUE'] = df['VALUE'] *100000

In [9]:
 #Normalising Population
df['Population'] = df['Population'] /100000

In [10]:
# Dropping rows based on criteria
df = df[~((df['Age_Group'] == 'All ages') |  (df['Sex'] == 'Both sexes'))] # Drop both sexes different that All ages

In [11]:
df['Age_Group'] = df['Age_Group'].str.replace('Under 1 year', '1 year').str.replace('99 years and over', '99 years')

In [12]:
df

Unnamed: 0,Year,Age_Group,Sex,Population
4,1926,1 year,Male,0.28084
5,1926,1 year,Female,0.27307
7,1926,1 year,Male,0.28374
8,1926,1 year,Female,0.27502
10,1926,2 years,Male,0.29728
...,...,...,...,...
11810,2023,97 years,Female,0.00821
11812,2023,98 years,Male,0.00197
11813,2023,98 years,Female,0.00632
11815,2023,99 years,Male,0.00352


In [13]:
# Create a new column 'Age' based on the modified values from 'Age_Group'
df['Age'] = df['Age_Group'].apply(lambda x: int(x.split(' ')[0]))

# Convert the 'Age' column to int64
df['Age'] = df['Age'].astype('int64')

In [14]:
df

Unnamed: 0,Year,Age_Group,Sex,Population,Age
4,1926,1 year,Male,0.28084,1
5,1926,1 year,Female,0.27307,1
7,1926,1 year,Male,0.28374,1
8,1926,1 year,Female,0.27502,1
10,1926,2 years,Male,0.29728,2
...,...,...,...,...,...
11810,2023,97 years,Female,0.00821,97
11812,2023,98 years,Male,0.00197,98
11813,2023,98 years,Female,0.00632,98
11815,2023,99 years,Male,0.00352,99


In [15]:
# # Dropping the 'STATISTIC Label' column it's irrelevant for the analysis
df_col_drop = ['Age_Group']
df.drop(df_col_drop ,axis=1, inplace=True) 

In [16]:
# Dropping rows based on criteria
df = df[~((df['Sex'] != 'Female'))] # Drop both sexes different that All ages

In [17]:
df = df.groupby('Year')['Population'].sum().reset_index() # Creating a df

In [18]:
df

Unnamed: 0,Year,Population
0,1926,14.65103
1,1936,14.47966
2,1946,14.6023
3,1951,14.53996
4,1961,14.01792
5,1966,14.3497
6,1971,14.82488
7,1979,16.74945
8,1981,17.14051
9,1986,17.70953


In [19]:
df

Unnamed: 0,Year,Population
0,1926,14.65103
1,1936,14.47966
2,1946,14.6023
3,1951,14.53996
4,1961,14.01792
5,1966,14.3497
6,1971,14.82488
7,1979,16.74945
8,1981,17.14051
9,1986,17.70953


In [20]:


# Split the dataset into features and target variable
X = df [['Year']]
y = df ['Population']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.641, random_state=2)

# Train a linear regression model
linear_regressor = LinearRegression()
linear_regressor.fit(X_train, y_train)

# Train a random forest regressor
rf_regressor = RandomForestRegressor()
rf_regressor.fit(X_train, y_train)

# Make predictions
y_pred_lr = linear_regressor.predict(X_test)
y_pred_rf = rf_regressor.predict(X_test)

# Evaluate the models
mse_lr = mean_squared_error(y_test, y_pred_lr)*100
mse_rf = mean_squared_error(y_test, y_pred_rf)*100

print(f"Linear Regression RMSE: {mse_lr}")
print(f"Random Forest Regressor RMSE: {mse_rf}")


Linear Regression RMSE: 806.7838612657546
Random Forest Regressor RMSE: 29.939265973891025


In [21]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Assuming df is your DataFrame

# Preprocessing
# Handle missing values and encode categorical variables if needed

# Split the dataset into features and target variable
X = df[['Year']]
y = df['Population']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.641, random_state=2)

# Train a linear regression model
linear_regressor = LinearRegression()
linear_regressor.fit(X_train, y_train)

# Train a random forest regressor
rf_regressor = RandomForestRegressor()
rf_regressor.fit(X_train, y_train)

# Train a gradient boosting regressor
gb_regressor = GradientBoostingRegressor()
gb_regressor.fit(X_train, y_train)

# Train a ridge regression model
ridge_regressor = Ridge()
ridge_regressor.fit(X_train, y_train)


# Make predictions
y_pred_lr = linear_regressor.predict(X_test)
y_pred_rf = rf_regressor.predict(X_test)
y_pred_gb = gb_regressor.predict(X_test)
y_pred_ridge = ridge_regressor.predict(X_test)
# Evaluate the models
mse_lr = mean_squared_error(y_test, y_pred_lr) * 100
mse_rf = mean_squared_error(y_test, y_pred_rf) * 100
mse_gb = mean_squared_error(y_test, y_pred_gb) * 100

print(f"Linear Regression RMSE: {mse_lr}")
print(f"Random Forest Regressor RMSE: {mse_rf}")
print(f"Gradient Boosting Regressor RMSE: {mse_gb}")
print(f"Ridge Regression RMSE: {mse_ridge}")


NameError: name 'Ridge' is not defined

In [None]:
import matplotlib.pyplot as plt

# Plotting the results
plt.figure(figsize=(10, 6))
plt.scatter(X_test, y_test, color='black', label='Actual')
plt.scatter(X_test, y_pred_lr, color='blue', label='Linear Regression Prediction')
plt.scatter(X_test, y_pred_rf, color='green', label='Random Forest Prediction')
plt.scatter(X_test, y_pred_gb, color='red', label='Gradient Boosting Prediction')
plt.xlabel('Year')
plt.ylabel('Population')
plt.legend()
plt.show()
