# Part 3 of Machine Learning 

#### Predicting year when Ireland will get to 99% of Internet Access Penetration rate

using Classification algorithms

In [24]:
##IMPORTING LIBRARIES
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.gridspec import GridSpec
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
#Suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [25]:
# Loading file with processed data (from Data Processing.ipynb Step 5)
df_pr = pd.read_csv("FINAL.csv")
df_pr.head()

Unnamed: 0,Year,Region,Estimated Population nr,Nr of Persons with Internet Access,% of Persons with Internet Access
0,2011,Border,784000.0,517440.0,66.0
1,2011,Dublin,2523000.0,2119320.0,84.0
2,2011,Ireland,9149700.0,7136766.0,78.0
3,2011,Mid-East,1314900.0,1143963.0,87.0
4,2011,Mid-West,933600.0,718872.0,77.0


In [14]:
# Loading predicted data from Part 1 Machine learning (ML Estimated population.ipynb)
dfe = pd.read_csv("dfe.csv")
dfe.head()

Unnamed: 0,Year,Region,Predicted Estimated Population nr
0,2023,South-West,1483545.23
1,2024,South-West,1497658.74
2,2025,South-West,1511772.24
3,2026,South-West,1525885.75
4,2027,South-West,1539999.26


In [26]:
# Loading predicted data from Part 2 Machine learning (ML Nr of population with IA.ipynb)
dfp = pd.read_csv("dfp.csv")
dfp.head()

Unnamed: 0,Year,Region,Predicted Nr of Persons with Internet Access
0,2023,Dublin,2897494.35
1,2024,Dublin,2963599.76
2,2025,Dublin,3029705.16
3,2026,Dublin,3095810.56
4,2027,Dublin,3161915.97


##### mergind all data frames

In [28]:
df_pred = dfe.merge(dfp, on=['Year', 'Region'], how='left')

In [29]:
df_pred.describe()

Unnamed: 0,Year,Predicted Estimated Population nr,Predicted Nr of Persons with Internet Access
count,45.0,45.0,45.0
mean,2025.0,2348936.0,2296941.0
std,1.430194,3021037.0,2975547.0
min,2023.0,640740.4,609329.4
25%,2024.0,923904.8,883805.5
50%,2025.0,1024233.0,991406.5
75%,2026.0,1612315.0,1567357.0
max,2027.0,10798400.0,10848170.0


In [30]:
# Calculating rate of people with Internet Access by dividing number of people with internet access to total number of people
df_pred['% of Persons with Internet Access'] = df_pred['Predicted Nr of Persons with Internet Access']/df_pred['Predicted Estimated Population nr']*100
df_pred.head()

Unnamed: 0,Year,Region,Predicted Estimated Population nr,Predicted Nr of Persons with Internet Access,% of Persons with Internet Access
0,2023,South-West,1483545.23,1413930.25,95.307526
1,2024,South-West,1497658.74,1447736.28,96.666633
2,2025,South-West,1511772.24,1481542.29,98.000363
3,2026,South-West,1525885.75,1515348.32,99.309422
4,2027,South-West,1539999.26,1549154.34,100.594486


In [31]:
#renaming columns to concat them with data frame with years from 
df_pred = df_pred.rename(columns={'Predicted Estimated Population nr': 'Estimated Population nr',
                                  'Predicted Nr of Persons with Internet Access': 'Nr of Persons with Internet Access'})

In [32]:
df_final = pd.concat([df_pr,df_pred],ignore_index=True)

In [33]:
df_final.describe()

Unnamed: 0,Year,Estimated Population nr,Nr of Persons with Internet Access,% of Persons with Internet Access
count,153.0,153.0,153.0,153.0
mean,2019.0,2201954.0,1993066.0,89.241023
std,4.915068,2809968.0,2580091.0,7.604403
min,2011.0,567600.0,424834.0,66.0
25%,2015.0,843352.9,720534.0,85.0
50%,2019.0,995800.0,930959.9,89.5
75%,2023.0,1550029.0,1515348.0,95.307526
max,2027.0,10798400.0,10848170.0,104.962482


In [34]:
#loading data to the scv file to be used in the statistic part
df_final.to_csv("stat.csv", index = False)

In [35]:
#labeling each row with 0 if % of people with Internet Access is less than 99% and 1 otherwise
df_final['YN'] = df_final['% of Persons with Internet Access'].apply(lambda x: 0 if x < 99 else 1)

In [36]:
df_final.describe()

Unnamed: 0,Year,Estimated Population nr,Nr of Persons with Internet Access,% of Persons with Internet Access,YN
count,153.0,153.0,153.0,153.0,153.0
mean,2019.0,2201954.0,1993066.0,89.241023,0.104575
std,4.915068,2809968.0,2580091.0,7.604403,0.30701
min,2011.0,567600.0,424834.0,66.0,0.0
25%,2015.0,843352.9,720534.0,85.0,0.0
50%,2019.0,995800.0,930959.9,89.5,0.0
75%,2023.0,1550029.0,1515348.0,95.307526,0.0
max,2027.0,10798400.0,10848170.0,104.962482,1.0


In [37]:
#Taking Data only for Ireland and splitting them to train and test parts
df_lreg = df_final.drop(df_final[(df_final["Region"] != 'Ireland') ].index)

X = df_lreg[['Year']]
y = df_lreg['YN']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

#### Training our classification models and getting their scores

Logistic Regression

In [48]:
lr = LogisticRegression(solver='liblinear',multi_class='ovr')
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.8333333333333334

SVM

In [49]:
svm = SVC(gamma='auto')
svm.fit(X_train, y_train)
svm.score(X_test, y_test)

0.8333333333333334

Random Forest

In [50]:
rf = RandomForestClassifier(n_estimators=40)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.8333333333333334

#### each training gives different score

### Using Cross Validation to adjust parameters

In [51]:
from sklearn.model_selection import cross_val_score

Logistic classifier model performance using cross_val_score

In [66]:
log_reg_score = cross_val_score(LogisticRegression(solver='liblinear',multi_class='ovr'), X, y,cv=5)
np.average(log_reg_score)

0.9

svm model performance using cross_val_score

In [67]:
svm_score = cross_val_score(SVC(gamma='auto'), X, y,cv=5)
np.average(svm_score)

0.9

random forest performance using cross_val_score

In [68]:
ran_forest = cross_val_score(RandomForestClassifier(n_estimators=40),X, y,cv=5)
np.average(ran_forest)

0.8833333333333334

#### we can use Logistic classifier or svm model  as they give same score for prediction 

In [69]:
years_to_predict = np.array([2023, 2024, 2025, 2026, 2027,2028,2029,2030])

In [72]:
#Using svm mpdel for prediction
data=[]

for year in years_to_predict:
    p = svm.predict([[year]])
    prediction = round(p.item(),2)
    row = [year, prediction]
    data.append(row)
print(data)

[[2023, 0], [2024, 0], [2025, 0], [2026, 0], [2027, 1], [2028, 0], [2029, 0], [2030, 0]]


In [73]:
#Getting the result - 2027 is the year when Ireland will get 99% of population connected to the Internet

In [74]:
#Using svm mpdel for prediction
data=[]

for year in years_to_predict:
    p = lr.predict([[year]])
    prediction = round(p.item(),2)
    row = [year, prediction]
    data.append(row)
print(data)

[[2023, 0], [2024, 0], [2025, 0], [2026, 0], [2027, 0], [2028, 0], [2029, 0], [2030, 0]]


In [75]:
#Getting the result - there is no possibility for Ireland to get 99% of population connected to the Internet by 2030