This notebook explores the labelled postcode data set with the aim of predicting the local authority of a location. (Created by Hadrian)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import folium

from geo import *



In [2]:
# Load the labelled postcode records from the project's resources folder
postcodes_csv = './resources/postcodes_labelled.csv'
df = pd.read_csv(postcodes_csv)
df



Unnamed: 0,postcode,easting,northing,soilType,elevation,localAuthority,riskLabel,medianPrice,historicallyFlooded
0,OL9 7NS,390978,403269,Unsurveyed/Urban,130,Oldham,1,119100.0,False
1,WV13 2LR,396607,298083,Unsurveyed/Urban,130,Walsall,1,84200.0,False
2,LS12 1LZ,427859,432937,Unsurveyed/Urban,60,Leeds,1,134900.0,False
3,SK15 1TS,395560,397900,Unsurveyed/Urban,120,Tameside,1,170200.0,False
4,TS17 9NN,445771,515362,Unsurveyed/Urban,20,Stockton-on-Tees,1,190600.0,False
...,...,...,...,...,...,...,...,...,...
29995,LS16 0BP,425977,438923,Unsurveyed/Urban,160,Leeds,1,,False
29996,SK8 4PG,384808,387982,Unsurveyed/Urban,40,Stockport,1,328700.0,False
29997,HD7 4PA,409215,416819,Cambisols,310,Kirklees,1,214500.0,False
29998,NE16 5YT,419672,560517,Unsurveyed/Urban,130,Gateshead,1,273100.0,False


In [3]:
# Count the missing local-authority labels and express them as a share of all rows
authority_is_null = df['localAuthority'].isnull()
len_null = authority_is_null.sum()
proportion_null = len_null / len(authority_is_null)

print('The length of null value of local authority column is: ', len_null)
print('The proportion of null value of local authority column is: ', proportion_null)

The length of null value of local authority column is:  0
The proportion of null value of local authority column is:  0.0


In [4]:
# Features: the easting/northing coordinates; target: the local authority label.
# Select the columns by name (rather than a label slice, which would silently
# pick up any column placed between them) and copy only what is needed so that
# later changes to X / y cannot affect df.
X = df[['easting', 'northing']].copy()
y = df['localAuthority'].copy()
X

Unnamed: 0,easting,northing
0,390978,403269
1,396607,298083
2,427859,432937
3,395560,397900
4,445771,515362
...,...,...
29995,425977,438923
29996,384808,387982
29997,409215,416819
29998,419672,560517


In [5]:
# Number of postcodes per local authority, to see how balanced the classes are
y.value_counts()

Birmingham       1679
Leeds            1456
Bradford          972
County Durham     957
Sheffield         852
                 ... 
Tamworth          108
Boston            105
Bolsover           97
Melton             86
Rutland            74
Name: localAuthority, Length: 91, dtype: int64

In [6]:
# Convert the easting/northing columns to latitude/longitude with the geo helper
eastings = X['easting'].to_list()
northings = X['northing'].to_list()
lat, long = get_gps_lat_long_from_easting_northing(eastings, northings)
lat, long

(array([53.52600442, 52.58054647, 53.79200533, ..., 53.64779586,
        54.93893651, 52.49427638]),
 array([-2.13756166, -2.05150617, -1.57859152, ..., -1.8620731 ,
        -1.69447943, -1.82948491]))

In [8]:
# Bounding box of all points, rounded outwards to whole degrees
min_long = np.floor(long.min())
max_long = np.ceil(long.max())
min_lat = np.floor(lat.min())
max_lat = np.ceil(lat.max())

# Map centred on the middle of the bounding box; max_bounds together with the
# min/max lat/lon arguments passes the box to folium as the map's bounds.
m = folium.Map(
    max_bounds=True,
    location=[(min_lat+max_lat)/2,(min_long+max_long)/2],
    zoom_start=6,
    min_lat=min_lat,
    max_lat=max_lat,
    min_lon=min_long,
    max_lon=max_long,
)

# Plot only the first rows so the map stays light; never ask for more rows than exist.
n_markers = min(500, len(y))

# Iterate positionally (slices + .iloc) so the labels line up with lat/long
# regardless of what index y carries.
for marker_lat, marker_long, authority in zip(
    lat[:n_markers], long[:n_markers], y.iloc[:n_markers]
):
    folium.Marker(
        location=[marker_lat, marker_long],
        popup=f'{authority}',
        icon=folium.Icon(color='green', icon='ok-sign'),
    ).add_to(m)


m

## Observation:

The local authority appears to depend solely on location: sites that are close to each other generally fall under the same local authority.
>Initial idea: 

>>(1): SVM / SGD classifier 

>>(2) NCA >> KNN (weights = distance) pipeline / radius neighbors classifier

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.neighbors import (KNeighborsClassifier, NeighborhoodComponentsAnalysis)

# Hold out 20% of the rows for testing; fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Two-step pipeline: NCA transform followed by a 3-nearest-neighbour classifier
nca = NeighborhoodComponentsAnalysis(random_state=42)
knn = KNeighborsClassifier(n_neighbors=3)
nca_pipe = Pipeline(steps=[('nca', nca), ('knn', knn)])
nca_pipe.fit(X_train, y_train)

# Report the accuracy on the held-out rows as a percentage
test_accuracy = nca_pipe.score(X_test, y_test)
print('Test accuracy: %.2f%%' % (test_accuracy * 100))



Test accuracy: 98.25%


In [10]:
# Evaluate the fitted pipeline on the held-out rows with weighted-average
# precision, recall and F1 (the cell displays the returned tuple)
from sklearn.metrics import precision_recall_fscore_support

y_pred = nca_pipe.predict(X_test)
precision_recall_fscore_support(y_true=y_test, y_pred=y_pred, average='weighted')


(0.9830163050878573, 0.9825, 0.982522691683106, None)