# DATA IMPORT AND PROCESSING

**Import Libraries**

In [1]:
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

**Get Data From Wikipedia Page**

In [97]:
# Wikipedia page "List of postal codes of Canada: M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html returns a list with one DataFrame per table parsed from the page;
# the first table is the one used from here on.
page_tables = pd.read_html(url)
df = page_tables[0]


*Rename Header of Table*

In [98]:
# Label the three columns by position. The 'Borought' spelling is kept exactly
# as written because the later cells look the column up under this name.
header_names = ['Postcode', 'Borought', 'Neighbourhood']
df.columns = header_names
df.head()

Unnamed: 0,Postcode,Borought,Neighbourhood
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


*Drop First Row containing same value as in Header* 

In [99]:
# Remove the row that merely repeats the table header (its Postcode cell holds
# the literal text 'Postcode'). Filtering on the value, rather than dropping
# whatever sits at position 0, keeps this cell safe to re-run: a second run
# finds nothing to remove instead of discarding a real record.
df = df.loc[df['Postcode'] != 'Postcode'].copy()

**Drop Rows Where the Borough Is "Not assigned"**

In [100]:
# Index labels of every row whose borough is the placeholder 'Not assigned'
unassigned_borough = df['Borought'] == 'Not assigned'
for_drop = df.index[unassigned_borough]

# Remove those rows from df in place
df.drop(labels=for_drop, axis=0, inplace=True)

**Combine Values of Neighbourhood of Same Postcode**

In [180]:
def foo(names):
    """Join an iterable of neighbourhood names into one comma-separated string."""
    return ",".join(names)


# Collapse the table to one row per postcode: keep the first borough listed
# for the postcode and join all of its neighbourhood names with commas.
# reset_index turns 'Postcode' back into an ordinary column.
df = (
    df.groupby(by='Postcode')
      .agg({'Borought': 'first', 'Neighbourhood': foo})
      .reset_index()
)
df.head(5)

Unnamed: 0,Postcode,Borought,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Replace "Not assigned" values in Neighbourhood with the Borough value from the same row

In [104]:
# Where a neighbourhood is still the placeholder 'Not assigned', use the
# borough name from the same row instead. The write goes through df.loc with a
# boolean mask so it lands on df itself and stays row-aligned; re-running the
# cell is a no-op once no placeholder remains.
needs_fill = df['Neighbourhood'] == 'Not assigned'
df.loc[needs_fill, 'Neighbourhood'] = df.loc[needs_fill, 'Borought']

## Results

In [107]:
# (rows, columns) of df after the cleaning steps above
df.shape

(103, 3)

### Second Question

*Read and import location data into Data Frame*

In [142]:
# Location of the postal-code coordinates CSV. Set the GEOSPATIAL_CSV
# environment variable to point at your own copy of the file; when it is not
# set, the original local download path is used as the fallback.
geo_csv_path = os.environ.get(
    'GEOSPATIAL_CSV',
    'C:\\Users\\Hasnat\\Downloads\\Geospatial_Coordinates.csv',
)
loca = pd.read_csv(geo_csv_path)
loca.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Rename Headers

In [143]:
# Relabel the three columns by position so the postal-code column is called
# 'Postcode', the same name as the key column in df.
geo_headers = ['Postcode', 'Latitude', 'Longitude']
loca.columns = geo_headers
loca.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


**Merge Both Data Frames based on Postcode**

In [181]:
# Inner join on 'Postcode': only postcodes present in both df and loca are
# kept, and each kept row carries the Latitude/Longitude columns from loca.
comp = df.merge(loca, on='Postcode', how='inner')
comp.head()

Unnamed: 0,Postcode,Borought,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Question No 3
**Filter Data Based on Borough Name**

In [174]:
# Split the merged table by borough name. The match is a substring test, so
# any borough whose name contains the word is included (e.g. 'North York'
# falls into the York group).
borough_names = comp['Borought']
complete_Toronto = comp[borough_names.str.contains('Toronto')]
complete_york = comp[borough_names.str.contains('York')]

*X values for Clustering*

In [178]:
# Feature matrices for clustering: only the two coordinate columns
coordinate_columns = ['Latitude', 'Longitude']
X_Toronto = complete_Toronto.loc[:, coordinate_columns]
X_York = complete_york.loc[:, coordinate_columns]

*Required Packages*

In [160]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
import matplotlib.cm as cm
from scipy.spatial.distance import cdist, pdist
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import copy
import json
import math
from collections import OrderedDict
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

**Cluster Using K-Means**

In [192]:
# Number of clusters, shared by both models
kclusters = 5

# K-means on the Toronto coordinates; random_state is fixed so repeated runs
# start from the same seed.
kmeans = KMeans(n_clusters=kclusters, random_state=0)
kmeans.fit(X_Toronto)

# K-means on the York coordinates, same settings
kmeans_York = KMeans(n_clusters=kclusters, random_state=0)
kmeans_York.fit(X_York)

# Show the cluster label assigned to the first ten rows of each group
toronto_preview = kmeans.labels_[:10]
york_preview = kmeans_York.labels_[:10]
print(toronto_preview, york_preview)

[3 3 3 3 1 1 1 1 1 1] [1 1 1 1 4 4 4 4 1 1]
