# Lab No 2: Data Manipulation and Working with Web Services: 2 Challenges

## Challenge No 1:

In [None]:
import pandas as pd
import numpy as np

# Build a reproducible synthetic data frame of people (ID, city, age, salary).

# Total number of rows in the data frame
Total_num_rows = 100

# Cities that are cycled through, in order, to fill the "City" column
cities = ["New York", "London", "Tokyo"]

# Fixed seed so that Restart Kernel -> Run All reproduces the same random values
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

# Column definitions for the table
data = {
    # Row identifier: 0, 1, ..., Total_num_rows - 1
    "ID": list(range(Total_num_rows)),

    # Repeat the city list cyclically until every row has a city
    "City": [cities[i % len(cities)] for i in range(Total_num_rows)],

    # Random age from 18 to 65; endpoint=True makes the upper bound inclusive,
    # so 65 can actually occur
    "Age": rng.integers(18, 65, size=Total_num_rows, endpoint=True),

    # Random salary from 0 to 99,999 USD, upper bound inclusive
    "Salary(USD)": rng.integers(0, 99_999, size=Total_num_rows, endpoint=True),
}

# Assemble the data frame from the column dictionary
df = pd.DataFrame(data)

# Display the first five rows of the data frame
df.head()



In [None]:
# Create a subset of the original data frame: the first 30 rows and the
# first 3 columns. Positional slices are the idiomatic iloc form and, unlike
# explicit range() indexers, do not raise if the frame has fewer rows/columns.
subset_df = df.iloc[:30, :3]

# Display the subset (a bare last expression uses the notebook's rich table display)
subset_df

In [None]:
# Build one boolean mask per condition, then combine them to filter the rows.
is_older_than_30 = df["Age"] > 30          # age strictly greater than 30
is_new_york = df["City"] == "New York"     # city is exactly "New York"

# Keep only the rows that satisfy both conditions
filtered_df = df.loc[is_older_than_30 & is_new_york]

# Display the filtered subset
print(filtered_df)


In [None]:
# Display pandas' built-in summary statistics for the data frame `df`
df.describe()

In [None]:
# Calculate the mean of every numeric column in the data frame
# (numeric_only=True leaves the text column "City" out of the calculation)
df.mean(numeric_only=True)

In [None]:
# Calculate the standard deviation of every numeric column in the data frame
# (numeric_only=True leaves the text column "City" out of the calculation)
df.std(numeric_only=True)

In [None]:
# Group the rows by the different values of City and calculate the mean of
# each numeric column per city. numeric_only=True matches the ungrouped
# df.mean(numeric_only=True) call and keeps this cell working if a
# non-numeric column is ever added to the frame.
df.groupby('City').mean(numeric_only=True)

In [None]:
# Group the rows by the different values of City and calculate the standard
# deviation of each numeric column per city. numeric_only=True matches the
# ungrouped df.std(numeric_only=True) call and keeps this cell working if a
# non-numeric column is ever added to the frame.
df.groupby('City').std(numeric_only=True)

## Challenge No 2:

**Part No 1:**

In [None]:
import requests # Allow us to access data via API
import pandas as pd
import geopandas as gpd

# Download bike rentals between 2022-05-01 and 2023-05-01 from the Glasgow
# mobility API and load them into a pandas data frame.

# Define the API url (the date range is part of the query string)
url_bikes = "https://api.glasgow.gov.uk/mobility/v1/get_rentals?startDate=2022-05-01&endDate=2023-05-01"

# Request the data; the timeout stops the cell from hanging forever if the
# server does not answer
response = requests.get(url_bikes, timeout=60)

# Fail here with a clear HTTP error instead of a confusing JSON/KeyError below
response.raise_for_status()

# Parse the JSON body of the response
data = response.json()

# The rental records are stored under the 'data' key; build the data frame from them
rental_data = data['data']
rental_pd = pd.DataFrame(rental_data)


In [None]:
# Test for NaN values in the latitude and longitude columns of the data frame
nan_in_column_Lat = rental_pd['endPlaceLat'].isna().any()
nan_in_column_Long = rental_pd['endPlaceLong'].isna().any()

# Display both results: latitude first, then longitude
print(nan_in_column_Lat, nan_in_column_Long)

In [None]:
# Rentals without an end coordinate cannot be placed on a map, so drop every
# row in which the end latitude or the end longitude is NaN.
clean_rental_pd = rental_pd.dropna(
    axis="index",   # drop rows, not columns
    how="any",      # a single missing coordinate is enough to drop the row
    subset=["endPlaceLat", "endPlaceLong"],
)

In [None]:
# Turn the cleaned rentals into a GeoDataFrame of rental end points.

# Point geometries: x = longitude ('endPlaceLong'), y = latitude ('endPlaceLat')
end_points = gpd.points_from_xy(
    clean_rental_pd['endPlaceLong'],
    clean_rental_pd['endPlaceLat'],
)

# Attach the geometries and declare the CRS as EPSG:4326
gdf_bikes_end = gpd.GeoDataFrame(
    clean_rental_pd,
    geometry=end_points,
).set_crs("EPSG:4326")

# Display the interactive explore map
gdf_bikes_end.explore()


In [None]:
# Drop the unnecessary attributes by keeping only the columns listed below
keep_cols = [
    "endDate",
    "endPlaceId",
    "endPlaceName",
    "durationSeconds",
    "isInvalid",
    "price",
    "isEbike",
    "endPlaceLat",
    "endPlaceLong",
    "geometry",
]
# .copy() makes the subset an independent frame, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning
gdf_bikes_end = gdf_bikes_end[keep_cols].copy()

# Enforce the data types; bracket assignment is used because attribute-style
# assignment (gdf.col = ...) is easy to get wrong and is not the documented
# way to set a column
gdf_bikes_end["endPlaceId"] = gdf_bikes_end["endPlaceId"].astype(int)
gdf_bikes_end["endPlaceName"] = gdf_bikes_end["endPlaceName"].astype(str)
# Parse timestamps written like 2022-05-01T12:34:56Z
gdf_bikes_end["endDate"] = pd.to_datetime(gdf_bikes_end["endDate"], format='%Y-%m-%dT%H:%M:%SZ')


In [None]:
from sklearn.cluster import KMeans

# Number of cluster areas to create
num_clusters = 4

# K-means model; random_state is fixed so repeated runs use the same seed
kmeans_collection = KMeans(random_state=42, n_clusters=num_clusters)

# Cluster on the end-point coordinates (longitude, latitude) ...
end_coords = gdf_bikes_end[['endPlaceLong', 'endPlaceLat']]
cluster_labels = kmeans_collection.fit_predict(end_coords)

# ... and store the resulting cluster label of every row in a new column
gdf_bikes_end['kmeans_cluster'] = cluster_labels

# Display the first five rows to check that the kmeans_cluster column was added
gdf_bikes_end.head()

In [None]:
# Count the distinct values in kmeans_cluster; this should equal the
# number of clusters requested above (num_clusters = 4)
gdf_bikes_end['kmeans_cluster'].nunique()

In [None]:
import geopandas as gpd
import leafmap

import matplotlib.colors as mcolors

# Author's note: leafmap could not be run in Jupyter Notebook on the author's
# MacBook, so this cell was tested in Google Colab instead.
# The first version of this cell was written without mcolors and cmap='Set1';
# it rendered a map with only three colours, which is why the customised
# colour map below was added.

# One fixed colour per cluster, wrapped in a colour map object so that it can
# be passed to .add_data() further down
colors = ["red", "blue", "green", "purple"]
cmap = mcolors.ListedColormap(colors)

# Create the leafmap map with its starting centre point and zoom level
m = leafmap.Map(center=(55.860166, -4.257505), zoom=12)

# Add the 'CartoDB.Positron' basemap
m.add_basemap("CartoDB.Positron")

# Add 'gdf_bikes_end' to the map, coloured by its 'kmeans_cluster' column
m.add_data(
    gdf_bikes_end,
    column="kmeans_cluster",
    cmap=cmap,                # the customised colours defined above
    legend_title="Clusters",
    add_legend=True,
)

# Render the map as the cell output
m

**Part No 2:**

In [None]:
import requests
import pandas as pd
import geopandas as gpd

# Request the traffic sensor locations from the Glasgow traffic API.
# https is used here, as for the bike rental request on the same host, so the
# data is not transferred over an unencrypted connection.
url_sensor = 'https://api.glasgow.gov.uk/traffic/v1/movement/sites'

# The timeout stops the cell from hanging forever if the server does not answer
response = requests.get(url_sensor, timeout=60)

# Stop here with a clear HTTP error if the request was not successful
response.raise_for_status()

# Display the response; status code 200 means the request was satisfied
response

In [None]:
# Parse the JSON body of the sensor response and build a data frame from it.
# Note: `df` is rebound here, so from this cell on it no longer refers to the
# Challenge 1 data frame.
data = response.json()
df = pd.DataFrame(data)

# Display the parsed JSON to review the data structure
# NOTE(review): this prints the complete payload, which may be long
data

In [None]:
# The longitude, latitude and description of each sensor are stored in the
# dictionary held in the 'from' column, so they are extracted into columns of
# their own.

# New column 'from_description' with the value of 'description' under 'from'
# (empty string when the key is missing)
df['from_description'] = df['from'].apply(lambda x: x.get('description', ''))

# New column 'from_lat' with the value of 'lat' under 'from'.
# A missing key becomes NaN rather than 0: a default of 0 would silently place
# the sensor at latitude 0 and could never be found by a later NaN check.
df['from_lat'] = df['from'].apply(lambda x: float(x.get('lat', float('nan'))))

# New column 'from_long' with the value of 'long' under 'from' (NaN when missing)
df['from_long'] = df['from'].apply(lambda x: float(x.get('long', float('nan'))))

# After creating the new columns, delete the original columns 'from' and 'to'.
# Because 'from' is removed, this cell can only be run once per loaded `df`.
df = df.drop(columns=['from', 'to'])

# Display the first five rows to review the data
df.head()

In [None]:
# Check both coordinate columns for NaN values in a single pass
nan_flags = df[['from_long', 'from_lat']].isna().any()
nan_in_column_Long = nan_flags['from_long']
nan_in_column_Lat = nan_flags['from_lat']

# Display the result: longitude first, then latitude
print(nan_in_column_Long, nan_in_column_Lat)

In [None]:
import os

# Location of the workplace zones shape file. The default is the author's
# local path; set the environment variable WORKPLACE_ZONES_PATH to run the
# notebook on another machine without editing this cell.
zone_path = os.environ.get(
    "WORKPLACE_ZONES_PATH",
    "/Users/hoitik/Desktop/Assginment_1/Lab 2/WorkplaceZones2011Scotland",
)

# Read the shape file
zone = gpd.read_file(zone_path)

# Reproject to EPSG:4326
zone = zone.to_crs("EPSG:4326")

# The shape file is on a national scale, which costs extra time to run.
# To cut it down to Glasgow city, keep only the rows with 'S12000046'
# in the column 'LADCD'.
zone_glasgow = zone[zone["LADCD"] == "S12000046"]

# Display the map and review
zone_glasgow.explore()

In [None]:
# With the zones loaded, build a GeoDataFrame of the sensor locations and
# clip it with 'zone_glasgow'.

# Point geometries: x = longitude ('from_long'), y = latitude ('from_lat')
sensor_points = gpd.points_from_xy(df['from_long'], df['from_lat'])

# GeoDataFrame of all sensors, declared in CRS EPSG:4326
sensors_all = gpd.GeoDataFrame(df, geometry=sensor_points, crs='EPSG:4326')

# Clip the sensor locations with 'zone_glasgow'
gdf_sensors = gpd.clip(sensors_all, zone_glasgow)

# Display and review the map
gdf_sensors.explore()

In [None]:
# Spatially join each sensor to the zone(s) it intersects.
# how="left" keeps every sensor row, including sensors that match no zone.
joined = gpd.sjoin(
    left_df=gdf_sensors,
    right_df=zone_glasgow,
    predicate="intersects",
    how="left",
)

# Display the first five rows to review
joined.head()

In [None]:
# Count the number of sensors in each zone, keyed by 'WZCD', and store the
# result in a new column 'sensor_count'
sensor_counts = joined.groupby("WZCD").size().reset_index(name="sensor_count")

# Merge the counts into 'zone_glasgow'; how="left" keeps zones without sensors
counts_by_zone = zone_glasgow.merge(sensor_counts, on="WZCD", how="left")

# Zones without any sensor have no match in the merge: set their count to 0.
# Only 'sensor_count' is filled, so the geometry and the other zone attributes
# are left untouched, and the counts are stored as whole numbers again.
counts_by_zone["sensor_count"] = counts_by_zone["sensor_count"].fillna(0).astype(int)

# Plot the map. explore() forwards 'location' and 'zoom_start' to the
# underlying folium map; 'center' and 'zoom' are not folium map options.
counts_by_zone.explore(
    location=(55.860166, -4.257505),
    zoom_start=12,
    column="sensor_count",
    cmap="cividis",
    legend=True)