# SensorData

## Imports <a name="imports"></a>

In [3]:
#Imports 
import pandas as pd
from sklearn.preprocessing import StandardScaler
import pickle

## Import Data <a name="importData"></a>

### Crowdedness Data <a name="crowdData"></a>
The crowdedness count per camera, per day, per hour

In [2]:
#Load in the data
crowd_df = pd.read_excel("../../../Data_thesis/CMSA/cmsa_data.xlsx")

In [3]:
#Example Contents
crowd_df.head()

Unnamed: 0,richting,Timestamp,StartPoint,StartPointName,EndPoint,EndPointName,MinMeasuredTime,MaxMeasuredTime,AvgMeasuredTime,MedianMeasuredTime,...,tijdstip,weekdag,tijdstip_verschoven,verschoven,datum_verschoven,weekdag_verschoven,tijdstip(15_min),weeknr,weekdag_tijd,weekend filter
0,2,2018-03-11 00:05:00,1,Start,2,End,0,0,0,0,...,00:04:00,,1900-01-01 00:04:00,1,2018-03-10,,1900-01-01 00:00:00,2018-10,1900-01-07,weekend
1,02R,2018-03-11 00:05:00,1,Start,2,End,0,0,0,0,...,00:04:00,,1900-01-01 00:04:00,1,2018-03-10,,1900-01-01 00:00:00,2018-10,1900-01-07,weekend
2,Oude Kennissteeg Occ wifi,2018-03-11 00:05:00,1,Start,2,End,0,0,0,0,...,00:04:00,,1900-01-01 00:04:00,1,2018-03-10,,1900-01-01 00:00:00,2018-10,1900-01-07,weekend
3,2,2018-03-11 00:10:00,1,Start,2,End,0,0,0,0,...,00:10:00,,1900-01-01 00:10:00,1,2018-03-10,,1900-01-01 00:00:00,2018-10,1900-01-07,weekend
4,02R,2018-03-11 00:10:00,1,Start,2,End,0,0,0,0,...,00:10:00,,1900-01-01 00:10:00,1,2018-03-10,,1900-01-01 00:00:00,2018-10,1900-01-07,weekend


### Crowdedness Data (Without Sensor column)
All the data resides in multiple files, where the filename equals the sensor number

In [4]:
blip_df = pd.read_csv("../../../Data_thesis/CMSA/BlipData.csv")

In [5]:
blip_df.head()

Unnamed: 0,Date,Sensor,CrowdednessCount,Hour
0,2018-04-01,GAWW-01,4271,0
1,2018-04-01,GAWW-01,3414,1
2,2018-04-01,GAWW-01,246,10
3,2019-01-01,GAWW-01,2788,3
4,2018-07-04,GAWW-01,1513,15


### Sensor Locations <a name="senData"></a>
The location of the cameras

In [6]:
#Load in the data
sensor_df = pd.read_csv("../../../Data_thesis/Open_Data/crowdedness_sensoren.csv", sep=";")

In [7]:
#Example contents
sensor_df.head()

Unnamed: 0,OBJECTNUMMER,Objectnummer,Soort,Voeding,Rotatie,Actief,Privacyverklaring,WKT_LNG_LAT,WKT_LAT_LNG,LNG,LAT,Unnamed: 11
0,1,GAWW-03,Telcamera,Eigen voeding via lichtnet,115,Ja,,"POINT(4.8973932,52.3725037)","POINT(52.3725037,4.8973932)",48973932,523725037,
1,2,GAWW-03,WiFi sensor,Eigen voeding via lichtnet,115,Ja,https://www.amsterdam.nl/privacy/specifieke/pr...,"POINT(4.8973336,52.3725237)","POINT(52.3725237,4.8973336)",48973336,523725237,
2,3,GAWW-02,WiFi sensor,Eigen voeding via lichtnet,115,Ja,https://www.amsterdam.nl/privacy/specifieke/pr...,"POINT(4.8988705,52.3737982)","POINT(52.3737982,4.8988705)",48988705,523737982,
3,4,GAWW-02,Telcamera,Eigen voeding via lichtnet,115,Ja,,"POINT(4.8989027,52.3737857)","POINT(52.3737857,4.8989027)",48989027,523737857,
4,5,GAWW-01,WiFi sensor,Eigen voeding via lichtnet,0,Ja,https://www.amsterdam.nl/privacy/specifieke/pr...,"POINT(4.8997667,52.374627)","POINT(52.374627,4.8997667)",48997667,52374627,


## Functions

In [4]:
def sensorCoordinates(coor_df, needed_sensors):
    """
    This function retrieves the Longitude and Latitude of the needed Sensors and returns these. 

    Parameters:
    - coor_df: DF with longitude and latitude of all the sensors in Amsterdam
    - needed_sensor: List with all the sensors from which the location must be retrieved

    Returns: Dict[SensorName] : {"Longitude": <longitude>, "Latitude": <latitude>}
    """

    #Variables

    #Dict to saved the needed locations in
    locations_dict = {}

    #Select columns to use
    coor_df = coor_df[["Objectnummer", "LNG", "LAT"]]

    #################################################################################

    #Change Df into dict (Needed for operations on data)
    coor_dict = coor_df.to_dict("index")

    for k, v in coor_dict.items():

        #Save only the cameras with the object nummer given above
        if v["Objectnummer"] in needed_sensors:

            #Replace the "." with "," to make sure the coordinates can be turned into floats
            v["LNG"] = float(v["LNG"].replace(",", "."))
            v["LAT"] = float(v["LAT"].replace(",", "."))

            #Save all contents in seperate dict
            locations_dict[v["Objectnummer"]] = {
                "Longitude": v["LNG"], "Latitude": v["LAT"]}

    return locations_dict

In [5]:
def sensorData(sensor_df, blip_df, locations_dict, needed_sensors, gaww_02, gaww_03, lon_scaler_filename, lat_scaler_filename):
    """
    This function takes all the relevant sensor date and combines this in a single DF 

    Parameters:
    - sensor_df (df): Custom made dataframe with subset sensor data
    - blip_df (df): Constructed df from imported blip data
    - locations_dict (dict): contains the longitude and latitude of the relevant sensors
    - needed_sensors (list): selection of given relevant sensors
    - gaww-02 (list): alternate names for the gaww-02 sensor
    - gaww-03 (list): alternate names for the gaww-03 sensor
    - lon_scaler_filename: where the longitude scaler should be saved
    - lat_scaler_filename: where the latitude scaler should be saved

    Returns: DF with all relevant sensor data
    """

    #Variables

    #Scaler to scale the latitude and longitude of sensors
    scaler = StandardScaler()

    #################################################################################

    #Group the counts of people per hour, per date, per camera
    sensor_df = sensor_df.groupby(["richting", "datum", "uur"])[
        "SampleCount"].sum().reset_index()

    #Rename the columns
    sensor_df = sensor_df.rename(index=str, columns={"richting": "Sensor", "datum": "Date", "uur": "Hour",
                                                   "SampleCount": "CrowdednessCount"})

    #Concatenate the two sensor DF's
    sensor_df = pd.concat([sensor_df, blip_df],
                         sort=True).reset_index().drop(columns={"index"})

    #For the longitude number of the sensor
    sensor_df.insert(3, "SensorLongitude", 0)

    #For the latitude number of the sensor
    sensor_df.insert(4, "SensorLatitude", 0)

    #For the number of the day of the week
    sensor_df.insert(3, "weekday", 99)

    #################################################################################

    #Change Df into dict
    crowd_dict = sensor_df.to_dict("index")

    #Loop over dict
    for k, v in crowd_dict.items():

        #Change camera name
        if v["Sensor"] in gaww_02:
            v["Sensor"] = "GAWW-02"

        #Change camera name
        elif v["Sensor"] in gaww_03:
            v["Sensor"] = "GAWW-03"

        #Make the longitude and latitude consistent
        if v["Sensor"] in needed_sensors:

            v["SensorLongitude"] = locations_dict[v["Sensor"]]["Longitude"]
            v["SensorLatitude"] = locations_dict[v["Sensor"]]["Latitude"]

        #Mulitply hour with 100 (Same structure as the other files)
        v["Hour"] *= 100

        #If the hour is 0, transform it to 2400
        if v["Hour"] == 0:
            v["Hour"] = 2400

        #Save the number of the day of the week       
        try:
            v["weekday"] = v["Date"].weekday()
        except:
            #If the above code fails, the date is not timestamp object yet
            v["Date"] = pd.Timestamp.strptime(v["Date"], "%Y-%m-%d")
            v["weekday"] = v["Date"].weekday()

    #Return from Dict
    full_df = pd.DataFrame.from_dict(crowd_dict, orient="index")

    #################################################################################

    #Only save the sensors that are relevant
    full_df = full_df[full_df["Sensor"].isin(needed_sensors)]

    #Group the multiple different sensor data from same date and hour together
    full_df = full_df.groupby(["Sensor", "Date", "Hour", "SensorLongitude",
                               "SensorLatitude", "weekday"])["CrowdednessCount"].sum().reset_index()

    #################################################################################

    #Scale the Longitude and latitude and save the scaler for later use
    full_df["LonScaled"] = scaler.fit_transform(
        full_df["SensorLongitude"].to_numpy().reshape(-1, 1))
    pickle.dump(scaler, open(lon_scaler_filename, 'wb'))

    full_df["LatScaled"] = scaler.fit_transform(
        full_df["SensorLatitude"].to_numpy().reshape(-1, 1))
    pickle.dump(scaler, open(lat_scaler_filename, 'wb'))

    return full_df

In [29]:
def constructSeriesDF(df):
    df = df.groupby(["weekday", "Sensor"]).agg({"CrowdednessCount": 'mean',
                                               "SensorLongitude": "first",
                                               "SensorLatitude": "first"})
    df.to_csv("../../../Data_thesis/Full_Datasets/TimeSeriesCrowdedness.csv", index=True)

### Variables

In [6]:
#Sensors to use in Sensor Data
needed_sensors = ["GAWW-01", "GAWW-02", "GAWW-03", "GAWW-04", "GAWW-05", "GAWW-06", "GAWW-07"]

#Alternative names Sensors
gaww_02 = [2, "02R", "2R", "Oude Kennissteeg Occ wifi"]
gaww_03 = [3, "03R"]

### Construct Full DF

In [7]:
#Import CSV file
sensor_df = pd.read_excel("../../../../Data_thesis/CMSA/cmsa_data.xlsx")
coor_df = pd.read_csv("../../../../Data_thesis/Open_Data/crowdedness_sensoren.csv", sep=";")
blip_df = pd.read_csv("../../../../Data_thesis/CMSA/BlipData.csv")

In [8]:
#Transform Sensor df
locations_dict = sensorCoordinates(coor_df, needed_sensors)

#Transform Crowdedness df
full_df = sensorData(sensor_df, blip_df, locations_dict,
                     needed_sensors, gaww_02, gaww_03, lon_scaler_filename, lat_scaler_filename)

NameError: name 'lon_scaler_filename' is not defined

In [30]:
constructSeriesDF(full_df)

In [None]:
#Example contents
full_df.head()

## DF to File <a name="CSV"></a>

In [44]:
full_df.to_csv("../../../Data_thesis/Full_Datasets/Crowdedness.csv", index=False)