# Import data and clean it

In [1]:
import streamlit as st
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
# Open the JSON file.
# The encoding is passed explicitly so that decoding does not depend on the
# platform's default locale encoding (JSON text is expected to be UTF-8), and
# json.load parses directly from the file handle.
with open("Historique des positions.json", "r", encoding="utf-8") as location_history:
    location_data = json.load(location_history)

In [3]:
# Keep only the "locations" section of the loaded JSON and turn it into a table
df = pd.DataFrame(data=location_data["locations"])

In [4]:
# Remove the columns that are not needed for the rest of the notebook.
# errors="ignore" skips any listed column that is absent from the frame, and
# re-binding df (instead of inplace=True) keeps the cell safe to re-run:
# a second execution no longer raises KeyError on already-dropped columns.
df = df.drop(
    columns=[
        "accuracy",
        "activity",
        "altitude",
        "verticalAccuracy",
        "velocity",
        "heading",
        "source",
        "deviceTag",
        "platform",
        "platformType",
        "locationMetadata",
    ],
    errors="ignore",
)

In [5]:
# Put latitude, longitude and time in a usable format.
# The *E7 columns are scaled down by 10**7 to obtain the coordinates.
df["latitude"] = df["latitudeE7"] / 10_000_000
df["longitude"] = df["longitudeE7"] / 10_000_000
# timestampMs may be stored as strings in the export (TODO confirm); casting to
# a numeric dtype first avoids relying on pd.to_datetime's deprecated handling
# of string input combined with unit=. The cast is a no-op for integer input.
df["ts"] = pd.to_datetime(pd.to_numeric(df["timestampMs"]), unit="ms")

In [6]:
# Inspect the parsed timestamps. A bare last expression is displayed by the
# notebook, so wrapping it in print() is unnecessary.
df["ts"]

0        2016-09-03 07:37:26.200
1        2016-09-03 07:37:41.569
2        2016-09-03 07:38:01.732
3        2016-09-03 07:38:16.881
4        2016-09-03 07:39:33.123
                   ...          
344565   2021-09-01 16:10:52.642
344566   2021-09-01 16:11:08.000
344567   2021-09-01 16:11:24.581
344568   2021-09-01 16:11:40.121
344569   2021-09-01 16:11:56.261
Name: ts, Length: 344570, dtype: datetime64[ns]


In [7]:
# Make sure the time column is in datetime format.
# pd.to_datetime converts the whole column in one vectorized call instead of
# being invoked once per row through .map.
df["ts"] = pd.to_datetime(df["ts"])
df["ts"]

0        2016-09-03 07:37:26.200
1        2016-09-03 07:37:41.569
2        2016-09-03 07:38:01.732
3        2016-09-03 07:38:16.881
4        2016-09-03 07:39:33.123
                   ...          
344565   2021-09-01 16:10:52.642
344566   2021-09-01 16:11:08.000
344567   2021-09-01 16:11:24.581
344568   2021-09-01 16:11:40.121
344569   2021-09-01 16:11:56.261
Name: ts, Length: 344570, dtype: datetime64[ns]

In [7]:
# Display the frame after the column clean-up and the latitude/longitude/ts
# conversions (the rendered output below is truncated with a "..." row).
df

Unnamed: 0,timestampMs,latitudeE7,longitudeE7,latitude,longitude,ts
0,1472888246200,488210761,22914577,48.821076,2.291458,2016-09-03 07:37:26.200
1,1472888261569,488210661,22914678,48.821066,2.291468,2016-09-03 07:37:41.569
2,1472888281732,488210661,22914678,48.821066,2.291468,2016-09-03 07:38:01.732
3,1472888296881,488210688,22914651,48.821069,2.291465,2016-09-03 07:38:16.881
4,1472888373123,488211546,22914640,48.821155,2.291464,2016-09-03 07:39:33.123
...,...,...,...,...,...,...
344565,1630512652642,487884448,23819383,48.788445,2.381938,2021-09-01 16:10:52.642
344566,1630512668000,487884395,23819350,48.788440,2.381935,2021-09-01 16:11:08.000
344567,1630512684581,487884426,23819419,48.788443,2.381942,2021-09-01 16:11:24.581
344568,1630512700121,487884477,23819359,48.788448,2.381936,2021-09-01 16:11:40.121


In [8]:
# Number of rows recorded for each distinct latitude value
latitude_groups = df.groupby("latitude")
latitude_groups.size()

latitude
33.818854    22
33.823736     1
33.824151     1
33.824205    17
33.824298     1
             ..
49.417352     1
49.417367     1
49.417445     1
49.417448     1
49.421582     1
Length: 55402, dtype: int64

In [8]:
# Helper functions that extract date/time information to add to the dataset
def get_weekday(dt):
    """Return the day of the week of ``dt`` as given by ``dt.weekday()``.

    For datetime objects this is an integer where Monday is 0 and Sunday is 6.
    """
    weekday_index = dt.weekday()
    return weekday_index

def get_monthday(dt):
    """Return the ``month`` attribute of ``dt``.

    Despite the function name, this is the month number, not the day of the
    month.
    """
    month_number = dt.month
    return month_number

def get_year(dt):
    """Return the ``year`` attribute of ``dt``."""
    year_number = dt.year
    return year_number

def get_hour(dt):
    """Return the ``hour`` attribute of ``dt``."""
    hour_of_day = dt.hour
    return hour_of_day


In [9]:
# Day of the week for every row (Monday is 0), computed with the vectorized
# .dt accessor instead of calling a Python function once per row.
df["weekday"] = df["ts"].dt.weekday

In [10]:
# Month number for every row, computed with the vectorized .dt accessor
# instead of calling a Python function once per row.
df["month"] = df["ts"].dt.month

In [11]:
# Year for every row, computed with the vectorized .dt accessor instead of
# calling a Python function once per row.
df["year"] = df["ts"].dt.year

In [12]:
# Hour of the day for every row, computed with the vectorized .dt accessor
# instead of calling a Python function once per row.
df["hour"] = df["ts"].dt.hour

In [13]:
def count_rows(rows):
    """Return the number of items in ``rows`` as reported by ``len``."""
    row_count = len(rows)
    return row_count

In [14]:
# Display the frame with the added weekday, month, year and hour columns
# (the rendered output below is truncated with a "..." row).
df

Unnamed: 0,timestampMs,latitudeE7,longitudeE7,latitude,longitude,ts,weekday,month,year,hour
0,1472888246200,488210761,22914577,48.821076,2.291458,2016-09-03 07:37:26.200,5,9,2016,7
1,1472888261569,488210661,22914678,48.821066,2.291468,2016-09-03 07:37:41.569,5,9,2016,7
2,1472888281732,488210661,22914678,48.821066,2.291468,2016-09-03 07:38:01.732,5,9,2016,7
3,1472888296881,488210688,22914651,48.821069,2.291465,2016-09-03 07:38:16.881,5,9,2016,7
4,1472888373123,488211546,22914640,48.821155,2.291464,2016-09-03 07:39:33.123,5,9,2016,7
...,...,...,...,...,...,...,...,...,...,...
344565,1630512652642,487884448,23819383,48.788445,2.381938,2021-09-01 16:10:52.642,2,9,2021,16
344566,1630512668000,487884395,23819350,48.788440,2.381935,2021-09-01 16:11:08.000,2,9,2021,16
344567,1630512684581,487884426,23819419,48.788443,2.381942,2021-09-01 16:11:24.581,2,9,2021,16
344568,1630512700121,487884477,23819359,48.788448,2.381936,2021-09-01 16:11:40.121,2,9,2021,16


In [15]:
# Write only the cleaned columns to a CSV file, without the DataFrame index
export_columns = ["latitude", "longitude", "ts", "weekday", "month", "year", "hour"]
df[export_columns].to_csv("all_clean_data.csv", index=False)

print("Cleaning completed")

Cleaning completed
