In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from datetime import datetime 
from IPython.display import display, HTML, clear_output
import gc
import seaborn as sns
from scipy.stats import normaltest, boxcox, probplot
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from random import shuffle
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import f1_score, balanced_accuracy_score
from statsmodels.tsa.stattools import adfuller

In [2]:
# dataset2019: the complete SUMO data set including occupancy
# (obtained on request from the SUMO office).
dataset2019_path = r'C:\Users\ing_l\Tesis grado\Data\dataset_2019.csv'

In [3]:
# Columns kept from the raw CSV, in the order the rest of the notebook expects.
columns_to_keep = [
    'id_cuadra', 'direccion', 'fecha', 'tiempo',
    'operacion', 'ocupacion', 'latitud', 'longitud',
]

dataset2019 = (
    pd.read_csv(dataset2019_path)[columns_to_keep]
    # Placeholder weekday (-1); the real value is filled in by correct_type.
    .assign(**{'dia de la semana': -1})
    # Label the column axis so the frame identifies itself when displayed.
    .rename_axis('dataset2019', axis='columns')
)

In [None]:
def correct_type(row):
    '''
    Convert the raw fields of a single record to proper Python types.

    The record is modified in place and also returned:
      * 'fecha'            -> datetime.date (parsed day-first)
      * 'tiempo'           -> datetime.time
      * 'ocupacion'        -> int
      * 'dia de la semana' -> weekday of 'fecha' (0 = Monday ... 6 = Sunday)
    '''
    parsed_date = pd.to_datetime(row['fecha'], dayfirst=True).date()
    row['fecha'] = parsed_date
    row['tiempo'] = pd.to_datetime(row['tiempo']).time()
    row['ocupacion'] = int(row['ocupacion'])
    row['dia de la semana'] = parsed_date.weekday()
    return row

print('Working on dataset2019...')

# Convert every record with correct_type, then order the frame chronologically.
dataset2019 = (
    dataset2019
    .apply(correct_type, axis=1)
    .sort_values(by=['fecha', 'tiempo'])
)

# Calendar month of the date and hour of the time of day, as separate columns.
dataset2019['mes'] = dataset2019['fecha'].map(lambda day: day.month)
dataset2019['hora'] = dataset2019['tiempo'].map(lambda moment: moment.hour)

# Remove the progress message once the work is done.
clear_output()

Working on dataset2019...


In [None]:
# Independent (deep) copy of the frame as it stands at this point, so later
# cells can modify dataset2019 without touching this snapshot.
dataset2019_backup = dataset2019.copy(deep=True)

In [None]:
# Time of day expressed as whole seconds elapsed since midnight.
dataset2019['timestamp'] = dataset2019['tiempo'].map(
    lambda t: (t.hour * 60 + t.minute) * 60 + t.second
)

In [None]:
half_hour = 30 * 60  # length of one bucket, in seconds


def set_half_hour(ts):
    '''
    Round ``ts`` down to the start of the half-hour bucket that contains it.

    ``ts`` is a number of seconds; the result is the largest multiple of
    ``half_hour`` that does not exceed it.
    '''
    completed_buckets = math.floor(ts / half_hour)
    return completed_buckets * half_hour

# Start (in seconds since midnight) of the half-hour bucket of every record.
dataset2019['media hora'] = dataset2019['timestamp'].map(set_half_hour)

In [None]:
# Preview of the columns that feed the half-hour dataset, ordered by street,
# date and half-hour bucket. Nothing is assigned; the cell only displays it.
preview_columns = ['id_cuadra', 'direccion', 'fecha', 'mes', 'dia de la semana', 'media hora', 'ocupacion']
preview_order = ['id_cuadra', 'fecha', 'media hora']

(
    dataset2019
    .reset_index(drop=True)[preview_columns]
    .sort_values(by=preview_order)
)

# Creación de los dos datasets. 

### Dataset_hh posee los datos agrupados cada media hora

In [None]:
# One row per (street, date, half-hour bucket), holding the maximum of every
# other column within that bucket.
half_hour_keys = ['id_cuadra', 'fecha', 'media hora']
half_hour_columns = ['id_cuadra', 'direccion', 'fecha', 'mes', 'dia de la semana', 'media hora', 'ocupacion']

dataset_hh = (
    dataset2019
    .groupby(half_hour_keys)
    .max()
    .sort_values(by=half_hour_keys)
    .reset_index()[half_hour_columns]
)

In [None]:
# Build the supervised samples for the half-hour dataset.
#
# For every street and every day, each half-hour row is paired with the
# occupancy of the NEXT half-hour row of the same day ('target'). The last row
# of a day has no successor and is therefore dropped.
#
# The per-day pieces are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop is quadratic. Each piece keeps its own 0..n-2 index, so the resulting
# index repeats per day.
#
# NOTE: this cell overwrites dataset_hh, so it must be run exactly once after
# the aggregation cell; running it again would shift the targets a second time.
hh_columns = ['id_cuadra', 'direccion', 'fecha', 'mes', 'dia de la semana', 'media hora', 'ocupacion']
daily_samples = []

# groupby iterates the keys in sorted order and keeps the row order inside
# each group, so rows come out ordered by street, then date, then bucket.
for s, dataset_hh_s in dataset_hh.groupby('id_cuadra', sort=True):
    clear_output()
    print('Street:', s)

    for f, dataset_hh_f in dataset_hh_s.groupby('fecha', sort=True):
        dataset_hh_f = dataset_hh_f.reset_index(drop=True)

        # A day with a single bucket has no "next" value to predict.
        if len(dataset_hh_f) < 2:
            continue

        to_append = dataset_hh_f.iloc[:-1].reset_index(drop=True)
        # Occupancy of the following bucket, aligned by position.
        to_append['target'] = dataset_hh_f['ocupacion'].iloc[1:].reset_index(drop=True)
        daily_samples.append(to_append)

if daily_samples:
    out = pd.concat(daily_samples)
else:
    # No usable day at all: keep the expected schema with zero rows.
    out = pd.DataFrame(columns=hh_columns + ['target'])

dataset_hh = out

In [None]:
# Show the first 50 samples of the half-hour dataset.
dataset_hh.head(50)

In [None]:
# Write the half-hour dataset to disk (the index goes out as the first column).
dataset_hh_csv_path = r'C:\Users\ing_l\Tesis grado\Data\Predictors data\Dataset2019_half_hour_max.csv'
dataset_hh.to_csv(dataset_hh_csv_path)

### Dataset_hour posee los datos agrupados por hora.

In [None]:
# One row per (street, date, hour) with the mean of the numeric columns.
#
# The value columns are selected BEFORE aggregating: dataset2019 still holds
# non-numeric columns (address, time of day, ...), and from pandas 2.0 on
# GroupBy.mean() raises a TypeError on those instead of silently dropping them.
hour_keys = ['id_cuadra', 'fecha', 'hora']
hour_values = ['ocupacion', 'dia de la semana', 'mes']

dataset_hour = (
    dataset2019
    .groupby(hour_keys)[hour_values]
    .mean()
    .sort_values(by=hour_keys)
    .reset_index()[hour_keys + hour_values]
)

In [None]:
# Build the supervised samples for the hourly dataset.
#
# For every street and every day, each hourly row except the first and the
# last is kept and extended with:
#   * 'ocupacion anterior' - occupancy of the previous hourly row of that day
#   * 'targets'            - occupancy of the next hourly row of that day
#
# The per-day pieces are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop is quadratic. Each piece keeps its own 0..n-3 index, so the resulting
# index repeats per day.
#
# NOTE: this cell overwrites dataset_hour, so it must be run exactly once after
# the aggregation cell.

# Column order of the result. 'targets' comes before 'ocupacion anterior'
# because that is the order the original append-based loop produced.
hour_columns = ['id_cuadra', 'fecha', 'hora', 'ocupacion', 'dia de la semana', 'mes', 'targets', 'ocupacion anterior']
daily_samples = []

# groupby iterates the keys in sorted order and keeps the row order inside
# each group, so rows come out ordered by street, then date, then hour.
for s, dataset_hour_s in dataset_hour.groupby('id_cuadra', sort=True):
    clear_output()
    print('Street:', s)

    for f, dataset_hour_f in dataset_hour_s.groupby('fecha', sort=True):
        dataset_hour_f = dataset_hour_f.reset_index(drop=True)

        # At least three hourly rows are needed: previous, current and next.
        if len(dataset_hour_f) < 3:
            continue

        to_append = dataset_hour_f.iloc[1:-1].reset_index(drop=True)
        # Neighbouring occupancies, aligned by position with the kept rows.
        to_append['ocupacion anterior'] = dataset_hour_f['ocupacion'].iloc[:-2].reset_index(drop=True)
        to_append['targets'] = dataset_hour_f['ocupacion'].iloc[2:].reset_index(drop=True)
        daily_samples.append(to_append)

if daily_samples:
    out = pd.concat(daily_samples)[hour_columns]
else:
    # No usable day at all: keep the expected schema with zero rows.
    out = pd.DataFrame(columns=hour_columns)

dataset_hour = out

In [None]:
# Show the first 50 samples of the hourly dataset.
dataset_hour.head(50)

In [None]:
# Write the hourly dataset to disk (the index goes out as the first column).
dataset_hour_csv_path = r'C:\Users\ing_l\Tesis grado\Data\Predictors data\Dataset2019_hour_mean.csv'
dataset_hour.to_csv(dataset_hour_csv_path)