In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from datetime import datetime 
from IPython.display import display, HTML, clear_output
import gc
import seaborn as sns
from scipy.stats import normaltest, boxcox, probplot
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from random import shuffle
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import f1_score, balanced_accuracy_score
from statsmodels.tsa.stattools import adfuller

In [2]:
r'''
dataset2019: all sumo data with ocupation (requested in sumo office)
'''

dataset2019_path = r'C:\Users\ing_l\Tesis grado\Data\dataset_2019.csv'

In [3]:
dataset2019 = pd.read_csv(dataset2019_path)[['id_cuadra', 'direccion', 'fecha', 'tiempo', 'operacion', 'ocupacion', 'latitud', 'longitud']]
dataset2019['dia de la semana'] = -1
dataset2019.columns.names = ['dataset2019']

In [4]:
def correct_type(row):
    '''
    Corrects the dtype of fecha, tiempo and ocupación on the df
    '''
    row['fecha'] = pd.to_datetime(row['fecha'], dayfirst=True).date()
    row['tiempo'] = pd.to_datetime(row['tiempo']).time()
    row['ocupacion'] = int(row['ocupacion'])
    row['dia de la semana'] = row['fecha'].weekday()
    return row

print('Working on dataset2019...')
dataset2019 = dataset2019.apply(correct_type, axis=1).sort_values(by=['fecha', 'tiempo'])
dataset2019['mes'] = dataset2019['fecha'].apply(lambda x: x.month)
dataset2019['hora'] = dataset2019['tiempo'].apply(lambda x: x.hour)
clear_output()

In [5]:
dataset2019_backup = dataset2019.copy()

In [6]:
dataset2019['timestamp'] = dataset2019['tiempo'].apply(lambda x: x.hour * 60 * 60 + x.minute * 60 + x.second)

In [7]:
half_hour = 30 * 60

def set_half_hour(ts):
    global half_hour
    return (math.floor(ts / half_hour)) * half_hour

dataset2019['media hora'] = dataset2019['timestamp'].apply(set_half_hour)

In [8]:
dataset2019

dataset2019,id_cuadra,direccion,fecha,tiempo,operacion,ocupacion,latitud,longitud,dia de la semana,mes,hora,timestamp,media hora
0,16,Belgrano 660,2019-01-01,10:55:00,0,0,-37.328838,-59.134455,1,1,10,39300,37800
7,1,General Pinto 545,2019-01-01,15:12:00,1,1,-37.327782,-59.136657,1,1,15,54720,54000
9,65,Rodriguez 348,2019-01-01,15:13:00,1,1,-37.329387,-59.134833,1,1,15,54780,54000
8,1,General Pinto 545,2019-01-01,15:15:00,0,0,-37.327782,-59.136657,1,1,15,54900,54000
10,65,Rodriguez 348,2019-01-01,16:22:00,0,0,-37.329387,-59.134833,1,1,16,58920,57600
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2957257,18,Sarmiento 543,2019-12-31,19:45:00,1,4,-0.000000,-0.000000,1,12,19,71100,70200
2957258,18,Sarmiento 543,2019-12-31,19:46:00,0,3,-0.000000,-0.000000,1,12,19,71160,70200
2956381,24,Belgrano 364,2019-12-31,19:53:00,1,1,-37.330077,-59.138438,1,12,19,71580,70200
2956511,60,Chacabuco 357,2019-12-31,20:08:00,0,0,-37.330203,-59.137003,1,12,20,72480,72000


# Creación de los dos datasets. 

### Dataset_hh posee los datos agrupados cada media hora

In [33]:
dataset_hh = dataset2019.groupby(['id_cuadra','fecha','media hora']).mean()
dataset_hh = dataset_hh.sort_values(by=['id_cuadra','fecha','media hora']).reset_index()[['id_cuadra', 'fecha', 'media hora', 'ocupacion', 'dia de la semana', 'mes']]

In [34]:
out = pd.DataFrame(columns=['id_cuadra', 'fecha', 'media hora', 'ocupacion', 'ocupacion anterior', 'dia de la semana', 'mes', 'targets'])

for s in dataset_hh['id_cuadra'].drop_duplicates().sort_values():
    clear_output()
    print('Street:', s)
    dataset_hh_s = dataset_hh.loc[dataset_hh['id_cuadra'] == s].reset_index(drop=True)
    
    for f in dataset_hh_s['fecha'].drop_duplicates().sort_values():
        
        to_append = pd.DataFrame(columns=['id_cuadra', 'fecha', 'media hora', 'ocupacion', 'ocupacion anterior', 'dia de la semana', 'mes', 'targets'])

        dataset_hh_f = dataset_hh_s.loc[dataset_hh_s['fecha'] == f].reset_index(drop=True)

        to_append = dataset_hh_f[1:-1].reset_index(drop=True)
        
        to_append['ocupacion anterior'] = dataset_hh_f['ocupacion'][:-2].reset_index(drop=True)
        to_append['targets'] = dataset_hh_f['ocupacion'][2:].reset_index(drop=True)
        out = out.append(to_append)

dataset_hh = out

Street: 190


In [35]:
dataset_hh[0:50]

Unnamed: 0,dia de la semana,fecha,id_cuadra,media hora,mes,ocupacion,ocupacion anterior,targets
0,2.0,2019-01-02,0,36000,1.0,1.0,2.0,1.0
1,2.0,2019-01-02,0,37800,1.0,1.0,1.0,2.75
2,2.0,2019-01-02,0,39600,1.0,2.75,1.0,2.5
3,2.0,2019-01-02,0,41400,1.0,2.5,2.75,0.714286
4,2.0,2019-01-02,0,43200,1.0,0.714286,2.5,1.0
5,2.0,2019-01-02,0,45000,1.0,1.0,0.714286,2.7
6,2.0,2019-01-02,0,46800,1.0,2.7,1.0,2.166667
7,2.0,2019-01-02,0,48600,1.0,2.166667,2.7,3.9
8,2.0,2019-01-02,0,50400,1.0,3.9,2.166667,3.833333
9,2.0,2019-01-02,0,52200,1.0,3.833333,3.9,2.333333


In [36]:
dataset_hh.to_csv(r'C:\Users\ing_l\Tesis grado\Data\Predictors data\Dataset2019_half_hour_mean.csv')

### Dataset_hour posee los datos agrupados por hora.

In [49]:
dataset_hour = dataset2019.groupby(['id_cuadra','fecha','hora']).mean()
dataset_hour = dataset_hour.sort_values(by=['id_cuadra','fecha','hora']).reset_index()[['id_cuadra', 'fecha', 'hora', 'ocupacion', 'dia de la semana', 'mes']]

In [50]:
out = pd.DataFrame(columns=['id_cuadra', 'fecha', 'hora', 'ocupacion', 'dia de la semana', 'mes', 'targets'])

for s in dataset_hour['id_cuadra'].drop_duplicates().sort_values():
    clear_output()
    print('Street:', s)
    dataset_hour_s = dataset_hour.loc[dataset_hour['id_cuadra'] == s].reset_index(drop=True)
    
    for f in dataset_hour_s['fecha'].drop_duplicates().sort_values():
        to_append = pd.DataFrame(columns=['id_cuadra', 'fecha', 'hora', 'ocupacion', 'ocupacion anterior', 'dia de la semana', 'mes', 'targets'])

        dataset_hour_f = dataset_hour_s.loc[dataset_hour_s['fecha'] == f].reset_index(drop=True)

        to_append = dataset_hour_f[1:-1].reset_index(drop=True)
        
        to_append['ocupacion anterior'] = dataset_hour_f['ocupacion'][:-2].reset_index(drop=True)
        to_append['targets'] = dataset_hour_f['ocupacion'][2:].reset_index(drop=True)
        out = out.append(to_append)
dataset_hour = out

Street: 190


In [51]:
dataset_hour[0:50]

Unnamed: 0,dia de la semana,fecha,hora,id_cuadra,mes,ocupacion,ocupacion anterior,targets
0,2.0,2019-01-02,10,0,1.0,1.0,2.0,2.625
1,2.0,2019-01-02,11,0,1.0,2.625,1.0,0.857143
2,2.0,2019-01-02,12,0,1.0,0.857143,2.625,2.5
3,2.0,2019-01-02,13,0,1.0,2.5,0.857143,3.863636
4,2.0,2019-01-02,14,0,1.0,3.863636,2.5,2.0
5,2.0,2019-01-02,15,0,1.0,2.0,3.863636,4.0
6,2.0,2019-01-02,16,0,1.0,4.0,2.0,3.5
7,2.0,2019-01-02,17,0,1.0,3.5,4.0,4.0
8,2.0,2019-01-02,18,0,1.0,4.0,3.5,7.230769
9,2.0,2019-01-02,19,0,1.0,7.230769,4.0,1.2


In [52]:
dataset_hour.to_csv(r'C:\Users\ing_l\Tesis grado\Data\Predictors data\Dataset2019_hour_mean.csv')