In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from datetime import datetime 
from IPython.display import display, HTML, clear_output
import gc
import seaborn as sns
from scipy.stats import normaltest, boxcox, probplot
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from random import shuffle
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import f1_score, balanced_accuracy_score
from statsmodels.tsa.stattools import adfuller

In [2]:
# dataset2019: all sumo data including occupancy (requested at the sumo office).
# NOTE(review): this is an absolute path on a local machine; the notebook only
# runs where this exact file exists.

dataset2019_path = r'C:\Users\ing_l\Tesis grado\Data\dataset_2019.csv'

In [3]:
# Columns of the raw CSV that the rest of the notebook uses.
dataset2019_columns = ['id_cuadra', 'direccion', 'fecha', 'tiempo', 'operacion', 'ocupacion', 'latitud', 'longitud']

# usecols makes the parser skip every other column instead of loading the whole
# file and discarding most of it afterwards. The explicit selection is kept
# because usecols alone returns the columns in file order, not in list order.
dataset2019 = pd.read_csv(dataset2019_path, usecols=dataset2019_columns)[dataset2019_columns]

# Placeholder; the real weekday is filled in by correct_type.
dataset2019['dia de la semana'] = -1

# Label the column axis so the frame is identifiable when displayed.
dataset2019.columns.names = ['dataset2019']

In [4]:
def correct_type(row):
    '''
    Convert the raw fields of a single record, modifying the row in place.

    fecha is parsed day-first and stored as a date, tiempo is stored as a
    time, ocupacion is cast to int, and dia de la semana is set to the
    weekday of fecha (0 = Monday, as returned by date.weekday()).

    Returns the same row object that was passed in.
    '''
    fecha = pd.to_datetime(row['fecha'], dayfirst=True).date()
    row['fecha'] = fecha
    row['tiempo'] = pd.to_datetime(row['tiempo']).time()
    row['ocupacion'] = int(row['ocupacion'])
    row['dia de la semana'] = fecha.weekday()
    return row

print('Working on dataset2019...')

# Convert every record with correct_type, then order the records
# chronologically (by date first, then by time of day).
# NOTE(review): apply(axis=1) calls correct_type once per row, which is slow on
# a frame this size; a vectorized conversion may be worth evaluating.
dataset2019 = (
    dataset2019
    .apply(correct_type, axis=1)
    .sort_values(by=['fecha', 'tiempo'])
)

# Calendar month of the date and hour of the time, as separate columns.
dataset2019['mes'] = dataset2019['fecha'].map(lambda fecha: fecha.month)
dataset2019['hora'] = dataset2019['tiempo'].map(lambda tiempo: tiempo.hour)

# Remove the progress message once the work is done.
clear_output()

In [5]:
# Independent (deep) copy of the typed and sorted frame, taken before any
# further columns are added to dataset2019.
dataset2019_backup = dataset2019.copy(deep=True)

In [6]:
def _seconds_since_midnight(moment):
    '''Return the number of whole seconds between 00:00:00 and the given time.'''
    return (moment.hour * 60 + moment.minute) * 60 + moment.second


# Despite its name, 'timestamp' is the time of day expressed in seconds since
# midnight, not an absolute point in time.
dataset2019['timestamp'] = dataset2019['tiempo'].map(_seconds_since_midnight)

In [7]:
# Show the typed and sorted frame, now including the 'timestamp' column
# (the rendered output is a truncated view of the first and last rows).
dataset2019

dataset2019,id_cuadra,direccion,fecha,tiempo,operacion,ocupacion,latitud,longitud,dia de la semana,mes,hora,timestamp
0,16,Belgrano 660,2019-01-01,10:55:00,0,0,-37.328838,-59.134455,1,1,10,39300
7,1,General Pinto 545,2019-01-01,15:12:00,1,1,-37.327782,-59.136657,1,1,15,54720
9,65,Rodriguez 348,2019-01-01,15:13:00,1,1,-37.329387,-59.134833,1,1,15,54780
8,1,General Pinto 545,2019-01-01,15:15:00,0,0,-37.327782,-59.136657,1,1,15,54900
10,65,Rodriguez 348,2019-01-01,16:22:00,0,0,-37.329387,-59.134833,1,1,16,58920
...,...,...,...,...,...,...,...,...,...,...,...,...
2957257,18,Sarmiento 543,2019-12-31,19:45:00,1,4,-0.000000,-0.000000,1,12,19,71100
2957258,18,Sarmiento 543,2019-12-31,19:46:00,0,3,-0.000000,-0.000000,1,12,19,71160
2956381,24,Belgrano 364,2019-12-31,19:53:00,1,1,-37.330077,-59.138438,1,12,19,71580
2956511,60,Chacabuco 357,2019-12-31,20:08:00,0,0,-37.330203,-59.137003,1,12,20,72480


In [9]:
# Build the classifier dataset from consecutive records of the same street
# (id_cuadra) on the same day. For every record that has both a previous and a
# next record on that street and day, one output row is produced:
#   - 'id_cuadra', 'timestamp', 'ocupacion', 'dia de la semana', 'mes'
#         come from the record itself
#   - 'ocupacion anterior' is the 'ocupacion' of the previous record
#   - 'a pred timestamp'   is the 'timestamp' of the next record
#   - 'targets'            is the 'operacion' of the next record
# A street/day with fewer than three records contributes no rows.
out_columns = ['id_cuadra', 'timestamp', 'a pred timestamp', 'ocupacion', 'ocupacion anterior', 'dia de la semana', 'mes', 'targets']

# Per-day frames are collected in a list and concatenated once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# copied the whole accumulated frame on every iteration.
daily_frames = []

# groupby visits the streets in ascending id order and hands over each street's
# rows directly, instead of re-filtering the full frame once per street.
for s, dataset_street in dataset2019.groupby('id_cuadra', sort=True):
    clear_output()
    print('Cuadra:', s)

    # sort=False keeps the dates in order of first appearance and the rows of
    # each date in their existing (chronological) order.
    for d, dataset_date in dataset_street.groupby('fecha', sort=False):
        if len(dataset_date) < 3:
            # No record has both a predecessor and a successor.
            continue

        previous_rows = dataset_date.iloc[:-2]
        current_rows = dataset_date.iloc[1:-1]
        next_rows = dataset_date.iloc[2:]

        # The three slices have the same length; .values drops the original
        # index so the columns line up by position.
        to_append = pd.DataFrame({
            'id_cuadra': current_rows['id_cuadra'].values,
            'timestamp': current_rows['timestamp'].values,
            'a pred timestamp': next_rows['timestamp'].values,
            'ocupacion': current_rows['ocupacion'].values,
            'ocupacion anterior': previous_rows['ocupacion'].values,
            'dia de la semana': current_rows['dia de la semana'].values,
            'mes': current_rows['mes'].values,
            'targets': next_rows['operacion'].values,
        }, columns=out_columns)

        daily_frames.append(to_append)

# The index is intentionally not reset: each day's rows keep their own 0..k
# numbering, as before. pd.concat raises on an empty list, so the empty case
# falls back to an empty frame with the expected columns.
if daily_frames:
    out = pd.concat(daily_frames)
else:
    out = pd.DataFrame(columns=out_columns)

out

Cuadra: 190


Unnamed: 0,a pred timestamp,dia de la semana,id_cuadra,mes,ocupacion,ocupacion anterior,targets,timestamp
0,34860,2,0,1,2,1,1,34740
1,35100,2,0,1,3,2,0,34860
2,35100,2,0,1,2,3,0,35100
3,35700,2,0,1,1,2,1,35100
4,35940,2,0,1,2,1,1,35700
...,...,...,...,...,...,...,...,...
3,42060,1,190,11,1,0,0,42060
4,43800,1,190,11,0,1,1,42060
5,43860,1,190,11,1,0,0,43800
0,37260,1,190,11,1,0,0,37260


In [13]:
# Destination of the classifier dataset. The frame's index is written as the
# first (unnamed) column of the CSV.
# NOTE(review): absolute path on a local machine, like the input path.
classifier_dataset_path = r'C:\Users\ing_l\Tesis grado\Data\Dataset2019_classifier.csv'
out.to_csv(classifier_dataset_path)