In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os.path
from datetime import datetime
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose
import folium

In [2]:
# Yearly trip samples (JSON). They are only read later, by the analysis functions,
# so here we just verify that they exist.
list_tripFiles = ['dataFiles/data-sample_data-nyctaxi-trips-2009-json_corrigido.json',
                  'dataFiles/data-sample_data-nyctaxi-trips-2010-json_corrigido.json',
                  'dataFiles/data-sample_data-nyctaxi-trips-2011-json_corrigido.json',
                  'dataFiles/data-sample_data-nyctaxi-trips-2012-json_corrigido.json']

# Lookup tables (CSV) loaded eagerly into DataFrames.
vendorsFile = 'dataFiles/data-vendor_lookup-csv.csv'
paymentModes = 'dataFiles/data-payment_lookup-csv.csv'

try:
    # os.path.isfile only returns a bool, so its result has to be checked;
    # otherwise a missing trip file goes unnoticed until a later cell reads it.
    missing_files = [file for file in list_tripFiles if not os.path.isfile(file)]
    if missing_files:
        raise FileNotFoundError(f"Arquivos não encontrados: {missing_files}")

    dfDataVendors = pd.read_csv(vendorsFile)
    dfPaymentModes = pd.read_csv(paymentModes)

# OSError covers missing/unreadable files; ValueError covers pandas parsing
# errors (ParserError and EmptyDataError derive from it). Anything else
# (e.g. KeyboardInterrupt) is no longer swallowed.
except (OSError, ValueError) as error:
    print("Não foi possível abrir algum dos arquivos. Verifique se TODOS estão na pasta correta ['/dataFiles']!")
    print(f"Detalhe: {error}")

    

In [3]:
def createBigDF(list_files):
    """Read every JSON file in ``list_files`` and stack the rows into one DataFrame.

    Parameters
    ----------
    list_files : iterable
        Paths (or any other input accepted by ``pd.read_json``), one per file.

    Returns
    -------
    pandas.DataFrame
        The rows of all files, in the order the files were given, with a fresh
        0..n-1 index. An empty DataFrame is returned when ``list_files`` is empty.
    """
    # Read everything first and concatenate once: DataFrame.append was removed
    # in pandas 2.0, and appending inside the loop re-copied the accumulated
    # frame on every iteration.
    frames = [pd.read_json(file) for file in list_files]

    # pd.concat raises on an empty list; keep the old "empty in, empty out" behaviour.
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)




In [4]:
def meanDistance(list_DataTrips):
    """Plot the mean trip distance for all trips next to that of small parties.

    The files in ``list_DataTrips`` are loaded through ``createBigDF``. Two bars
    are drawn: the mean of ``trip_distance`` over every trip, and the mean over
    trips whose ``passenger_count`` is at most 2. Both means are rounded to two
    decimals and written next to their bars. Nothing is returned; the figure is
    shown with ``plt.show()``.
    """
    all_trips = createBigDF(list_DataTrips)

    # Subset with at most two passengers.
    small_party_trips = all_trips[all_trips['passenger_count'] <= 2]

    mean_all = round(all_trips['trip_distance'].mean(), 2)
    mean_small = round(small_party_trips['trip_distance'].mean(), 2)

    bar_labels = np.array(["Qualquer Nº de Passageiros", "2 ou menos Passageiros"])
    bar_heights = np.array([mean_all, mean_small])

    plt.xlabel("Número de Passageiros")
    plt.ylabel("Distância Média da Corrida")

    # Value labels, placed slightly left of each bar's centre (bars sit at x=0 and x=1).
    for x_position, height in zip((-0.05, 0.95), bar_heights):
        plt.annotate(height, (x_position, height))

    plt.bar(bar_labels, bar_heights, color='blue', alpha=0.75, width=0.5)

    plt.show()
    
    

In [None]:
# Bar chart: mean trip distance for all trips vs. trips with at most two passengers.
meanDistance(list_tripFiles)

In [5]:
def bigVendors(dfVendors, list_DataTrips):
    """Plot total and fare-only revenue for the three highest-earning vendors.

    Parameters
    ----------
    dfVendors : pandas.DataFrame
        Vendor lookup table; the columns ``vendor_id`` and ``name`` are used.
    list_DataTrips : list
        Trip files, loaded through ``createBigDF``. The columns ``vendor_id``,
        ``total_amount``, ``fare_amount`` and ``surcharge`` are used.

    Returns
    -------
    None
        A grouped bar chart is shown with ``plt.show()``.
    """
    dfTrips = createBigDF(list_DataTrips)

    # One pass over the trips instead of one query per vendor. Grouping also
    # avoids building query strings from vendor ids (which broke on ids
    # containing quotes) and positional-looking label lookups on dfVendors.
    total_by_vendor = dfTrips.groupby('vendor_id')['total_amount'].sum()

    # Fare revenue of a trip = fare + surcharge. A trip with a missing value in
    # either column contributes nothing to the sum.
    fare_per_trip = dfTrips['fare_amount'] + dfTrips['surcharge']
    fare_by_vendor = fare_per_trip.groupby(dfTrips['vendor_id']).sum()

    def _per_vendor(sums):
        # Align the sums to the vendor table; vendors without trips get 0.
        # Round to cents, then truncate to whole units as before.
        return dfVendors['vendor_id'].map(sums).fillna(0).round(2).astype(int)

    df_big_vendors = pd.DataFrame({
        'Alias': dfVendors['vendor_id'],
        'Name': dfVendors['name'],
        'Amount': _per_vendor(total_by_vendor),
        'Fare_Amount': _per_vendor(fare_by_vendor),
    })
    df_big_vendors = df_big_vendors.nlargest(3, 'Amount')

    x = np.arange(len(df_big_vendors['Name']))  # the label locations
    width = 0.35  # the width of the bars

    fig, ax = plt.subplots()
    # Two bars per vendor, side by side around the tick position.
    ax.bar(x - width/2, df_big_vendors['Amount'], width, label='Faturamento Total')
    ax.bar(x + width/2, df_big_vendors['Fare_Amount'], width, label='Somente Tarifas')

    ax.set_ylabel('Faturamento')
    ax.set_title('Maiores Companhias em Faturamento')
    ax.set_xticks(x)
    ax.set_xticklabels(df_big_vendors['Name'], rotation=30)
    ax.legend()

    fig.tight_layout()
    plt.show()



In [None]:
# Grouped bar chart of the three vendors with the largest summed total_amount.
bigVendors(dfDataVendors, list_tripFiles)

In [6]:
def histogramCash(list_DataTrips):
    """Histogram of the number of cash-paid trips per calendar month.

    The files in ``list_DataTrips`` are loaded through ``createBigDF``. Trips
    whose ``payment_type`` is "cash" (any capitalisation) are counted per month
    of ``pickup_datetime`` (1..12, all years pooled), and the twelve counts are
    shown as a 20-bin histogram. Nothing is returned.
    """
    dfDataTrips = createBigDF(list_DataTrips)
    pickup_times = pd.to_datetime(dfDataTrips['pickup_datetime'])

    # Case-insensitive match so every spelling ("CASH", "Cash", "cash", ...)
    # is counted, not only the two that were listed explicitly.
    is_cash = dfDataTrips['payment_type'].astype(str).str.upper() == 'CASH'

    # Month number of every cash trip; rows without a valid pickup time are dropped.
    cash_months = pickup_times[is_cash].dt.month.dropna().astype(int)

    # One count per month, with explicit zeros for months that have no cash trip.
    trips_per_month = cash_months.value_counts().reindex(range(1, 13), fill_value=0)
    y = trips_per_month.to_numpy()

    # The plotted values are trip counts, not money, so the labels say so.
    plt.ylabel("Quantidade de Meses")
    plt.xlabel("Total de Corridas Pagas em Dinheiro")
    plt.title('Histograma de Corridas Mensais Pagas em Dinheiro')
    plt.hist(y, 20, facecolor='g', alpha=0.75)
    plt.xticks(rotation=45)
    plt.show()
    
    


In [None]:
# Histogram of the per-month counts of cash-paid trips across all trip files.
histogramCash(list_tripFiles)

In [7]:
def timeSeries(dfDataTrips2012):
    """Plot the daily number of tipped trips for October-December and its decomposition.

    Parameters
    ----------
    dfDataTrips2012 : pandas.DataFrame
        Trips with the columns ``pickup_datetime`` and ``tip_amount``. The frame
        is not modified.

    Returns
    -------
    None
        Four figures are shown: the daily series, then the trend, seasonal and
        residual components returned by ``seasonal_decompose``.
    """
    # Work on a converted copy of the column instead of overwriting the caller's data.
    pickup = pd.to_datetime(dfDataTrips2012['pickup_datetime'])

    # Pickup times of trips from month 10 onwards that received a tip.
    tipped_pickups = pickup[(pickup.dt.month > 9) & (dfDataTrips2012['tip_amount'] > 0)]

    # Real calendar days only. The previous 1..31 loop for every month added a
    # nonexistent "11-31" entry, which showed up as a fake zero in the series.
    # October-December have the same lengths in every year, so the year used to
    # enumerate the days does not matter.
    calendar_days = pd.date_range('2012-10-01', '2012-12-31', freq='D')
    day_labels = [f"{day.month}-{day.day}" for day in calendar_days]

    # Same "month-day" label for every tipped trip, counted in a single pass;
    # days without any tipped trip get an explicit 0.
    trip_labels = tipped_pickups.dt.month.astype(str) + "-" + tipped_pickups.dt.day.astype(str)
    tips_per_day = trip_labels.value_counts().reindex(day_labels, fill_value=0)

    df_diary_tips = pd.DataFrame(
        {"Gorgetas": tips_per_day.to_numpy()},
        index=pd.Index(day_labels, name='Dias'),
    )

    # NOTE(review): period=12 on a daily series looks carried over from a
    # monthly example; a weekly cycle would be period=7. Kept as is - confirm.
    resultado = seasonal_decompose(df_diary_tips["Gorgetas"], period=12, extrapolate_trend='freq')

    df_diary_tips.plot()
    plt.xlabel('Meses-Dias')
    plt.xticks(rotation=45)
    plt.ylabel('Quantidade de Gorgetas')
    plt.title('Série Temporal')
    plt.tight_layout()
    plt.show()

    # One figure per decomposition component: (title, series, line colour).
    components = (
        ('Tendência', resultado.trend, 'r'),
        ('Sazonalidade', resultado.seasonal, 'y'),
        ('Resíduo', resultado.resid, 'g'),
    )
    for title, series, color in components:
        fig, ax = plt.subplots()
        ax.plot(series, color=color)
        ax.set_xlabel('Meses-Dias')
        ax.set_xticks([])  # 92 day labels would be unreadable here
        ax.set_ylabel('Quantidade de Gorgetas')
        ax.set_title(title)
        plt.show()

    

In [None]:
# Load the fourth trip file (the 2012 sample) and plot its daily tipped-trip series.
dfDataTrips12 = pd.read_json(list_tripFiles[3])
timeSeries(dfDataTrips12)

In [8]:
def timeRunsWeekend(list_DataTrips):
    """Bar chart of the mean trip duration on weekdays vs. weekends, in minutes.

    The files in ``list_DataTrips`` are loaded through ``createBigDF``. A trip's
    duration is ``dropoff_datetime - pickup_datetime``; a trip counts as weekend
    when its pickup falls on Saturday or Sunday (``dayofweek >= 5``). Both means
    are rounded to two decimals and written next to their bars. Nothing is
    returned; the figure is shown with ``plt.show()``.
    """
    dfDataTrips = createBigDF(list_DataTrips)

    pickup = pd.to_datetime(dfDataTrips['pickup_datetime'])
    dropoff = pd.to_datetime(dfDataTrips['dropoff_datetime'])

    # Duration in decimal minutes. The previous version sliced str(Timedelta)
    # into "MM.SS", which is not a number of minutes (8 min 44 s became 8.44
    # instead of 8.73), ignored the hour/day part and depended on the exact
    # width of the text representation.
    duration_minutes = (dropoff - pickup).dt.total_seconds() / 60

    day_of_week = pickup.dt.dayofweek
    # Trips without a valid pickup time fall in neither group.
    meanWeek = round(duration_minutes[day_of_week < 5].mean(), 2)
    meanWeekend = round(duration_minutes[day_of_week >= 5].mean(), 2)

    x = np.array(["Dias de Semana", "Finais de Semana"])
    y = np.array([meanWeek, meanWeekend])

    plt.xlabel("Separação de Dias")
    plt.ylabel("Tempo Médio da Corrida (Minutos)")
    # Value labels, placed slightly left of each bar's centre (bars sit at x=0 and x=1).
    plt.annotate(y[0], (-0.05, y[0]))
    plt.annotate(y[1], (0.95, y[1]))
    plt.bar(x, y, width=0.5)

    plt.show()
    
    

In [None]:
# Bar chart comparing the mean trip duration on weekdays and on weekends.
timeRunsWeekend(list_tripFiles)

In [40]:
# Load the second trip file (the 2010 sample); the map cell below reads its coordinates.
dfDataTrips10 = pd.read_json(list_tripFiles[1])

In [42]:
# Interactive map with a red marker at each pickup and a green marker at each
# drop-off for the first trips of the 2010 sample.
mapa = folium.Map(location=[40.74295,-74.004114],zoom_start=11)

# The previous loop ran i = 0..1000 inclusive, i.e. it drew the first 1001 trips.
MAX_TRIPS_ON_MAP = 1001
coordinate_columns = ['pickup_latitude', 'pickup_longitude',
                      'dropoff_latitude', 'dropoff_longitude']

# Slicing (instead of indexing rows 0..1000 one by one) also works when the
# frame has fewer rows than the limit. Rows with a missing coordinate are
# dropped because folium rejects NaN locations.
trips_to_plot = dfDataTrips10.iloc[:MAX_TRIPS_ON_MAP][coordinate_columns].dropna()

for trip in trips_to_plot.itertuples():
    # The tooltip shows the row label of the trip in dfDataTrips10.
    folium.Marker(
            [trip.pickup_latitude, trip.pickup_longitude],
            popup='<i>Pickup Place</i>',
            tooltip=str(trip.Index),
            icon=folium.Icon(color='red')
            ).add_to(mapa)
    folium.Marker(
            [trip.dropoff_latitude, trip.dropoff_longitude],
            popup='<i>DropOff Place</i>',
            tooltip=str(trip.Index),
            icon=folium.Icon(color='green')
            ).add_to(mapa)

mapa