In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import datetime 
from datetime import timedelta
import os
from constants import *
from trip_file import TripFile
from trip import Trip


### Read all files

In [118]:
taarif = pd.read_csv("../files/taarif.csv")
taarif = taarif.drop(0).reset_index().drop(columns=["index"]) # first row is nulls
new_drivers = pd.read_csv("../files/new_drivers.csv", index_col=0)
drivers_with_kviut = pd.read_csv("../files/drivers_with_kviut.csv",index_col=0)

In [119]:


def preprocess_drivers(drivers: pd.DataFrame, copy=False) -> pd.DataFrame:
    df = drivers
    if copy:
        df = drivers.copy()

    gender_mapping = {"F": FEMALE,
                    "M":MALE,
                    "m":MALE,
                    "male":MALE,
                    "boy":MALE,
                    "unknown":UNKOWN,
                    'woman':FEMALE,
                    'girl':FEMALE,
                    'none':UNKOWN,
                    'female':FEMALE}

    # Make preprocess
    df.gender.fillna(UNKOWN, inplace=True)
    df.gender = df.gender.apply(lambda x: gender_mapping[x])
    df.birthdate = pd.to_datetime(df.birthdate, format="mixed")
    drivers.birthdate.fillna(datetime.datetime.now(),inplace=True)

    return df

def preprocess_taarif(taarif_df: pd.DataFrame, copy=False) -> pd.DataFrame:
        df = taarif_df
        if copy:
            df = new_drivers.copy()

        return df


# Concat the two tables of drivers with and without kviut
def concat_drivers_tables(new_drivers, drivers_with_kviut):
    drivers_with_kviut["kviut"] = 1
    drivers = pd.concat([new_drivers, drivers_with_kviut])
    drivers["kviut"].fillna(0, inplace=True)
    return drivers
    

In [None]:
drivers = concat_drivers_tables(new_drivers, drivers_with_kviut)
drivers = preprocess_drivers(drivers, copy=True)

In [6]:
from tqdm import tqdm
def get_trip_files(folder, limit=100):
    files = np.array(list(os.listdir(folder)))
    if limit:
        files = files[:limit]
    files = np.array([name if not name[-5] == ")" else name[:-7]+".csv" for name in files])
    print("Processing files....")
    return np.array([TripFile(os.path.join(folder,file)) for file in tqdm(files)])


def is_friday(time):
    return time.weekday() == 4
def is_after_4pm(time):
    return time.hour >=16
def is_saturday(time):
    return time.weekday() == 5
def is_before_8pm(time):
    return time.hour < 20 or (time.hour == 20 and time.minute == 0 and time.second == 0) 

def is_after_hour(time, hour):
    return time.hour >= hour
def is_before_hour(time, hour):
    return time.hour < hour or (time.hour == hour and time.minute == 0 and time.second == 0) 

def count_time_in_weekend(start_time, end_time):
    count = 0
    current_time = start_time

    while current_time < end_time:
        next_time = min(end_time, current_time + datetime.timedelta(hours=1))
        if is_friday(next_time):
            if is_after_hour(next_time, 16):
                friday_4pm = datetime.datetime(next_time.year, next_time.month, next_time.day, 16, 0)
                count += (next_time - max(current_time, friday_4pm)) / timedelta(hours=1)
        elif is_saturday(next_time):
            if is_before_hour(current_time, 20) or is_friday(current_time): # when next time jumps to midnight the current time is still on friday
                saturday_8pm= datetime.datetime(next_time.year, next_time.month, next_time.day, 20, 0)
                count += (min(next_time, saturday_8pm) - current_time) / timedelta(hours=1)
        
        current_time = current_time + datetime.timedelta(hours=1)

    return count

def count_time_at_night(start_time, end_time):
    count = 0
    current_time = start_time

    while current_time < end_time:
        next_time = min(end_time, current_time + datetime.timedelta(hours=1))
        if is_after_hour(next_time, 22):
                ten_pm = datetime.datetime(next_time.year, next_time.month, next_time.day, 22, 0)
                count += (next_time - max(current_time, ten_pm)) / timedelta(hours=1)
        elif is_before_hour(current_time, 6) or is_after_hour(current_time, 22):
                six_am = datetime.datetime(next_time.year, next_time.month, next_time.day, 6, 0)
                count += (min(next_time, six_am) - current_time) / timedelta(hours=1)
        
        current_time = current_time + datetime.timedelta(hours=1)
    return count

In [7]:

# Calculate age function
def calculate_age(birth_date):
    current_date = datetime.datetime.now()
    if pd.isnull(birth_date):
        return np.nan
    age = current_date.year - birth_date.year - ((current_date.month, current_date.day) < (birth_date.month, birth_date.day))
    return age


def calculate_trips_cost(trips_with_fares):
    
    df = trips_with_fares.copy()
    
    weekend_hours = df.apply(lambda r: count_time_in_weekend(r.start_time, r.end_time), axis=1)
    night_hours = df.apply(lambda r: count_time_at_night(r.start_time, r.end_time), axis=1)

    is_weekend = weekend_hours.apply(lambda x: x > 0)
    is_night = night_hours.apply(lambda x: x > 0)

    km_per_hour = df["km"] / ((df["start_time"]-df["end_time"]) / timedelta(hour=1))
    

    basic_pay = df.km * df.basic_taarif
    extra_pay = df.km.apply(lambda km: max(0, km - 200)) * df.extra_milage
    payment_without_bonus = basic_pay + extra_pay

    # bonus_precentage = ((is_weekend.apply(int) * df.weekend_bonus) + (is_night.apply(int) * df.night_bonus))
    bonus_precentage = 0
    payment_with_bonus = payment_without_bonus * (1 + bonus_precentage/100) 

    df["basic_pay"] = basic_pay
    df["extra_pay"] = extra_pay
    df["weekend_hours"] = weekend_hours
    df["night_hours"] = night_hours
    df["kph"] = km_per_hour
    df["weekend_km"] = weekend_hours * km_per_hour
    # df["night_km"] =  
    df["bonus_precentage"] = bonus_precentage
    df["payment_without_bonus"] = payment_without_bonus
    df["payment_with_bonus"] = payment_with_bonus
    return df


def create_main_table():
    cum_table = pd.DataFrame(columns=["driver_id","month","year","total_income","total_km"])
    trip_files = get_trip_files("../files/trips_data", limit=1)
    print("Processing income of drivers...")
    for trip_file in tqdm(trip_files):
        
        trip_df = trip_file.get_data_frame()
        # print(trip_df)
        trips_with_fares = pd.merge(trip_df, taarif, on=["customer"],how="left")
        df = calculate_trips_cost(trips_with_fares)
        df["month"] = trip_file.get_month()
        df["year"] = trip_file.get_year()

        # print(df)
        subset_columns = ["driver_id", "month", "year", "payment_with_bonus", "km"]
        df_to_concat = df[subset_columns]
        new_column_names = {'payment_with_bonus': 'total_income', 'km': 'total_km'}
        df_to_concat = df_to_concat.rename(columns=new_column_names)
        
        cum_table = pd.concat([cum_table, df_to_concat], axis=0, join="outer")
        cum_table = cum_table.reset_index().drop(columns=["index"])
        cum_table = cum_table.groupby(["driver_id","month","year"]).sum().reset_index()

        drivers_copy = drivers.copy()
        drivers_copy["age"] = drivers.birthdate.apply(calculate_age)
        drivers_copy = drivers_copy.rename(columns={"id":"driver_id"})
        
    cum_table = pd.merge(cum_table, drivers_copy[["driver_id","gender","age","vetek","kviut"]], on=["driver_id"], how="left")

    return cum_table

table = create_main_table()

table[table["driver_id"] == 1]

Processing files....


100%|██████████| 1/1 [00:00<00:00, 29.97it/s]


Processing income of drivers...


  0%|          | 0/1 [00:00<?, ?it/s]


NameError: name 'taarif' is not defined

In [38]:
def count_time_at_night_and_weekend(start_time, end_time):
    count = 0
    current_time = start_time

    while current_time < end_time:
        next_time = min(end_time, current_time + datetime.timedelta(hours=1))
        if is_after_hour(next_time, 22):
            if is_friday(next_time):
                ten_pm = datetime.datetime(next_time.year, next_time.month, next_time.day, 22, 0)
                count += (next_time - max(current_time, ten_pm)) / timedelta(hours=1)
        elif (is_before_hour(current_time, 6) or is_after_hour(current_time, 22)):
            if is_saturday(next_time):
                six_am = datetime.datetime(next_time.year, next_time.month, next_time.day, 6, 0)
                count += (min(next_time, six_am) - current_time) / timedelta(hours=1)
        
        current_time = current_time + datetime.timedelta(hours=1)
    return count


# data = pd.DataFrame({
#     'start_time': ['2024-05-03 23:15:00', '2024-05-04 17:00:00', '2024-05-02 04:00:00', '2024-05-03 16:45:00'],
#     'end_time': ['2024-05-04 02:30:00', '2024-05-04 19:30:00', '2024-05-05 08:00:00', '2024-05-04 19:15:00']
# })

data = pd.DataFrame({
    'start_time': ['2024-05-03 23:00:00','2024-05-03 16:45:00'],
    'end_time': [ '2024-05-11 03:00:00','2024-05-04 03:30:00']
})

def count_this(r):
    return count_time_at_night_and_weekend(r["start_time"],r["end_time"])
    # return count_time_in_weekend(r["start_time"],r["end_time"])
    
# data = data.map(datetime.datetime)
data.start_time = pd.to_datetime(data.start_time)
data.end_time = pd.to_datetime(data.end_time)
data.apply(count_this, axis=1)

0    12.0
1     5.5
dtype: float64