In [13]:
import pandas as pd
import numpy as np
import seaborn as sns
import datetime as datetime
import os
from constants import *
from trip_file import TripFile
from trip import Trip


### Read all files

In [14]:
# Fare table: the first row of the CSV contains only nulls, so it is dropped
# and the remaining rows are renumbered from zero.
taarif = pd.read_csv("../files/taarif.csv").drop(index=0).reset_index(drop=True)

# Driver tables: the first CSV column becomes the index of each frame.
drivers_with_kviut = pd.read_csv("../files/drivers_with_kviut.csv", index_col=0)
new_drivers = pd.read_csv("../files/new_drivers.csv", index_col=0)

In [15]:


def preprocess_drivers(drivers: pd.DataFrame, copy=False) -> pd.DataFrame:
    """Normalise the ``gender`` and ``birthdate`` columns of a drivers table.

    Args:
        drivers: Table with at least ``gender`` and ``birthdate`` columns.
        copy: When true, the work is done on a copy and ``drivers`` is left
            untouched; otherwise the two columns of ``drivers`` itself are
            replaced.

    Returns:
        The processed frame (``drivers`` itself unless ``copy`` is true).

    Raises:
        KeyError: If a gender label is neither in the mapping below nor one
            of the canonical ``MALE`` / ``FEMALE`` values.
    """
    df = drivers.copy() if copy else drivers

    # Raw labels seen in the data -> canonical constants. Note that
    # "unknown" and "none" are deliberately folded into MALE here.
    gender_mapping = {"F": FEMALE,
                      "M": MALE,
                      "m": MALE,
                      "male": MALE,
                      "boy": MALE,
                      "unknown": MALE,
                      'woman': FEMALE,
                      'girl': FEMALE,
                      'none': MALE,
                      'female': FEMALE}
    # Already-canonical labels pass through unchanged, so running the
    # function on an already-processed frame does not raise KeyError.
    gender_mapping.setdefault(FEMALE, FEMALE)
    gender_mapping.setdefault(MALE, MALE)

    # Missing gender defaults to MALE. Explicit column assignment is used
    # instead of `df.gender.fillna(..., inplace=True)`, which is a chained
    # assignment and silently does nothing under pandas Copy-on-Write.
    df["gender"] = df["gender"].fillna(MALE).map(lambda label: gender_mapping[label])

    # Parse the dates, then replace missing ones with the current timestamp.
    # The fill is applied to `df` (not to `drivers`), so with copy=True the
    # caller's frame is not modified and the returned copy is actually filled.
    birthdates = pd.to_datetime(df["birthdate"], format="mixed")
    df["birthdate"] = birthdates.fillna(datetime.datetime.now())

    return df

def preprocess_taarif(taarif_df: pd.DataFrame, copy=False) -> pd.DataFrame:
    """Return the fare table, optionally as an independent copy.

    No transformation is applied yet; the function only decides whether the
    caller gets the same object back or a copy of it.

    Args:
        taarif_df: The fare table.
        copy: When true, return a copy of ``taarif_df``; otherwise return
            ``taarif_df`` itself.

    Returns:
        ``taarif_df`` or a copy of it.
    """
    # Copy the argument itself, not the unrelated global `new_drivers` frame.
    return taarif_df.copy() if copy else taarif_df


def concat_drivers_tables(new_drivers, drivers_with_kviut):
    """Concatenate the drivers tables with and without kviut.

    Rows coming from ``drivers_with_kviut`` get ``kviut == 1``; rows coming
    from ``new_drivers`` get ``kviut == 0``. Neither input frame is modified,
    and the original row labels are kept (the index is not renumbered).

    Args:
        new_drivers: Drivers without kviut.
        drivers_with_kviut: Drivers with kviut.

    Returns:
        A new frame holding the rows of ``new_drivers`` followed by the rows
        of ``drivers_with_kviut``, with a ``kviut`` column.
    """
    # `assign` flags a copy, so the caller's frame does not grow a new column.
    flagged = drivers_with_kviut.assign(kviut=1)
    drivers = pd.concat([new_drivers, flagged])
    # Rows from `new_drivers` have no flag after the concat; mark them 0.
    # Plain assignment rather than a chained `fillna(inplace=True)`, which is
    # a no-op under pandas Copy-on-Write.
    drivers["kviut"] = drivers["kviut"].fillna(0)
    return drivers
    

In [70]:
# Merge both driver tables, then normalise gender/birthdate on a copy.
drivers = preprocess_drivers(
    concat_drivers_tables(new_drivers, drivers_with_kviut),
    copy=True,
)

Unnamed: 0,birthdate,gender,id,vetek,kviut
0,1967-06-19,f,943,232.0,0.0
1,1979-02-13,m,966,28.0,0.0
2,1993-11-19,m,250,48.0,0.0
3,1961-02-02,m,156,272.0,0.0
4,1965-07-03,m,67,130.0,0.0
...,...,...,...,...,...
995,1960-04-01,m,119,15.4,1.0
996,1983-12-08,m,236,6.7,1.0
997,1987-06-13,m,449,18.7,1.0
998,1977-02-21,m,494,7.9,1.0


In [17]:
from tqdm import tqdm
def get_trip_files(folder, limit=100):
    """Build a ``TripFile`` for the files listed in ``folder``.

    Args:
        folder: Directory whose entries are turned into ``TripFile`` objects.
        limit: Maximum number of directory entries to use. A falsy value
            (``None`` or ``0``) means no limit.

    Returns:
        A numpy array with one ``TripFile`` per selected entry.
    """
    # os.listdir returns entries in arbitrary order; sorting makes `limit`
    # select the same files on every run.
    names = sorted(os.listdir(folder))
    if limit:
        names = names[:limit]

    # A name whose fifth-from-last character is ")" has its last seven
    # characters replaced by ".csv" (presumably duplicate-download names such
    # as "x(1).csv" -- TODO confirm). The length guard keeps short names from
    # raising IndexError.
    names = [
        name[:-7] + ".csv" if len(name) >= 5 and name[-5] == ")" else name
        for name in names
    ]

    print("Processing files....")
    return np.array([TripFile(os.path.join(folder, name)) for name in tqdm(names)])

In [79]:

def calculate_age(birth_date, reference_date=None):
    """Return the age in whole years on ``reference_date``.

    Args:
        birth_date: Date of birth (anything exposing ``year``, ``month`` and
            ``day``), or a null value.
        reference_date: Date at which the age is evaluated. Defaults to the
            current local time, which was the only behaviour before this
            parameter existed.

    Returns:
        The number of completed years, or ``np.nan`` when ``birth_date`` is
        null.
    """
    if pd.isnull(birth_date):
        return np.nan
    if reference_date is None:
        reference_date = datetime.datetime.now()

    # One year is subtracted when the birthday has not yet occurred in the
    # reference year.
    birthday_pending = (reference_date.month, reference_date.day) < (birth_date.month, birth_date.day)
    return reference_date.year - birth_date.year - birthday_pending

def check_if_weekend(start_time:datetime.datetime, end_time:datetime.datetime):
    """Placeholder weekend check: ignores both arguments and always returns False."""
    is_weekend = False
    return is_weekend

def check_if_night(start_time:datetime.datetime, end_time:datetime.datetime):
    """Placeholder night check: ignores both arguments and always returns False."""
    is_night = False
    return is_night

def calculate_trips_cost(trips_with_fares, extra_km_threshold=200):
    """Add per-trip payment columns to a trips table that already has fares.

    The input must provide the columns ``start_time``, ``end_time``, ``km``,
    ``basic_taarif``, ``extra_milage``, ``weekend_bonus`` and ``night_bonus``.
    The input frame is not modified.

    Args:
        trips_with_fares: One row per trip, joined with its fare columns.
        extra_km_threshold: Distance above which ``extra_milage`` is paid for
            each additional kilometre. Defaults to 200.

    Returns:
        A copy of the input with the added columns ``is_weekend``,
        ``is_night``, ``basic_pay``, ``extra_pay``, ``bonus_precentage``,
        ``payment_without_bonus`` and ``payment_with_bonus``.
    """
    df = trips_with_fares.copy()

    def _flag_rows(check):
        # Explicit iteration instead of DataFrame.apply(axis=1): on an empty
        # frame `apply` probes the callback with placeholder input and its
        # return type depends on how the callback reacts.
        flags = [check(start, end) for start, end in zip(df["start_time"], df["end_time"])]
        return pd.Series(flags, index=df.index, dtype=bool)

    is_weekend = _flag_rows(check_if_weekend)
    is_night = _flag_rows(check_if_night)

    basic_pay = df["km"] * df["basic_taarif"]
    # Only the kilometres beyond the threshold earn the extra-mileage rate.
    # A missing `km` stays missing here rather than being treated as zero.
    extra_pay = (df["km"] - extra_km_threshold).clip(lower=0) * df["extra_milage"]
    payment_without_bonus = basic_pay + extra_pay

    # The two bonuses are percentages and add up when both apply.
    bonus_precentage = (
        is_weekend.astype(int) * df["weekend_bonus"]
        + is_night.astype(int) * df["night_bonus"]
    )
    payment_with_bonus = payment_without_bonus * (1 + bonus_precentage / 100)

    df["is_weekend"] = is_weekend
    df["is_night"] = is_night
    df["basic_pay"] = basic_pay
    df["extra_pay"] = extra_pay
    df["bonus_precentage"] = bonus_precentage
    df["payment_without_bonus"] = payment_without_bonus
    df["payment_with_bonus"] = payment_with_bonus
    return df


def create_main_table(trips_folder="../files/trips_data", limit=1000):
    """Build the income table: one row per driver, month and year.

    Every trip file is joined with the global ``taarif`` table on
    ``customer``, priced with ``calculate_trips_cost`` and summed per
    ``(driver_id, month, year)``. The totals are then left-joined with the
    global ``drivers`` table to add ``gender``, ``age``, ``vetek`` and
    ``kviut``.

    Args:
        trips_folder: Directory passed to ``get_trip_files``.
        limit: Maximum number of trip files to process.

    Returns:
        A frame with the columns ``driver_id``, ``month``, ``year``,
        ``total_income``, ``total_km``, ``gender``, ``age``, ``vetek`` and
        ``kviut``. It is empty when no trip files are found.
    """
    key_columns = ["driver_id", "month", "year"]
    total_columns = ["total_income", "total_km"]

    trip_files = get_trip_files(trips_folder, limit=limit)
    print("Processing income of drivers...")

    # Each file is reduced to its own totals and the pieces are combined once
    # at the end, instead of re-concatenating and re-grouping the cumulative
    # table on every iteration.
    per_file_totals = []
    for trip_file in tqdm(trip_files):
        trips_with_fares = pd.merge(
            trip_file.get_data_frame(), taarif, on=["customer"], how="left"
        )
        costs = calculate_trips_cost(trips_with_fares)
        costs["month"] = trip_file.get_month()
        costs["year"] = trip_file.get_year()

        file_totals = (
            costs[key_columns + ["payment_with_bonus", "km"]]
            .rename(columns={"payment_with_bonus": "total_income", "km": "total_km"})
            .groupby(key_columns)
            .sum()
            .reset_index()
        )
        per_file_totals.append(file_totals)

    if per_file_totals:
        # The same (driver, month, year) key can appear in several files,
        # so the per-file totals are summed once more.
        cum_table = (
            pd.concat(per_file_totals, ignore_index=True)
            .groupby(key_columns)
            .sum()
            .reset_index()
        )
    else:
        cum_table = pd.DataFrame(columns=key_columns + total_columns)

    # Built once, outside the loop, so it also exists when there are no files.
    drivers_lookup = drivers.rename(columns={"id": "driver_id"})
    drivers_lookup["age"] = drivers["birthdate"].apply(calculate_age)

    return pd.merge(
        cum_table,
        drivers_lookup[["driver_id", "gender", "age", "vetek", "kviut"]],
        on=["driver_id"],
        how="left",
    )

# Build the full income table, then show the rows of driver 1 as a spot check.
table = create_main_table()

table.loc[table["driver_id"] == 1]

Processing files....


100%|██████████| 1000/1000 [00:04<00:00, 222.19it/s]


Processing income of drivers...


100%|██████████| 1000/1000 [00:38<00:00, 25.90it/s]


Unnamed: 0,driver_id,month,year,total_income,total_km,gender,age,vetek,kviut
0,1,april,2015,66660.184002,8996.202082,f,41.0,1.4,1.0
1,1,august,2015,63390.516635,9642.550838,f,41.0,1.4,1.0


In [63]:
# Toy frame used to check how groupby-sum treats a missing value.
data = dict(
    Category=['A', 'B', 'A', 'B', 'A'],
    Value=[10, 20, None, 40, 50],
    some=["asdfasdf", "ASFAs<", "ASf", "ASF", "ASF"],
)
df = pd.DataFrame(data)

# Total of 'Value' within each 'Category'.
grouped_values = df.groupby('Category')['Value']
sum_by_category = grouped_values.sum()

print(sum_by_category)

Category
A    60.0
B    60.0
Name: Value, dtype: float64


In [None]:
# Three sample trips given as start/end timestamp strings.
data = dict(
    start_time=['2024-05-03 20:00:00', '2024-05-04 02:00:00', '2024-05-05 04:00:00'],
    end_time=['2024-05-04 01:00:00', '2024-05-04 08:00:00', '2024-05-05 08:00:00'],
)

def iterate_hours(start_time, end_time):
    """Yield ``start_time`` and then every hour after it, up to ``end_time``.

    Args:
        start_time: First value to yield.
        end_time: Inclusive upper bound; nothing is yielded when it is
            earlier than ``start_time``.

    Yields:
        ``start_time``, ``start_time + 1h``, ``start_time + 2h``, ... while
        the value does not exceed ``end_time``.
    """
    # The file imports the `datetime` module only, so the bare name
    # `timedelta` is undefined; reach it through the module.
    one_hour = datetime.timedelta(hours=1)
    current_time = start_time
    while current_time <= end_time:
        yield current_time
        current_time += one_hour


def check_if_weekend(start_time:datetime.datetime, end_time:datetime.datetime):
    """Tell whether a trip touches the weekend window.

    The window runs from Friday 4pm up to Saturday 8pm.
    NOTE(review): the window comes from the original planning comment; whether
    8pm itself counts as weekend is an assumption (treated as exclusive here).

    Args:
        start_time: When the trip starts.
        end_time: When the trip ends.

    Returns:
        True if any moment of the trip lies inside the window, else False.
    """
    FRIDAY, SATURDAY = 4, 5  # datetime.weekday(): Monday is 0

    def _in_window(moment):
        if moment.weekday() == FRIDAY:
            return moment.hour >= 16
        if moment.weekday() == SATURDAY:
            return moment.hour < 20
        return False

    # Iterate over the trip hour by hour. The window is far longer than one
    # hour, so a trip overlapping it either has an hourly sample inside it or
    # ends inside it; the end time is therefore checked as well.
    one_hour = datetime.timedelta(hours=1)
    current_time = start_time
    while current_time <= end_time:
        if _in_window(current_time):
            return True
        current_time += one_hour
    return _in_window(end_time)


def check_if_night(start_time:datetime.datetime, end_time:datetime.datetime):
    """Placeholder night check: always returns False.

    TODO: implement once the night-hours window is defined. Until then an
    explicit False is returned; falling off the end of the function would
    return None, which cannot be converted with ``int`` when the bonus
    percentage is computed.

    Args:
        start_time: When the trip starts (currently unused).
        end_time: When the trip ends (currently unused).

    Returns:
        False.
    """
    return False


