### Imports and loadings

In [5]:
from conf.parameters import parameters
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import plotly.express as px
from scipy.stats import ttest_ind
from statsmodels.tsa.stattools import adfuller

In [2]:
def load_data(data_path):
    """
    Load the dataset from the specified data path in the parameters file.

    Args:
        parameters["data_path"]: The data path of the dataset.

    Returns:
        pd.DataFrame: Loaded dataset as a pandas DataFrame.
    """
    # Retrieve the data path from the parameters

    # Load the Excel file into a pandas DataFrame
    df = pd.read_excel(data_path, header=1)

    return df

df = load_data(parameters["data_path"])
df.head()


Unnamed: 0,Cab_Driver_ID,Gender,PickUp_Colombo_ID,DropOff_Colombo_ID,N_Passengers,Date,PickUp_Time,Duration_Min,Tip,Total_Amount
0,5,M,2,6.650455,1,2021-10-20,Day,25,4.56,26.31
1,20,F,11,11.474133,4,2021-09-03,Night,25,0.57,22.32
2,16,F,11,12.162533,1,2021-09-22,Day,48,3.89,45.65
3,7,M,15,12.79283,3,2021-07-19,Day,30,9.64,35.74
4,15,F,5,12.668358,1,2021-11-20,Day,37,5.12,37.31


### Statistical testing for seasonality in Earnings

In [3]:
def calculate_aggregated_earnings(df: pd.DataFrame, date_column: str, earnings_column: str, driver_id_column: str) -> dict:
    """
    Calculate the total and average earnings for all drivers across daily, weekly, and monthly periods.

    Args:
        df (pd.DataFrame): The input DataFrame containing the dataset.
        date_column (str): The column name for the trip date.
        earnings_column (str): The column name for trip earnings.
        driver_id_column (str): The column name for cab driver ID.

    Returns:
        dict: A dictionary containing daily, weekly, and monthly averages for a driver.
    """
    if date_column not in df.columns or earnings_column not in df.columns or driver_id_column not in df.columns:
        raise ValueError("Specified columns do not exist in the DataFrame.")

    # Ensure the date column is a datetime type
    df[date_column] = pd.to_datetime(df[date_column])

    # Add Week and Month columns for aggregation
    df['Week'] = df[date_column].dt.isocalendar().week
    df['Month'] = df[date_column].dt.to_period('M')

    # Calculate total earnings across all drivers for daily, weekly, and monthly periods
    total_daily_earnings = df.groupby(df[date_column].dt.date)[earnings_column].sum()
    total_weekly_earnings = df.groupby('Week')[earnings_column].sum()
    total_monthly_earnings = df.groupby('Month')[earnings_column].sum()

    # Calculate the number of unique drivers who worked for each period
    daily_drivers = df.groupby(df[date_column].dt.date)[driver_id_column].nunique()
    weekly_drivers = df.groupby('Week')[driver_id_column].nunique()
    monthly_drivers = df.groupby('Month')[driver_id_column].nunique()

    # Calculate average earnings per driver for each period
    avg_daily_earnings_per_driver = (total_daily_earnings / daily_drivers).reset_index(name='Average Earnings Per Driver')
    avg_weekly_earnings_per_driver = (total_weekly_earnings / weekly_drivers).reset_index(name='Average Earnings Per Driver')
    avg_monthly_earnings_per_driver = (total_monthly_earnings / monthly_drivers).reset_index(name='Average Earnings Per Driver')

    # Return results as a dictionary
    return {
        "daily_avg_per_driver": avg_daily_earnings_per_driver,
        "weekly_avg_per_driver": avg_weekly_earnings_per_driver,
        "monthly_avg_per_driver": avg_monthly_earnings_per_driver
    }


# Calculate aggregated earnings
averages = calculate_aggregated_earnings(
    df,
    date_column="Date",
    earnings_column="Total_Amount",
    driver_id_column="Cab_Driver_ID"
)

# Display results
print("Daily Average Earnings Per Driver:")
print(averages["daily_avg_per_driver"])

print("\nWeekly Average Earnings Per Driver:")
print(averages["weekly_avg_per_driver"])

print("\nMonthly Average Earnings Per Driver:")
print(averages["monthly_avg_per_driver"])


Daily Average Earnings Per Driver:
           Date  Average Earnings Per Driver
0    2021-07-01                    35.297500
1    2021-07-02                    24.146000
2    2021-07-03                    42.990000
3    2021-07-04                    29.230000
4    2021-07-05                    30.226667
..          ...                          ...
178  2021-12-27                    55.505000
179  2021-12-28                    41.926000
180  2021-12-29                    39.020000
181  2021-12-30                    37.050000
182  2021-12-31                    38.975000

[183 rows x 2 columns]

Weekly Average Earnings Per Driver:
    Week  Average Earnings Per Driver
0     26                    51.231538
1     27                    63.350625
2     28                    57.102000
3     29                    89.191875
4     30                    74.151429
5     31                    52.698750
6     32                    68.396667
7     33                    85.228000
8     34              

In [7]:
def check_seasonality(df, column_name):
    """
    Check if the time series in the specified column has seasonality using the Augmented Dickey-Fuller (ADF) test.

    Args:
        df (pd.DataFrame): The DataFrame containing the time series data.
        column_name (str): The column name containing the time series data to test.

    Returns:
        str: A message indicating whether the time series has seasonality or not.
    """
    # Perform the Augmented Dickey-Fuller test
    result = adfuller(df[column_name])

    # Extract the p-value from the result
    p_value = result[1]

    # Print the p-value
    print(f'p-value: {p_value}')

    # Check if p-value is less than 0.05 (indicating stationarity and potential seasonality)
    if p_value < 0.05:
        return "The time series does not have a seasonal unit root (indicating seasonality)."
    else:
        return "The time series may not have seasonality."


result_message = check_seasonality(averages["daily_avg_per_driver"], 'Average Earnings Per Driver')
print(result_message)


p-value: 1.4078004877057219e-09
The time series does not have a seasonal unit root (indicating seasonality).


### Hypothesis testing to check whether there is a significance difference in earningd during weekdays and weekends.

Null Hypothesis (H₀): There is no difference between weekday and weekend earnings.

Alternative Hypothesis (H₁): There is a difference between weekday and weekend earnings.

In [14]:
def test_weekly_earnings(df, date_column, earnings_column):
    """
    Perform t-tests to compare weekday and weekend earnings for each week.

    Args:
        df (pd.DataFrame): The dataset containing trip data.
        date_column (str): The column name for the trip date.
        earnings_column (str): The column name for trip earnings.

    Returns:
        pd.DataFrame: A DataFrame with t-statistics, p-values, and the corresponding week number.
    """
    # Ensure the date column is parsed correctly
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')

    # Add WeekNumber and IsWeekend columns
    df['WeekNumber'] = df[date_column].dt.isocalendar().week
    df['IsWeekend'] = df[date_column].dt.weekday.apply(lambda x: 'Weekend' if x >= 5 else 'Weekday')

    # Initialize results list
    results = []
    for week in df['WeekNumber'].unique():
        weekly_data = df[df['WeekNumber'] == week]
        weekday_earnings = weekly_data[weekly_data['IsWeekend'] == 'Weekday'][earnings_column]
        weekend_earnings = weekly_data[weekly_data['IsWeekend'] == 'Weekend'][earnings_column]

        if len(weekday_earnings) > 1 and len(weekend_earnings) > 1:  # Ensure enough data for testing
            t_stat, p_value = ttest_ind(weekday_earnings, weekend_earnings, equal_var=False)
            results.append({'WeekNumber': week, 'T-Statistic': t_stat, 'P-Value': p_value})

    # Convert the results into a DataFrame
    weekly_results_df = pd.DataFrame(results)
    weekly_results_df.set_index('WeekNumber', inplace=True)
    return weekly_results_df


weekly_ttest_results = test_weekly_earnings(df, date_column='Date', earnings_column='Total_Amount')
print(weekly_ttest_results)


            T-Statistic   P-Value
WeekNumber                       
42            -0.996168  0.334784
35            -0.918507  0.403944
38            -1.611054  0.122784
29             0.621637  0.540052
46             0.110473  0.912979
49            -0.147303  0.884831
39            -0.552653  0.590351
41             1.623215  0.119501
50            -0.442299  0.660491
44            -1.035800  0.322869
33             0.792638  0.438554
34            -0.466566  0.647147
45            -0.558509  0.581391
36            -0.958408  0.344478
28            -2.236662  0.033555
48             0.858797  0.397911
30             2.597194  0.014747
40            -0.526461  0.603039
43            -0.020497  0.983779
37            -1.520884  0.140429
47             0.088792  0.931262
26            -1.739751  0.101650
31            -0.372171  0.712976
51            -0.276833  0.783711
32            -1.032570  0.314366
27             0.281939  0.781305
