In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
import plotly.express as px

In [3]:

# Load the dataset from a path relative to the notebook's working directory.
df = pd.read_csv("dataset-1.csv")
# Bare last expression: the notebook renders the frame (long frames are shown truncated).
df

Unnamed: 0,id_1,id_2,route,moto,car,rv,bus,truck
0,829,827,1,2.05,4.14,4.14,10.1,15.2
1,829,821,4,6.63,13.26,13.26,32.4,48.5
2,829,804,7,14.41,28.92,28.92,64.7,97.0
3,829,822,6,5.90,11.81,11.81,28.8,43.2
4,829,826,9,2.87,5.81,5.81,14.2,21.2
...,...,...,...,...,...,...,...,...
336,803,802,3,1.70,3.40,3.40,6.9,10.3
337,803,805,4,3.00,6.00,6.00,12.0,17.9
338,803,825,3,11.59,23.28,23.28,50.1,75.2
339,803,806,9,3.80,7.70,7.70,15.3,23.0


In [6]:
import pandas as pd

def generate_car_matrix(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build a wide 'id_1' x 'id_2' table of 'car' values.

    Args:
        df (pandas.DataFrame): Frame with 'id_1', 'id_2' and 'car' columns.

    Returns:
        pandas.DataFrame: One row per 'id_1' value and one column per 'id_2'
                          value, holding the matching 'car' value. Pairs that
                          do not occur in `df` are filled with 0.
    """
    # Long -> wide reshape; (id_1, id_2) pairs absent from the input become NaN.
    wide = df.pivot(
        index='id_1',
        columns='id_2',
        values='car',
    )
    # Fill those gaps with 0 so every cell of the matrix has a value.
    return wide.fillna(0)




In [7]:
def get_type_count(df: pd.DataFrame, low_max: float = 15, medium_max: float = 25) -> dict:
    """
    Categorizes 'car' values into types and returns a dictionary of counts.

    Each 'car' value is assigned to one of three types:
        - 'low'    : value <= low_max
        - 'medium' : low_max < value <= medium_max
        - 'high'   : value > medium_max
    Missing (NaN) values belong to no type and are not counted.

    NOTE(review): the default cut points (15 and 25) are an assumption;
    confirm them against the task specification.

    Args:
        df (pandas.DataFrame): Frame with a numeric 'car' column.
        low_max (float): Inclusive upper bound of the 'low' type.
        medium_max (float): Inclusive upper bound of the 'medium' type.

    Returns:
        dict: A dictionary with car types as keys and their counts as values,
              with keys in alphabetical order. Every type is present, with a
              count of 0 when no value falls into it.

    Raises:
        ValueError: If `low_max` is not strictly smaller than `medium_max`
                    (raised by `pandas.cut` for non-increasing bin edges).
    """
    labels = ['low', 'medium', 'high']
    # Right-closed bins: (-inf, low_max], (low_max, medium_max], (medium_max, inf).
    car_type = pd.cut(
        df['car'],
        bins=[float('-inf'), low_max, medium_max, float('inf')],
        labels=labels,
    )
    # Count each label explicitly so empty types still appear and the key
    # order is deterministic instead of depending on frequency.
    return {label: int((car_type == label).sum()) for label in sorted(labels)}




In [8]:
def get_bus_indexes(df: pd.DataFrame) -> list:
    """
    Find the rows whose 'bus' value is more than double the column mean.

    Args:
        df (pandas.DataFrame): Frame with a numeric 'bus' column.

    Returns:
        list: Index labels of the rows where 'bus' is strictly greater than
              twice the mean of the 'bus' column, in their original order.
    """
    bus = df['bus']
    # Cut-off is computed once from the whole column.
    threshold = 2 * bus.mean()
    exceeds = bus > threshold
    return df.loc[exceeds].index.tolist()




In [9]:
def filter_routes(df: pd.DataFrame) -> list:
    """
    List the routes whose mean 'truck' value is above 7.

    Args:
        df (pandas.DataFrame): Frame with 'route' and 'truck' columns.

    Returns:
        list: 'route' values whose average 'truck' value is strictly greater
              than 7, in the order produced by the group-by.
    """
    # One mean per route; the route values become the index of this Series.
    truck_mean_by_route = df.groupby('route')['truck'].mean()
    above = truck_mean_by_route.loc[lambda means: means > 7]
    return above.index.tolist()




In [10]:
def multiply_matrix(matrix: pd.DataFrame) -> pd.DataFrame:
    """
    Doubles every value that is strictly greater than 10 and leaves all
    other values unchanged.

    Args:
        matrix (pandas.DataFrame): Numeric frame.

    Returns:
        pandas.DataFrame: New frame of the same shape and labels where each
                          value > 10 has been multiplied by 2. Values <= 10
                          and NaN are returned as they are. The input frame
                          is not modified.
    """
    # Vectorised form of the former element-wise `applymap` call, which is
    # deprecated since pandas 2.1. `mask` replaces cells where the condition
    # is True; NaN > 10 evaluates to False, so missing values pass through.
    return matrix.mask(matrix > 10, matrix * 2)




In [11]:
def time_check(df: pd.DataFrame) -> pd.Series:
    """
    Use shared dataset-2 to verify the completeness of the data by checking
    whether the timestamps for each unique (`id`, `id_2`) pair cover a full
    24-hour and 7 days period.

    A pair is reported as complete when the time between its earliest and
    latest timestamp is at least 7 days (7 * 24 hours).

    NOTE(review): only the overall span is checked; gaps between the first
    and last timestamp are not detected. This also assumes the time
    information lives in a single 'timestamp' column - confirm against the
    actual schema of dataset-2.

    Args:
        df (pandas.DataFrame): Frame with 'id', 'id_2' and 'timestamp'
            columns. 'timestamp' may already be datetime-typed or hold
            strings that `pandas.to_datetime` can parse.

    Returns:
        pd.Series: Boolean series indexed by (`id`, `id_2`); True where the
                   pair's timestamps span at least 7 days, False otherwise
                   (including pairs whose timestamps are all missing).
    """
    # Values read from CSV are strings; parse them so max - min is a Timedelta.
    # Already-parsed datetime columns pass through unchanged.
    stamps = pd.to_datetime(df['timestamp'])
    per_pair = stamps.groupby([df['id'], df['id_2']])
    span = per_pair.max() - per_pair.min()
    # `>=` rather than `==`: a span longer than a week still covers the week.
    # A NaT span (no valid timestamps) compares as False.
    return span >= pd.Timedelta(days=7)
