In [1]:
import pandas as pd 
from datetime import timedelta
import time
import helpers.data_engineering_library as dlib

def get_user_input():
    """
    Prompts the user to input file paths for loading and saving data.

    The user is asked for three paths, in this order:
    1. The path to the file from which the data is loaded.
    2. The path to the file from which the detectors data is loaded.
    3. The path to where the file should be saved.

    Every answer is cleaned before it is returned: surrounding whitespace is
    removed, and so is one pair of matching quotes wrapped around the whole
    answer (as produced by "Copy as path" in the Windows Explorer).

    Example paths:
    - C:\\Users\\samue\\OneDrive\\AIML\\HS2024\\Data Sicence Projekt\\Data\\London_UTD19.csv
    - C:\\Users\\samue\\OneDrive\\AIML\\HS2024\\Data Sicence Projekt\\Data\\London_detectors.csv
    - C:\\Users\\samue\\OneDrive\\AIML\\HS2024\\Data Sicence Projekt\\Data

    Returns:
    tuple: A tuple containing three strings. NOTE: the order differs from the
    order in which the paths are prompted for:
        - pathFrom (str): The path to the file from which the data is loaded.
        - pathTo (str): The path to where the file should be saved.
        - pathDetectors (str): The path to the file from which the detectors data is loaded.
    """
    def _clean(raw):
        # Pasted paths often carry stray blanks or wrapping quotes that would
        # otherwise end up inside the file name handed to the loader.
        text = raw.strip()
        if len(text) >= 2 and text[0] == text[-1] and text[0] in "\"'":
            text = text[1:-1].strip()
        return text

    pathFrom = _clean(input("Enter the path to the file from which the data is loaded: "))
    pathDetectors = _clean(input("Enter the path to the file from which the detectors data is loaded: "))
    pathTo = _clean(input("Enter the path to where the file should be saved is saved: "))
    # The return order (from, to, detectors) is what the caller unpacks; keep it.
    return pathFrom, pathTo, pathDetectors

def export_modified_dataset(df, path, filename="London_UTD19_modified.csv"):
    """
    Export the modified DataFrame to a CSV file.

    The file is written without the index column. The target location is
    built with os.path.join, so the function works with the path separator
    of the operating system it runs on.

    Parameters:
    df (pd.DataFrame): The modified DataFrame.
    path (str): The directory in which the CSV file is saved.
    filename (str): The name of the CSV file. Defaults to
        "London_UTD19_modified.csv", the name that was previously hard-coded.
    """
    # Local import so the function does not depend on the file's import block.
    import os

    df.to_csv(os.path.join(path, filename), index=False)
#-------------------------Main-------------------------------------
# Imports needed only by this section of the script.
import os
from contextlib import contextmanager


@contextmanager
def _timed_step(label, announce=None):
    """
    Announce a pipeline step, run the enclosed statements and print their duration.

    Parameters:
    label (str): Name of the step; used in the "<label> took N seconds" message.
    announce (str): Optional start message. When omitted, the label itself is printed.
    """
    print(label if announce is None else announce)
    started = time.time()
    yield
    # Only reached when the step finished without raising.
    print(f"{label} took {round(time.time() - started)} seconds")


print("Starting script")
# get_user_input returns the paths in the order (from, to, detectors).
path_from, path_to, path_detectors = get_user_input()
# The total runtime is measured from after the paths were entered.
start_time = time.time()

print("Loading data from: ", path_from)
dataframe_London_UTD19 = pd.DataFrame(dlib.load_data(path=path_from, nrows=None))
print("Loading data from: ", path_detectors)
dataframe_detectors = pd.DataFrame(dlib.load_data(path=path_detectors))
print("Data loaded")

with _timed_step("Preprocessing data"):
    dataframe_London_UTD19, errors = dlib.preprocess_dataframe(dataframe_London_UTD19)
    print(f"Errors found and dropped: {errors}")

with _timed_step("Drop bad days"):
    dataframe_London_UTD19 = dlib.drop_false_values_by_date(dataframe_London_UTD19, column='flow')

with _timed_step("Clipping outliers on occ"):
    dataframe_London_UTD19 = dlib.clip_outliers(dataframe_London_UTD19, column='occ', group_by_detid=True, outlier_factor=3)

with _timed_step("Clipping outliers on flow"):
    dataframe_London_UTD19 = dlib.clip_outliers(dataframe_London_UTD19, column='flow', group_by_detid=True, outlier_factor=3)

with _timed_step("Calculating traffic"):
    dataframe_London_UTD19 = dlib.calculate_traffic_speed(dataframe_London_UTD19)

with _timed_step("Droping outliers on traffic"):
    dataframe_London_UTD19 = dlib.drop_outliers(dataframe_London_UTD19, column='traffic', group_by_detid=True, outlier_factor=2)

with _timed_step("Detecting anomalies"):
    dataframe_anomalies = dlib.detect_anomalies(dataframe_London_UTD19, column='traffic', factor=3, min_IQR=5, min_days=10, min_daily_records=230)
    # Remove every row whose detector id appears in the anomalies table.
    dataframe_London_UTD19 = dataframe_London_UTD19[~dataframe_London_UTD19['detid'].isin(dataframe_anomalies['detid'])]
    if dataframe_London_UTD19.empty:
        # Nothing left to aggregate or normalize; say so instead of failing silently.
        print("Warning: no rows left after removing anomalous detectors; the exported dataset will be empty")

with _timed_step("Exporting anomalies", announce=f"Exporting anomalies to:  {path_to}"):
    # os.path.join instead of a hard-coded "\\" so the export also works outside Windows.
    dataframe_anomalies.to_csv(os.path.join(path_to, "Anomalies.csv"), index=False)

with _timed_step("Combine datapoints"):
    dataframe_London_UTD19 = dlib.combine_datapoints(dataframe_London_UTD19, ratio=3600)

with _timed_step("Clipping to max traffic value"):
    dataframe_London_UTD19 = dlib.clip_to_high_values(dataframe_London_UTD19, column='traffic', threshold=200)

with _timed_step("Normalizing traffic"):
    dataframe_London_UTD19 = dlib.normalize_traffic(dataframe_London_UTD19)

with _timed_step("Merging dataframes"):
    dataframe_London_UTD19 = dlib.merge_dataframes_on_detid(dataframe_London_UTD19, dataframe_detectors)

with _timed_step("Final processing"):
    dataframe_London_UTD19 = dlib.final_process_dataframe(dataframe_London_UTD19)

with _timed_step("Exporting modified dataset", announce=f"Exporting modified dataset to:  {path_to}"):
    export_modified_dataset(dataframe_London_UTD19, path_to)

total_time = time.time() - start_time
print("Script finished")
print(f"Total script execution time: {round(total_time)} seconds")

Starting script


Enter the path to the file from which the data is loaded:  C:\\Users\\PC\\Documents\\data_scientists\\data\\darmstadt\\darmstadt_UTD19.csv
Enter the path to the file from which the detectors data is loaded:  C:\\Users\\PC\\Documents\\data_scientists\\data\\darmstadt\\darmstadt_sensors.csv
Enter the path to where the file should be saved is saved:  C:\\Users\\PC\\Documents\\data_scientists\\data\\darmstadt


Loading data from:  C:\\Users\\PC\\Documents\\data_scientists\\data\\darmstadt\\darmstadt_UTD19.csv
Loading data from:  C:\\Users\\PC\\Documents\\data_scientists\\data\\darmstadt\\darmstadt_sensors.csv
Data loaded
Preprocessing data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['day'] = pd.to_datetime(df['day'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['weekday'] = df['day'].dt.day_name()


Errors found and dropped: 2650344
Preprocessing data took 0 seconds
Drop bad days


  filtered_df = df.groupby(['day', 'detid']).apply(drop_by_group).reset_index(drop=True)


Total outliers detected and removed: 2304
Drop bad days took 4 seconds
Clipping outliers on occ
Total outliers clipped: 157
Clipping outliers on occ took 3 seconds
Clipping outliers on flow
Total outliers clipped: 0
Clipping outliers on flow took 3 seconds
Calculating traffic
Calculating traffic took 0 seconds
Droping outliers on traffic
Total outliers dropped: 82235
Droping outliers on traffic took 1 seconds
Detecting anomalies
Anomalies detected based on IQR: 3


  metrics = df.groupby('detid').apply(calculate_metrics).reset_index()


Anomalies detected based on IQR or range conditions: 9
Anomalies not enough data: 208
Total anomalies detected: 208
Detecting anomalies took 1 seconds
Exporting anomalies to:  C:\\Users\\PC\\Documents\\data_scientists\\data\\darmstadt
Exporting anomalies took 0 seconds
Combine datapoints
Combine datapoints took 0 seconds
Clipping to max traffic value
Clipping to max traffic value took 0 seconds
Normalizing traffic
traffic range was between:nan and nan
Normalizing traffic took 0 seconds
Merging dataframes
Merging dataframes took 0 seconds
Final processing
Final processing took 0 seconds
Exporting modified dataset to:  C:\\Users\\PC\\Documents\\data_scientists\\data\\darmstadt
Exporting modified dataset took 0 seconds
Script finished
Total script execution time: 14 seconds
