## xlsx joiner

Script to read xlsx files and join them into a single csv

In [42]:
import datetime
import os
import re

import pandas

### File read and filter


In [43]:
def get_xlsx_files(directory):
    """Collect the paths of the xlsx files stored one level below *directory*.

    The layout walked is ``<cwd>/<directory>/<sub-directory>/<file>.xlsx``.
    Sub-directories and the files inside them are visited in sorted name
    order, so the returned list is deterministic.

    Args:
        directory: Name of the directory to scan, relative to the current
            working directory.

    Returns:
        A list of path strings of the form
        ``<cwd>/<directory>/<sub-directory>/<file name>``; empty when no
        xlsx file is found.

    Raises:
        FileNotFoundError: If ``<cwd>/<directory>`` does not exist.
    """
    files_extension = "xlsx"
    # Use the argument itself; relying on a module-level ``directory_name``
    # breaks as soon as the function is called before that global exists.
    base_path = f"{os.getcwd()}/{directory}"
    xlsx_files = []
    for year_dir in sorted(os.listdir(base_path)):
        year_path = f"{base_path}/{year_dir}"
        # Plain files sitting next to the sub-directories cannot be listed.
        if not os.path.isdir(year_path):
            continue
        for file_name in sorted(os.listdir(year_path)):
            if file_name.endswith(f".{files_extension}"):
                xlsx_files.append(f"{year_path}/{file_name}")
    return xlsx_files


In [44]:
def get_volume_df(filepath):
    """Read the first sheet of an xlsx workbook into a flat table.

    After the first data row is discarded, the next rows are used as a
    multi-row header: the first and second header rows (the latter reduced
    to the text before ``" |"``) and the fifth row (named ``cameras`` here)
    are combined into one label per column as
    ``"<row1>_<row2 id> <cameras>"``. The first two columns are renamed to
    ``date`` and ``time``. Only the rows whose first cell is a
    ``datetime.datetime`` are kept as data.

    Args:
        filepath: Path of the xlsx workbook to read.

    Returns:
        pandas.DataFrame: The data rows with a fresh 0-based index, the
        combined column labels, and ``date`` holding ``datetime.date``
        values.
    """
    volume = pandas.read_excel(filepath, engine='openpyxl', sheet_name=0)
    # Remove first useless header
    df = volume.drop([0])
    df.columns = range(df.shape[1])

    def get_street_id(x):
        # Cells left empty after the forward fill are not strings; they
        # carry no id, so report None instead of failing inside ``re``.
        if not isinstance(x, str):
            return None
        match = re.search(r'^(.*) \|', x)
        if match:
            return match.group(1)
        return None

    # Get header rows and fill null values
    headers = df[0:3].copy()
    # "Total" is blanked so the forward fill below overwrites it with the
    # label of the group to its left.
    headers.iloc[0] = headers.iloc[0].apply(lambda x: None if x == "Total" else x)
    headers = headers.apply(lambda x: x.ffill(), axis=1)
    headers.iloc[1] = headers.iloc[1].apply(get_street_id)

    cameras = df.iloc[3].fillna("Total")

    # Concatenate all headers in one row. The three pieces share the same
    # column index, so they can be combined directly; this avoids
    # ``DataFrame.append``, which no longer exists in pandas >= 2.0.
    column_names = headers.iloc[0] + "_" + headers.iloc[1] + " " + cameras
    column_names[0] = "date"
    column_names[1] = "time"

    # Data rows are recognised by a datetime in their first cell.
    values = df[[isinstance(x, datetime.datetime) for x in df[0]]].copy()
    values.columns = column_names
    values.reset_index(inplace=True, drop=True)
    values['date'] = [d.date() for d in values['date']]
    return values

In [45]:
def get_speed_df(filepath):
    """Read the second sheet of an xlsx workbook as one row per timestamp.

    The sheet is read, its first data row is discarded and the remainder is
    transposed, so each original column becomes a row. Rows whose cell in
    the column labelled ``1`` is a ``datetime.datetime`` are kept. Columns
    are named ``"V_" + <first-row value>``, except the timestamp column,
    which is split into ``date`` and ``time`` and then removed.

    Args:
        filepath: Path of the xlsx workbook to read.

    Returns:
        pandas.DataFrame: The kept rows (original transposed index
        preserved) with the ``V_*`` columns followed by ``date`` and
        ``time``.
    """
    sheet = pandas.read_excel(filepath, engine='openpyxl', sheet_name=1)
    table = sheet.drop([0])
    table.columns = range(table.shape[1])
    table = table.T

    def prefixed(label):
        return "V_" + str(label)

    # Labels come from the first transposed row; label 1 is the timestamp.
    labels = table.iloc[0].apply(prefixed)
    labels[1] = "datetime"

    has_timestamp = [
        isinstance(cell, datetime.datetime) for cell in table[1]
    ]
    values = table[has_timestamp].copy()
    values.columns = labels

    # Take the timestamp column out, then append its two components.
    stamps = list(values.pop("datetime"))
    values['date'] = [stamp.date() for stamp in stamps]
    values['time'] = [stamp.time() for stamp in stamps]
    return values



In [46]:
def get_volume_and_speed_df(filename):
    """Build one table holding both sheets of a workbook.

    The volume table is the left side of a left join on the ``date`` and
    ``time`` columns, so every volume row is kept and speed columns are
    attached where a matching row exists.

    Args:
        filename: Path of the xlsx workbook to read.

    Returns:
        pandas.DataFrame: Result of the left join described above.
    """
    join_keys = ["date", "time"]
    return pandas.merge(
        get_volume_df(filename),
        get_speed_df(filename),
        how="left",
        left_on=list(join_keys),
        right_on=list(join_keys),
    )

In [52]:
# Directory (relative to the working directory) scanned for xlsx files.
directory_name = "AFOROS"

xlsx_files = get_xlsx_files(directory_name)

# Collect the per-file tables first and concatenate once at the end:
# concatenating inside the loop re-copies every accumulated row on each pass.
frames = []
for xlsx_file in xlsx_files:
    df = get_volume_and_speed_df(xlsx_file)
    frames.append(df)

# pandas.concat raises ValueError on an empty list, so fall back to an empty
# frame when no file was found.
integrated_df = pandas.concat(frames) if frames else pandas.DataFrame()


KeyboardInterrupt: 