In [None]:
import pandas as pd
from tqdm import tqdm
import os
import numpy as np

In [None]:
def load_and_calculate_avg_price(csv_file, max_ask_bid_ratio=2):
    """Load one quote CSV and compute the best-level mid price for every row.

    The mid price is ``(ask_1 + bid_1) / 2``. A row is treated as an outlier,
    and its mid price set to NaN, when any of the following holds:

    * ``ask_1 <= 0``
    * ``bid_1 <= 0``
    * ``ask_1 / bid_1 > max_ask_bid_ratio``

    Parameters
    ----------
    csv_file : str or path-like
        CSV file readable by ``pd.read_csv``. It must contain the columns
        ``time``, ``ask_1`` and ``bid_1``.
    max_ask_bid_ratio : float, default 2
        Largest ask/bid ratio that is still accepted as a valid quote.

    Returns
    -------
    pandas.DataFrame
        Independent frame with the columns ``time`` and ``mid_price``, one row
        per input row.

    Raises
    ------
    KeyError
        If one of the required columns is missing from the file.
    """
    df = pd.read_csv(csv_file)

    ask = df['ask_1']
    bid = df['bid_1']

    # outlier condition: non-positive quotes or an implausibly wide spread
    outlier_condition = (ask <= 0) | (bid <= 0) | ((ask / bid) > max_ask_bid_ratio)

    df['mid_price'] = np.where(outlier_condition, np.nan, (ask + bid) / 2)

    # Return an explicit copy: callers add columns to the result, and doing so
    # on a bare column subset triggers pandas' SettingWithCopyWarning.
    return df[['time', 'mid_price']].copy()

In [None]:
def compile_price_data(root_dir, pickle_path='./result_df.pkl'):
    """Build a wide (date, time) x stock table of mid prices from per-stock folders.

    Every sub-directory of ``root_dir`` is treated as one stock. The stock name
    is the 7th ``'_'``-separated token of the folder name, and the date is the
    2nd ``'_'``-separated token of each ``.csv`` file name inside it. Each CSV
    is loaded with ``load_and_calculate_avg_price``.

    Folders whose name cannot be parsed are reported and skipped entirely, so
    their files are never labelled with another stock's name. Files that fail
    to load are reported and skipped individually.

    Parameters
    ----------
    root_dir : str
        Directory containing one sub-directory per stock.
    pickle_path : str or None, default './result_df.pkl'
        Where the long-format table (before pivoting) is pickled. Pass ``None``
        to skip writing the pickle.

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` and ``time`` followed by one mid-price column per stock.

    Raises
    ------
    ValueError
        If no CSV file could be loaded, or if the same (date, time, stock)
        combination occurs more than once (raised by ``DataFrame.pivot``).
    """
    compiled_data = []

    # sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
    # traversal and the pickled long table reproducible between runs.
    for stock_folder in tqdm(sorted(os.listdir(root_dir))):  # loop through all stocks
        stock_folder_path = os.path.join(root_dir, stock_folder)  # folder for one stock
        if not os.path.isdir(stock_folder_path):
            continue

        try:
            stock_name = stock_folder.split('_')[6]
        except IndexError as e:
            print(f"File path: {stock_folder_path}. Error: {e}")
            # Without a stock name the files below would be mislabelled with the
            # previous folder's name (or fail one by one), so skip the folder.
            continue

        for csv_file in sorted(os.listdir(stock_folder_path)):  # daily csv files of one stock
            if not csv_file.endswith('.csv'):
                continue

            csv_file_path = os.path.join(stock_folder_path, csv_file)
            try:
                date_str = csv_file.split('_')[1]
                daily_df = load_and_calculate_avg_price(csv_file_path)
                # assign() returns a new frame, so the loader's result is never
                # mutated in place.
                compiled_data.append(daily_df.assign(date=date_str, stock=stock_name))
            except Exception as e:
                # Best effort: one unreadable or misnamed file must not abort the run.
                print(f"File path: {csv_file_path}. Error: {e}")

    if not compiled_data:
        raise ValueError(f"No price data could be loaded from {root_dir!r}")

    result_df = pd.concat(compiled_data)
    if pickle_path is not None:
        result_df.to_pickle(pickle_path)
    result_df = result_df.pivot(index=['date', 'time'], columns='stock', values='mid_price')

    return result_df.reset_index()

# Folder that holds one sub-folder per stock (relative to the notebook's working directory).
root_directory = '../LOB_516_Minutely_2007_2021/data_by_stocks'
# Walk every stock folder and build the wide mid-price table; this call also
# writes the intermediate long table to ./result_df.pkl.
price_data_df = compile_price_data(root_directory)
# Preview the first rows of the result.
price_data_df.head()

In [None]:
# Persist the wide mid-price table as CSV; index=False omits the positional row index.
price_data_df.to_csv('./mid_price_1min.csv', index=False)