In [6]:
import pandas as pd
import numpy as np
import os

import datetime
# Root directory that holds the raw data sets used by this notebook.
# Set the BOURSE_DATA_DIR environment variable to point at another location;
# when it is unset the original hard-coded path is used unchanged.
# os.path.join(..., "") guarantees a trailing separator, so `HOME + "subdir"`
# style concatenation keeps working whatever value is supplied.
HOME = os.path.join(
    os.environ.get("BOURSE_DATA_DIR", "/home/dimitri/epita/big_data/project/bourse/data/"),
    "",
)

In [12]:
def extract_date_hours(path):
    """Parse the timestamp embedded in a snapshot file name.

    The file name (last path component) is split on single spaces: the second
    token is read as the date (``YYYY-MM-DD``) and the third as the time
    (``HH:MM:SS``, optionally followed by ``.ffffff``).  Whatever follows the
    time, such as a file extension, is ignored.
    Example: ``"compA 2019-02-28 13:32:01.512834.bz2"``.

    Args:
        path: Path (or bare file name) of the snapshot file.

    Returns:
        datetime.datetime: The naive timestamp encoded in the file name.

    Raises:
        ValueError: If the name has fewer than three space-separated tokens,
            or the date/time tokens do not match the expected format.
    """
    file_name = os.path.basename(path)
    date_str, time_str = file_name.split(' ')[1:3]

    # Split "HH:MM:SS" from what follows the first dot.  That remainder is a
    # fractional-second field only when it is numeric; for a whole-second
    # stamp (e.g. Python's str(datetime) omits the fraction when microseconds
    # are zero) it is the file extension and must not be fed to "%f".
    clock, _, remainder = time_str.partition('.')
    fraction = remainder.split('.')[0]
    if fraction.isdigit():
        return datetime.datetime.strptime(
            f"{date_str} {clock}.{fraction}", '%Y-%m-%d %H:%M:%S.%f')
    return datetime.datetime.strptime(f"{date_str} {clock}", '%Y-%m-%d %H:%M:%S')

def merge_df(file_path, df):
    """Append one pickled snapshot to an accumulated frame.

    The pickle at ``file_path`` is loaded, given a ``date`` column holding the
    timestamp parsed from its file name, and concatenated after the rows
    already present in ``df``.

    Args:
        file_path: Path of the pickle file to load (presumably a pickled
            DataFrame -- confirm against the data set).
        df: Frame collected so far; may be empty.

    Returns:
        A new frame made of ``df`` followed by the loaded rows.  Indexes are
        kept as they are (no re-numbering).
    """
    # NOTE: unpickling can execute code stored in the file -- only load
    # snapshots that come from a trusted source.
    snapshot = pd.read_pickle(file_path)
    snapshot_time = extract_date_hours(file_path)
    snapshot['date'] = snapshot_time
    return pd.concat((df, snapshot))
  
def get_df_bourso(num_files=100):
    """Load up to ``num_files`` Boursorama snapshots into a single frame.

    Sub-directories of ``HOME/boursorama`` are visited in sorted order, and the
    files inside each one in sorted order, so the same files are selected on
    every run (``os.listdir`` alone returns entries in arbitrary order).
    Loading continues into the next sub-directory until the limit is reached.

    Args:
        num_files: Maximum number of files to load.  ``None`` means no limit.

    Returns:
        The frame obtained by passing each selected file through ``merge_df``;
        an empty ``DataFrame`` when nothing was loaded.
    """
    df = pd.DataFrame()
    root = os.path.join(HOME, "boursorama")
    loaded = 0
    for dir_date in sorted(os.listdir(root)):
        dir_path = os.path.join(root, dir_date)
        if not os.path.isdir(dir_path):
            # Stray files next to the per-date directories are ignored.
            continue
        for file_name in sorted(os.listdir(dir_path)):
            # Check the limit *before* loading so exactly num_files files are
            # read (checking afterwards would load one file too many).
            if num_files is not None and loaded >= num_files:
                return df
            df = merge_df(os.path.join(dir_path, file_name), df)
            loaded += 1
    return df

In [16]:
import pandas as pd
import glob
import os
import re

def load_dataset(data_path, n):
    """Read up to ``n`` ``Euronext_Equities_*`` files from a directory.

    Candidate files are processed in sorted path order so that the same files
    are picked on every run (``glob`` returns matches in arbitrary order).
    ``.csv`` files are read as tab-separated UTF-8 text and ``.xlsx`` files
    with ``pd.read_excel``; any other extension is reported and skipped.
    Each frame gets a ``file_date`` column holding the first ``YYYY-MM-DD``
    date found in the file name, or ``NaT`` when there is none.

    A file that fails to load is reported on stdout and skipped; skipped or
    failed files do not count towards ``n``.

    Args:
        data_path: Directory containing the files.
        n: Number of frames after which loading stops.  A value that is never
            reached (e.g. ``None``) loads every matching file.

    Returns:
        list[pd.DataFrame]: One frame per successfully loaded file.
    """
    # Informational only: counts every directory entry, not just the matches.
    print(f"Number of files in the directory: {len(os.listdir(data_path))}")

    file_pattern = os.path.join(data_path, "Euronext_Equities_*.*")
    files = sorted(glob.glob(file_pattern))

    df_list = []
    for file in files:
        if len(df_list) == n:
            break
        try:
            lowered = file.lower()
            if lowered.endswith('.csv'):
                df = pd.read_csv(file, encoding='utf-8', sep='\t')
            elif lowered.endswith('.xlsx'):
                df = pd.read_excel(file)
            else:
                # Skip unknown file types
                print("Unsupported file type")
                continue

            # The date is taken from the file name, not from the file content.
            date_match = re.search(r'(\d{4}-\d{2}-\d{2})', os.path.basename(file))
            if date_match:
                df['file_date'] = pd.to_datetime(date_match.group(1))
            else:
                df['file_date'] = pd.NaT

            df_list.append(df)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole load.
            print(f"Error processing file {file}: {e}")

    return df_list

def get_df_euronext(n):
    """Load up to ``n`` Euronext files and return one cleaned, unified frame.

    Steps, in order:
      1. Load the files found under ``HOME/euronext`` with ``load_dataset``.
      2. Concatenate them and normalise column names (trimmed, lower case,
         spaces replaced by underscores).
      3. Fold columns that carry the same information under different names
         into one canonical column, taking the first non-null value per row.
      4. Drop ``closing_price_datetime`` if present, drop duplicate rows and,
         when an ``isin`` column exists, drop rows whose ``isin`` is missing.
      5. Add a boolean ``pea`` column: ``True`` where ``currency`` is the
         string ``'EUR'`` (case-insensitive); ``False`` everywhere when there
         is no ``currency`` column.

    Args:
        n: Passed to ``load_dataset`` as the number of files to load.

    Returns:
        pd.DataFrame: The cleaned frame with a fresh 0..N-1 index, or an empty
        ``DataFrame`` when no file could be loaded.
    """
    data_path = os.path.join(HOME, "euronext")
    df_list = load_dataset(data_path, n)

    # Nothing loaded: none of the column-based steps below can apply.
    if not df_list:
        return pd.DataFrame()

    combined_df = pd.concat(df_list, ignore_index=True)

    # Standardize column names: trim whitespace, lower case, spaces -> underscores
    combined_df.columns = combined_df.columns.str.strip().str.lower().str.replace(' ', '_')

    # Canonical name -> names to fold into it.  The order of each list is the
    # priority used when several of them hold a value on the same row.
    column_synonyms = {
        'ticker': ['ticker', 'symbol'],
        'company': ['company', 'company_name', 'name'],
        'price': ['price', 'closing_price', 'last_price', 'last'],
        'currency': ['currency', 'trading_currency'],
        'open': ['open', 'open_price'],
        'high': ['high', 'high_price'],
        'low': ['low', 'low_price'],
        'last_trade_time': ['last_trade_time', 'last_trade_mic_time', 'last_date/time']
    }

    for canonical, synonyms in column_synonyms.items():
        cols_present = [col for col in synonyms if col in combined_df.columns]
        if len(cols_present) > 1:
            # First non-null value across the present columns, in list order.
            merged = combined_df[cols_present[0]]
            for col in cols_present[1:]:
                merged = merged.combine_first(combined_df[col])
            combined_df[canonical] = merged
            combined_df = combined_df.drop(
                columns=[col for col in cols_present if col != canonical])
        elif cols_present and cols_present[0] != canonical:
            combined_df = combined_df.rename(columns={cols_present[0]: canonical})

    # Remove the closing price timestamp column if it exists
    combined_df = combined_df.drop(columns='closing_price_datetime', errors='ignore')

    combined_df = combined_df.drop_duplicates()

    # Guarded like the other optional columns so that a data set without an
    # 'isin' column does not fail with a KeyError.
    if 'isin' in combined_df.columns:
        combined_df = combined_df[combined_df['isin'].notna()]

    combined_df = combined_df.reset_index(drop=True)

    if 'currency' in combined_df.columns:
        is_eur = combined_df['currency'].map(
            lambda value: isinstance(value, str) and value.upper() == 'EUR')
    else:
        is_eur = False

    # assign() returns a new frame, so the column is never written onto a
    # filtered slice of another frame (no SettingWithCopyWarning).
    return combined_df.assign(pea=is_eur)


In [17]:
# Build the Boursorama frame by calling get_df_bourso with num_files=10,
# then preview its first rows.
df_boursorama = get_df_bourso(10)
df_boursorama.head()

Unnamed: 0_level_0,last,volume,symbol,name,date
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1rPABBV,58.01,0,1rPABBV,ABBVIE,2019-02-28 13:32:01.512834
1rPAC,37.29,405421,1rPAC,ACCOR,2019-02-28 13:32:01.512834
1rPACNV,46.02,0,1rPACNV,ACCOR,2019-02-28 13:32:01.512834
1rPADP,171.7,22674,1rPADP,ADP,2019-02-28 13:32:01.512834
1rPAF,10.86,3512926,1rPAF,AIR FRANCE - KLM,2019-02-28 13:32:01.512834


In [18]:
# Build the cleaned Euronext frame by calling get_df_euronext with n=10,
# then preview its first rows.
df_euronext = get_df_euronext(10)
df_euronext.head()

Number of files in the directory: 775


Unnamed: 0,company,isin,ticker,market,open,high,low,time_zone,volume,turnover,file_date,currency,price,last_trade_time,pea
0,1000MERCIS,FR0010285965,ALMIL,Euronext Growth Paris,29.7,29.9,29.7,CET,561,16710.5,2022-03-31,EUR,29.8,31/03/2022 13:37,True
1,2CRSI,FR0013341781,2CRSI,Euronext Paris,4.36,4.38,4.28,CET,5752,24887.265,2022-03-31,EUR,4.33,31/03/2022 17:35,True
2,2MX ORGANIC,FR0014000T90,2MX,Euronext Paris,9.79,9.879,9.79,CET,150,1477.305,2022-03-31,EUR,9.865,31/03/2022 14:03,True
3,2MX ORGANIC BS,FR0014000TB2,2MXBS,Euronext Paris,0.12,0.12,0.12,CET,3000,360.0,2022-03-31,EUR,0.12,16/03/2022 09:10,True
4,A TOUTE VITESSE,FR0010050773,MLATV,Euronext Access Paris,1.48,1.48,1.35,CET,378,521.87,2022-03-31,EUR,1.35,13/11/2019 16:53,True
