In [1]:
import pandas as pd
import numpy as np
import os

import datetime
HOME = "/home/dimitri/epita/big_data/project/bourse/data/"

In [2]:
def extract_date_hours(path):
    file_name = path.split('/')[-1]  # Extract the file name
    date_str, time_str = file_name.split(' ')[1:3]  # Extract date and time parts
    time_str = ".".join(time_str.split('.')[0:2])
    date_time_str = f"{date_str} {time_str}"
    date_time = datetime.datetime.strptime(date_time_str, '%Y-%m-%d %H:%M:%S.%f')
    return date_time

def merge_df(file_path, df):
    df_tmp = pd.read_pickle(
    file_path,
    )
    # df.reset_index(drop=True, inplace=True)
    df_tmp['date'] = extract_date_hours(file_path)
    return pd.concat([df, df_tmp])
def extract_symbole(df):
    df["boursorama"] = df["symbol"].copy()
    df["symbol"] = df["symbol"].apply(lambda x: x[3: len(x)])
def extract_identifiant_companies(df):
    df["prefix"] = df["boursorama"].apply(lambda x: x[0:3])
    
def get_df_bourso(num_files=100):
    df = pd.DataFrame()
    dir = os.listdir(HOME + "boursorama")
    i = 0
    for dir_date in dir :
        list_file_path = os.listdir(HOME + "boursorama/" + dir_date)
        for file_path in list_file_path :
            df = merge_df(HOME + "boursorama/" + dir_date + "/" + file_path, df)
            if (i == num_files) :
                break
            i+=1
        break
    extract_symbole(df)
    extract_identifiant_companies(df)

    return df

In [None]:
import pandas as pd
import glob
import os
import re

def load_dataset(data_path, n):
    # Print the number of files in the directory
    files = os.listdir(data_path)
    print(f"Number of files in the directory: {len(files)}")

    # Use glob to get all files that start with 'Euronext_Equities_' and end with .csv or .xlsx
    file_pattern = os.path.join(data_path, "Euronext_Equities_*.*")
    files = glob.glob(file_pattern)

    # List to store individual dataframes
    df_list = []
    counter = 0

    # Loop through each file and process based on file extension
    for file in files:
        if counter == n:
            break
        try:
            if file.lower().endswith('.csv'):
                df = pd.read_csv(file, encoding='utf-8', sep='\t')
            elif file.lower().endswith('.xlsx'):
                df = pd.read_excel(file)
            else:
                # Skip unknown file types
                print("Unsupported file type")
                continue

            # Extract the date from the filename using a regex (assuming format YYYY-MM-DD)
            date_match = re.search(r'(\d{4}-\d{2}-\d{2})', os.path.basename(file))
            if date_match:
                file_date = date_match.group(1)
                # Add the second and milisecond to the date
                file_date = file_date + " 00:00:00.000000"
                file_date = datetime.datetime.strptime(file_date, '%Y-%m-%d %H:%M:%S.%f')
                # Add the date as a new column (as datetime type)
                df['date'] = file_date
            else:
                # Optionally log or handle files without a proper date in the filename
                df['file_date'] = pd.NaT

            # Append the dataframe to the list
            df_list.append(df)
            counter += 1
        except Exception as e:
            print(f"Error processing file {file}: {e}")

    return df_list

def get_df_euronext(n):
    # Define the path to the directory containing the files
    data_path = HOME + "euronext"

    df_list = load_dataset(data_path, n)

    # Concatenate all dataframes into one robust dataframe
    if df_list:
        combined_df = pd.concat(df_list, ignore_index=True)
    else:
        combined_df = pd.DataFrame()

    # Standardize column names: trim whitespace, lower case, and replace spaces with underscores
    combined_df.columns = combined_df.columns.str.strip().str.lower().str.replace(' ', '_')

    # Define a mapping of equivalent column names to merge differences between CSV and XLSX files
    column_synonyms = {
        'ticker': ['ticker', 'symbol'],
        'name': ['company', 'company_name', 'name'],
        'price': ['price', 'closing_price', 'last_price', 'last'],
        'currency': ['currency', 'trading_currency'],
        'open': ['open', 'open_price'],
        'high': ['high', 'high_price'],
        'low': ['low', 'low_price'],
        'last_trade_time': ['last_trade_time', 'last_trade_mic_time', 'last_date/time']
    }

    # Merge equivalent columns
    for canonical, synonyms in column_synonyms.items():
        # Find which of the synonym columns are present in the dataframe
        cols_present = [col for col in synonyms if col in combined_df.columns]
        if len(cols_present) > 1:
            # Merge the columns: use the first non-null value among the columns
            combined_df[canonical] = combined_df[cols_present[0]].combine_first(combined_df[cols_present[1]])
            for col in cols_present[2:]:
                combined_df[canonical] = combined_df[canonical].combine_first(combined_df[col])
            # Drop the extra synonym columns, keeping the canonical one
            for col in cols_present:
                if col != canonical:
                    combined_df.drop(columns=col, inplace=True)
        elif len(cols_present) == 1 and cols_present[0] != canonical:
            # Rename the column to the canonical name
            combined_df.rename(columns={cols_present[0]: canonical}, inplace=True)

    # Remove the closing price column if it exists
    if 'closing_price_datetime' in combined_df.columns:
        combined_df.drop(columns='closing_price_datetime', inplace=True)

    # Optionally, drop duplicates
    combined_df.drop_duplicates(inplace=True)

    # Remove the rows where the isin value is NaN
    combined_df = combined_df[~combined_df['isin'].isna()]

    # Add a new column 'pea' based on the currency column
    if 'currency' in combined_df.columns:
        combined_df['pea'] = combined_df['currency'].apply(
            lambda x: True if isinstance(x, str) and x.upper() == 'EUR' else False)
    else:
        combined_df['pea'] = False

    # Reset the index of the dataframe
    combined_df.reset_index(drop=True, inplace=True)

    #   Replace invalid values with NaN
    combined_df["high"] = combined_df["high"].replace('-', np.nan)
    combined_df["low"] = combined_df["low"].replace('-', np.nan)

    # Convert columns to float
    combined_df["high"] = combined_df["high"].astype(float)
    combined_df["low"] = combined_df["low"].astype(float)
    
    # TODO: Add a close column to the dataframe that's will be the open value of the next day
    return combined_df


In [4]:
df_boursorama = get_df_bourso(100)
df_boursorama.head(10)

Unnamed: 0_level_0,last,volume,symbol,name,date,boursorama,prefix
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1rPABBV,58.01,0,ABBV,ABBVIE,2019-02-28 13:32:01.512834,1rPABBV,1rP
1rPAC,37.29,405421,AC,ACCOR,2019-02-28 13:32:01.512834,1rPAC,1rP
1rPACNV,46.02,0,ACNV,ACCOR,2019-02-28 13:32:01.512834,1rPACNV,1rP
1rPADP,171.7,22674,ADP,ADP,2019-02-28 13:32:01.512834,1rPADP,1rP
1rPAF,10.86,3512926,AF,AIR FRANCE - KLM,2019-02-28 13:32:01.512834,1rPAF,1rP
1rPAI,109.65,163127,AI,AIR LIQUIDE,2019-02-28 13:32:01.512834,1rPAI,1rP
1rPAIR,113.24,285724,AIR,AIRBUS,2019-02-28 13:32:01.512834,1rPAIR,1rP
1rPAKA,54.8,2613,AKA,AKKA TECHNOLOGIES,2019-02-28 13:32:01.512834,1rPAKA,1rP
1rPALUNV,3.46,0,ALUNV,ALCATEL I15,2019-02-28 13:32:01.512834,1rPALUNV,1rP
1rPALU,3.5,0,ALU,ALCATEL-LUCENT,2019-02-28 13:32:01.512834,1rPALU,1rP


In [5]:
df_euronext = get_df_euronext(10)
df_euronext.head()

Number of files in the directory: 775


Unnamed: 0,name,isin,ticker,market,open,high,low,time_zone,volume,turnover,date,currency,price,last_trade_time,pea
0,1000MERCIS,FR0010285965,ALMIL,Euronext Growth Paris,29.7,29.9,29.7,CET,561,16710.5,2022-03-31,EUR,29.8,31/03/2022 13:37,True
1,2CRSI,FR0013341781,2CRSI,Euronext Paris,4.36,4.38,4.28,CET,5752,24887.265,2022-03-31,EUR,4.33,31/03/2022 17:35,True
2,2MX ORGANIC,FR0014000T90,2MX,Euronext Paris,9.79,9.879,9.79,CET,150,1477.305,2022-03-31,EUR,9.865,31/03/2022 14:03,True
3,2MX ORGANIC BS,FR0014000TB2,2MXBS,Euronext Paris,0.12,0.12,0.12,CET,3000,360.0,2022-03-31,EUR,0.12,16/03/2022 09:10,True
4,A TOUTE VITESSE,FR0010050773,MLATV,Euronext Access Paris,1.48,1.48,1.35,CET,378,521.87,2022-03-31,EUR,1.35,13/11/2019 16:53,True


In [6]:
def merge_dataset(df_boursorama, df_euronext, delete_name_alone=True):
    # delete_name_alone : delete the rows when the name is only in bourso and not in euronext
    if delete_name_alone:
        # TODO: make the same for euronext
        df_boursorama2 = df_boursorama[df_boursorama['name'].isin(df_euronext['name'])].copy()
    else:
        # TODO: make the same for euronext
        df_boursorama2 = df_boursorama.copy()

    all_columns = list(set(df_euronext.columns).union(set(df_boursorama2.columns)))

    for col in all_columns:
        if col not in df_euronext.columns:
            df_euronext[col] = np.nan

        if col not in df_boursorama2.columns:
            df_boursorama2[col] = np.nan

    df = pd.concat([df_boursorama2, df_euronext])


    isin_mapping = df_euronext.set_index('name')['isin'].to_dict()
    df['isin'] = df['isin'].fillna(df['name'].map(isin_mapping))
    df_boursorama['isin'] = np.nan
    df_boursorama['isin'] = df_boursorama['isin'].fillna(df_boursorama['name'].map(isin_mapping))
    #TODO : fill the other variables with the euronext values
    #TODO: fill the symbole value in the euronext with the bourso value

    
    return df

df = merge_dataset(df_boursorama, df_euronext).head()


In [7]:
df.head()

Unnamed: 0,last,volume,symbol,name,date,boursorama,prefix,time_zone,currency,market,pea,low,ticker,price,turnover,last_trade_time,high,isin,open
1rPAC,37.29,405421,AC,ACCOR,2019-02-28 13:32:01.512834,1rPAC,1rP,,,,,,,,,,,FR0000120404,
1rPACNV,46.02,0,ACNV,ACCOR,2019-02-28 13:32:01.512834,1rPACNV,1rP,,,,,,,,,,,FR0000120404,
1rPADP,171.7,22674,ADP,ADP,2019-02-28 13:32:01.512834,1rPADP,1rP,,,,,,,,,,,FR0010340141,
1rPAI,109.65,163127,AI,AIR LIQUIDE,2019-02-28 13:32:01.512834,1rPAI,1rP,,,,,,,,,,,FR0000120073,
1rPAIR,113.24,285724,AIR,AIRBUS,2019-02-28 13:32:01.512834,1rPAIR,1rP,,,,,,,,,,,NL0000235190,


| **Table**     | **Colonne**   | **Type**           | **Description** |
|--------------|-------------|------------------|--------------|
| **companies** | id        | SMALLINT (PK)   | Identifiant unique de l'entreprise |
|             | name        | VARCHAR         | Nom de l'entreprise |
|             | mid        | SMALLINT        | Identifiant du marché (référence vers `markets.id`) |
|             | symbol     | VARCHAR         | Symbole boursier de l'entreprise |
|             | isin       | CHAR(12)        | Code ISIN (International Securities Identification Number) |
|             | boursorama | VARCHAR         | Identifiant de l'entreprise sur Boursorama |
|             | euronext   | VARCHAR         | Identifiant de l'entreprise sur Euronext |
|             | pea       | BOOLEAN         | Indique si l'action est éligible au Plan d'Épargne en Actions (PEA) |
|             | sector1   | VARCHAR         | Secteur principal de l'entreprise |
|             | sector2   | VARCHAR         | Secteur secondaire de l'entreprise |
|             | sector3   | VARCHAR         | Secteur tertiaire de l'entreprise |


In [8]:

def populate_companies(df, db, df_markets):
    df_companies = pd.DataFrame()

    df_companies["isin"] = df["isin"].values
    df_companies["name"] = df["name"].values

    mid = df["prefix"].map(
        lambda prefix: df_markets.loc[df_markets["boursorama"] == prefix, "id"].values[0]
    ).values
    
    df_companies["mid"] = mid
    df_companies["symbol"] = df["symbol"].values
    df_companies["boursorama"] = df["boursorama"].values
    df_companies["id"] =  np.arange(len(df_companies))

    df_companies["euronext"] = df["ticker"].values
    eligible_pea = ["Bourse de Milan", "Mercados Espanoles", "Amsterdam","Paris", "Deutsche Borse","Bruxelle"]

    names = df["prefix"].map(
        lambda prefix: df_markets.loc[df_markets["boursorama"] == prefix, "name"].values[0]
    ).values
    
    is_in_eligible_pea = [name in eligible_pea for name in names]
    df_companies["pea"] = is_in_eligible_pea


    df_companies["sector1"] = "" # TODO
    df_companies["sector2"] = "" # TODO
    df_companies["sector3"] = "" # TODO

    # db.df_write(df_companies, 'companies')
    return df_companies

Voici un tableau explicatif des colonnes de chaque table :  

| **Table**     | **Colonne**   | **Type**           | **Description** |
|--------------|-------------|------------------|--------------|
| **markets** | id          | SMALLINT (PK)   | Identifiant unique du marché |
|             | name        | VARCHAR         | Nom du marché |
|             | alias       | VARCHAR         | Alias du marché |
|             | boursorama  | VARCHAR         | Préfixe du marché sur Boursorama |
|             | sws        | VARCHAR         | Nom du marché sur Simply Wall Street |
|             | euronext    | VARCHAR         | Nom du marché sur Euronext |

In [9]:
from timescaledb_model import initial_markets_data

def populate_markets(df: pd.DataFrame, db, df_bourso: pd.DataFrame, df_eronext: pd.DataFrame):
    # Convert initial_markets_data to a DataFrame for easier manipulation
    initial_data = pd.DataFrame(
        initial_markets_data,
        columns=["id", "name", "alias", "boursorama", "euronext", "sws"]
    )

    # Create the markets DataFrame
    df_markets = pd.DataFrame()
    df_markets["id"] = initial_data["id"]
    df_markets["name"] = initial_data["name"]
    df_markets["alias"] = initial_data["alias"]

    # Map boursorama prefixes to the corresponding markets
    df_markets["boursorama"] = initial_data["boursorama"]

    # Map euronext tickers to the corresponding markets
    df_markets["euronext"] = np.nan

    # Fill the "sws" column with data from initial_markets_data
    df_markets["sws"] = initial_data["sws"]

    # Write the populated DataFrame to the database
    # db.df_write(df_markets, "markets")

    return df_markets

Voici un tableau explicatif des colonnes de chaque table :  

| **Table**     | **Colonne**   | **Type**           | **Description** |
|--------------|-------------|------------------|--------------|
| **stocks**  | date      | TIMESTAMPTZ     | Date et heure de la cotation |
|             | cid       | SMALLINT        | Identifiant de l'entreprise (référence vers `companies.id`) |
|             | value     | FLOAT4          | Valeur de l'action à cet instant |
|             | volume    | FLOAT4          | Volume échangé à cet instant |


In [10]:
def populate_stocks(df_boursorama: pd.DataFrame, db, df_companies):
    df_stocks = pd.DataFrame()
    df_stocks["date"] = df_boursorama["date"].values

    cid = df_boursorama["isin"].map(
        lambda prefix: df_companies.loc[df_companies["isin"] == prefix, "id"].values[0]
        if not df_companies.loc[df_companies["isin"] == prefix, "id"].empty
        else pd.NA
    ).values
    df_stocks["cid"] = cid
    df_stocks["value"] = df_boursorama["last"].values
    df_stocks["volume"] = df_boursorama["volume"].values
    # db.df_write(df_stocks, "stocks")
    return df_stocks

In [21]:
def populate_daystocks(df_euronext: pd.DataFrame, db, df_companies):
    df_daystocks = pd.DataFrame()
    df_daystocks["date"] = df_euronext["date"].values
    df_daystocks["cid"] = df_euronext["isin"].map(
        lambda prefix: df_companies.loc[df_companies["isin"] == prefix, "id"].values[0]
        if not df_companies.loc[df_companies["isin"] == prefix, "id"].empty
        else pd.NA
    ).values

    df_daystocks["open"] = df_euronext["open"].values
    df_daystocks["close"] = df_euronext["last"].values
    df_daystocks["high"] = df_euronext["high"].values
    df_daystocks["low"] = df_euronext["low"].values
    df_daystocks["volume"] = df_euronext["volume"].values
    df_daystocks["mean"] = (df_euronext["high"].values + df_euronext["low"].values) / 2
    df_daystocks["std"] = df_euronext["high"].values - df_euronext["low"].values
    # db.df_write(df_daystocks, "daystocks")
    return df_daystocks

In [25]:
import numpy as np

# Replace invalid values with NaN
df_euronext["high"] = df_euronext["high"].replace('-', np.nan)
df_euronext["low"] = df_euronext["low"].replace('-', np.nan)

# Convert columns to float
df_euronext["high"] = df_euronext["high"].astype(float)
df_euronext["low"] = df_euronext["low"].astype(float)

In [26]:
df_daystocks = populate_daystocks(df_euronext, None, df_companies)
df_daystocks.head()

Unnamed: 0,date,cid,open,close,high,low,volume,mean,std
0,2022-03-31,,29.7,,29.9,29.7,561,29.8,0.2
1,2022-03-31,,4.36,,4.38,4.28,5752,4.33,0.1
2,2022-03-31,,9.79,,9.879,9.79,150,9.8345,0.089
3,2022-03-31,,0.12,,0.12,0.12,3000,0.12,0.0
4,2022-03-31,,1.48,,1.48,1.35,378,1.415,0.13


In [15]:
df_markets = populate_markets(None, None, None, None)
df_markets.head()

Unnamed: 0,id,name,alias,boursorama,euronext,sws
0,1,New York,nyse,,,
1,2,London Stock Exchange,lse,1u*.L,,
2,3,Bourse de Milan,milano,1g,,
3,4,Mercados Espanoles,mercados,FF55-,,
4,5,Amsterdam,amsterdam,1rA,,Amsterdam


In [16]:
df_companies = populate_companies(df, None, df_markets)
df_companies.head()

Unnamed: 0,isin,name,mid,symbol,boursorama,id,euronext,pea,sector1,sector2,sector3
0,FR0000120404,ACCOR,6,AC,1rPAC,0,,True,,,
1,FR0000120404,ACCOR,6,ACNV,1rPACNV,1,,True,,,
2,FR0010340141,ADP,6,ADP,1rPADP,2,,True,,,
3,FR0000120073,AIR LIQUIDE,6,AI,1rPAI,3,,True,,,
4,NL0000235190,AIRBUS,6,AIR,1rPAIR,4,,True,,,


In [17]:
df_stocks = populate_stocks(df_boursorama, None, df_companies)
df_stocks.head()

Unnamed: 0,date,cid,value,volume
0,2019-02-28 13:32:01.512834,,58.01,0
1,2019-02-28 13:32:01.512834,0.0,37.29,405421
2,2019-02-28 13:32:01.512834,0.0,46.02,0
3,2019-02-28 13:32:01.512834,2.0,171.7,22674
4,2019-02-28 13:32:01.512834,,10.86,3512926


In [38]:
df_boursorama.head()

Unnamed: 0_level_0,last,volume,symbol,name,date,boursorama,prefix,isin
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1rPABBV,58.01,0,ABBV,ABBVIE,2019-02-28 13:32:01.512834,1rPABBV,1rP,
1rPAC,37.29,405421,AC,ACCOR,2019-02-28 13:32:01.512834,1rPAC,1rP,FR0000120404
1rPACNV,46.02,0,ACNV,ACCOR,2019-02-28 13:32:01.512834,1rPACNV,1rP,FR0000120404
1rPADP,171.7,22674,ADP,ADP,2019-02-28 13:32:01.512834,1rPADP,1rP,FR0010340141
1rPAF,10.86,3512926,AF,AIR FRANCE - KLM,2019-02-28 13:32:01.512834,1rPAF,1rP,
