In [1]:
import pandas as pd
import numpy as np
import os

import datetime
from logging import getLogger
HOME = "/home/dimitri/epita/big_data/project/bourse/data/"
logger = getLogger(__name__)

In [41]:
import os
import pandas as pd
import datetime
import re
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import ProcessPoolExecutor

# Regex optimisée pour extraire date + heure
_date_re = re.compile(r'(\d{4}-\d{2}-\d{2}) (\d{2}:\d{2}:\d{2}\.\d+)')

def extract_date_hours(path):
    file_name = path.split('/')[-1]
    match = _date_re.search(file_name)
    if not match:
        return None

    date_str = match.group(1)
    time_str = match.group(2)
    return datetime.datetime.strptime(f"{date_str} {time_str}", '%Y-%m-%d %H:%M:%S.%f')


def extract_symbole(df):
    df["boursorama"] = df["symbol"].str[:3]
    df["symbol"] = df["symbol"].str[3:]


def list_all_file():
    list_path = []
    for dir_date in os.listdir(HOME + "boursorama"):
        for file_name in os.listdir(HOME + "boursorama/" + dir_date):
            full_path = os.path.join(HOME, "boursorama", dir_date, file_name)
            list_path.append(full_path)
    return list_path
def delete_volument_equal_zero(df):
    df = df[df['volume'] != 0]
    return df
def delete_symbole_ending_with_NV(df):
    df = df[~df['symbol'].str.endswith('NV')]
    return df
def get_df(list_path, start = 0, end = 10000):
    df_list = []
    for path in list_path:
        df_tmp = pd.read_pickle(path)
        df_tmp['date'] = extract_date_hours(path)
        df_list.append(df_tmp)
        start += 1
        if start >= end:
            break
    df = pd.concat(df_list, ignore_index=True)
    extract_symbole(df)
    df = delete_symbole_ending_with_NV(df)
    df = delete_volument_equal_zero(df)
    logger.info("All files loaded")
    return df


In [None]:
import pandas as pd
import glob
import os
import re

def load_dataset(data_path, n):
    # Print the number of files in the directory
    files = os.listdir(data_path)
    print(f"Number of files in the directory: {len(files)}")

    # Use glob to get all files that start with 'Euronext_Equities_' and end with .csv or .xlsx
    file_pattern = os.path.join(data_path, "Euronext_Equities_*.*")
    files = glob.glob(file_pattern)

    # List to store individual dataframes
    df_list = []
    counter = 0

    # Loop through each file and process based on file extension
    for file in files:
        if counter == n:
            break
        try:
            if file.lower().endswith('.csv'):
                df = pd.read_csv(file, encoding='utf-8', sep='\t')
            elif file.lower().endswith('.xlsx'):
                df = pd.read_excel(file)
            else:
                # Skip unknown file types
                print("Unsupported file type")
                continue

            # Extract the date from the filename using a regex (assuming format YYYY-MM-DD)
            date_match = re.search(r'(\d{4}-\d{2}-\d{2})', os.path.basename(file))
            if date_match:
                file_date = date_match.group(1)
                # Add the second and milisecond to the date
                file_date = file_date + " 00:00:00.000000"
                file_date = datetime.datetime.strptime(file_date, '%Y-%m-%d %H:%M:%S.%f')
                # Add the date as a new column (as datetime type)
                df['date'] = file_date
            else:
                # Optionally log or handle files without a proper date in the filename
                df['file_date'] = pd.NaT

            # Append the dataframe to the list
            df_list.append(df)
            counter += 1
        except Exception as e:
            print(f"Error processing file {file}: {e}")

    return df_list
def delete_symbole_ending_with_NV(df):
    df = df[~df['ticker'].str.endswith('NV')]
    return df
def get_df_euronext(n):
    # Define the path to the directory containing the files
    data_path = HOME + "euronext"

    df_list = load_dataset(data_path, n)

    # Concatenate all dataframes into one robust dataframe
    if df_list:
        combined_df = pd.concat(df_list, ignore_index=True)
    else:
        combined_df = pd.DataFrame()

    # Standardize column names: trim whitespace, lower case, and replace spaces with underscores
    combined_df.columns = combined_df.columns.str.strip().str.lower().str.replace(' ', '_')

    # Define a mapping of equivalent column names to merge differences between CSV and XLSX files
    column_synonyms = {
        'ticker': ['ticker', 'symbol'],
        'name': ['company', 'company_name', 'name'],
        'price': ['price', 'closing_price', 'last_price', 'last'],
        'currency': ['currency', 'trading_currency'],
        'open': ['open', 'open_price'],
        'high': ['high', 'high_price'],
        'low': ['low', 'low_price'],
        'last_trade_time': ['last_trade_time', 'last_trade_mic_time', 'last_date/time']
    }

    # Merge equivalent columns
    for canonical, synonyms in column_synonyms.items():
        # Find which of the synonym columns are present in the dataframe
        cols_present = [col for col in synonyms if col in combined_df.columns]
        if len(cols_present) > 1:
            # Merge the columns: use the first non-null value among the columns
            combined_df[canonical] = combined_df[cols_present[0]].combine_first(combined_df[cols_present[1]])
            for col in cols_present[2:]:
                combined_df[canonical] = combined_df[canonical].combine_first(combined_df[col])
            # Drop the extra synonym columns, keeping the canonical one
            for col in cols_present:
                if col != canonical:
                    combined_df.drop(columns=col, inplace=True)
        elif len(cols_present) == 1 and cols_present[0] != canonical:
            # Rename the column to the canonical name
            combined_df.rename(columns={cols_present[0]: canonical}, inplace=True)

    # Remove the closing price column if it exists
    if 'closing_price_datetime' in combined_df.columns:
        combined_df.drop(columns='closing_price_datetime', inplace=True)

    # Optionally, drop duplicates
    combined_df.drop_duplicates(inplace=True)

    # Remove the rows where the isin value is NaN
    combined_df = combined_df[~combined_df['isin'].isna()]

    # Add a new column 'pea' based on the currency column
    if 'currency' in combined_df.columns:
        combined_df['pea'] = combined_df['currency'].apply(
            lambda x: True if isinstance(x, str) and x.upper() == 'EUR' else False)
    else:
        combined_df['pea'] = False

    # Reset the index of the dataframe
    combined_df.reset_index(drop=True, inplace=True)

    #   Replace invalid values with NaN
    combined_df["high"] = combined_df["high"].replace('-', np.nan)
    combined_df["low"] = combined_df["low"].replace('-', np.nan)

    # Convert columns to float
    combined_df["high"] = combined_df["high"].astype(float)
    combined_df["low"] = combined_df["low"].astype(float)
    
    # Trier les données par 'isin' et 'date' pour garantir l'ordre correct
    if 'isin' in combined_df.columns and 'date' in combined_df.columns:
        combined_df.sort_values(by=['isin', 'date'], inplace=True)

    # Ajouter une colonne 'close' qui correspond à la valeur 'open' du jour suivant pour chaque 'isin'
    if 'open' in combined_df.columns:
        combined_df['close'] = combined_df.groupby('isin')['open'].shift(-1)
    else:
        combined_df['close'] = pd.NA

    # Réinitialiser l'index après avoir ajouté la colonne 'close'
    combined_df.reset_index(drop=True, inplace=True)
    # remove all the line that contains NaN in the column close, high, low
    combined_df = combined_df.dropna(subset=['close', 'high', 'low'])
    # combined_df = delete_symbole_ending_with_NV(combined_df)
    return combined_df


In [4]:
dir = os.listdir(HOME + "boursorama")
i = 0
for dir_date in dir :
    list_file_path = os.listdir(HOME + "boursorama/" + dir_date)
    print(len(list_file_path))

44893
24931
25887
28177
28056
15054


In [5]:
list_path = list_all_file()
len(list_path)

166998

In [20]:
df_boursorama = get_df(list_path, 0, 1000)
df_boursorama.head(10)

Unnamed: 0,last,volume,symbol,name,date,boursorama
1,37.29,405421,AC,ACCOR,2019-02-28 13:32:01.512834,1rP
3,171.7,22674,ADP,ADP,2019-02-28 13:32:01.512834,1rP
4,10.86,3512926,AF,AIR FRANCE - KLM,2019-02-28 13:32:01.512834,1rP
5,109.65,163127,AI,AIR LIQUIDE,2019-02-28 13:32:01.512834,1rP
6,113.24,285724,AIR,AIRBUS,2019-02-28 13:32:01.512834,1rP
7,54.8,2613,AKA,AKKA TECHNOLOGIES,2019-02-28 13:32:01.512834,1rP
11,12.79,16658,ALD,ALD,2019-02-28 13:32:01.512834,1rP
13,37.43,124982,ALO,ALSTOM,2019-02-28 13:32:01.512834,1rP
15,181.4,313,ALTA,ALTAREA,2019-02-28 13:32:01.512834,1rP
16,90.7,15412,ATE,ALTEN,2019-02-28 13:32:01.512834,1rP


In [55]:
df_euronext = get_df_euronext(999)
df_euronext.head()

Number of files in the directory: 775


Unnamed: 0,name,isin,ticker,market,open,high,low,time_zone,volume,turnover,date,currency,price,last_trade_time,pea,close
0,SCHLUMBERGER,AN8068571086,SLB,Euronext Paris,16.9,17.5,15.45,CET,158448,2609873.0,2020-05-01,EUR,15.6,30/04/20 17:35,True,15.5
1,SCHLUMBERGER,AN8068571086,SLB,Euronext Paris,15.5,15.9,13.85,CET,167582,2413145.1,2020-05-04,EUR,14.4,04/05/20 17:35,True,15.0
2,SCHLUMBERGER,AN8068571086,SLB,Euronext Paris,15.0,15.9,15.0,CET,123769,1908206.3,2020-05-05,EUR,15.5,05/05/20 17:38,True,15.3
3,SCHLUMBERGER,AN8068571086,SLB,Euronext Paris,15.3,15.5,14.7,CET,66775,1003210.2,2020-05-06,EUR,15.25,06/05/20 17:35,True,15.1
4,SCHLUMBERGER,AN8068571086,SLB,Euronext Paris,15.1,15.55,14.8,CET,46711,717229.05,2020-05-07,EUR,15.45,07/05/20 17:35,True,16.25


In [56]:
axa = df_euronext[df_euronext['name'].str.startswith('AXA')]
# date between 2022-10-15 and 2022-10-20
a = axa[(axa['date'] >= '2022-10-15') & (axa['date'] <= '2023-10-20')]
# order by date
a = a.sort_values(by='date')
a.head()

Unnamed: 0,name,isin,ticker,market,open,high,low,time_zone,volume,turnover,date,currency,price,last_trade_time,pea,close
227251,AXA,FR0000120628,CS,Euronext Paris,24.48,24.485,24.195,CET,3613957,87883873.685,2022-10-20,EUR,24.29,20/10/2022 17:35,True,26.82
227252,AXA,FR0000120628,CS,Euronext Paris,26.82,26.83,26.335,CET,3477713,92496565.67,2022-11-15,EUR,26.63,15/11/2022 17:35,True,29.45
227253,AXA,FR0000120628,CS,Euronext Paris,29.45,29.54,29.07,CET,7264808,213297819.37,2023-03-10,EUR,29.405,10/03/2023 17:35,True,29.23
637721,AXA NV23,FR001400ED13,CSNV,Euronext Paris,28.02,28.02,28.02,CET,9783,274119.66,2023-03-10,EUR,28.02,09/03/2023 16:30,True,26.05
227254,AXA,FR0000120628,CS,Euronext Paris,29.23,29.235,27.5,CET,12067943,336716101.28,2023-03-13,EUR,27.675,13/03/2023 17:38,True,27.48


In [28]:
def merge_dataset(df_boursorama, df_euronext, delete_name_alone=True):
    # delete_name_alone : delete the rows when the name is only in bourso and not in euronext
    if delete_name_alone:
        # Filter rows where the name exists in both datasets
        df_boursorama2 = df_boursorama[df_boursorama['name'].isin(df_euronext['name'])].copy()
    else:
        df_boursorama2 = df_boursorama.copy()

    # Get all unique columns from both datasets
    all_columns = list(set(df_euronext.columns).union(set(df_boursorama2.columns)))

    # Ensure both dataframes have the same columns
    for col in all_columns:
        if col not in df_euronext.columns:
            df_euronext[col] = np.nan
        if col not in df_boursorama2.columns:
            df_boursorama2[col] = np.nan

    # Concatenate the two datasets
    df = pd.concat([df_boursorama2, df_euronext], ignore_index=True)

    # Fill missing 'isin' values in df with the mapping from df_euronext
    isin_mapping = df_euronext.set_index('name')['isin'].to_dict()
    df['isin'] = df['isin'].fillna(df['name'].map(isin_mapping))

    # Remove duplicates based on all columns
    

    # Optionally, you can remove duplicates based on specific columns (e.g., 'name' and 'isin')
    # df = df.drop_duplicates(subset=['name', 'isin'])

    # TODO: Fill the other variables with the euronext values
    # TODO: Fill the 'symbol' value in the euronext with the bourso value

    return df

In [29]:
df = merge_dataset(df_boursorama, df_euronext, delete_name_alone=True)
df.head()

Unnamed: 0,last,volume,symbol,name,date,boursorama,low,last_trade_time,price,isin,close,turnover,currency,time_zone,pea,market,ticker,high,open
0,37.29,405421,AC,ACCOR,2019-02-28 13:32:01.512834,1rP,,,,FR0000120404,,,,,,,,,
1,171.7,22674,ADP,ADP,2019-02-28 13:32:01.512834,1rP,,,,FR0010340141,,,,,,,,,
2,109.65,163127,AI,AIR LIQUIDE,2019-02-28 13:32:01.512834,1rP,,,,FR0000120073,,,,,,,,,
3,113.24,285724,AIR,AIRBUS,2019-02-28 13:32:01.512834,1rP,,,,NL0000235190,,,,,,,,,
4,54.8,2613,AKA,AKKA TECHNOLOGIES,2019-02-28 13:32:01.512834,1rP,,,,FR0004180537,,,,,,,,,


In [30]:
from time import time

In [31]:
import numpy as np
import pandas as pd

def clean_numeric_column_fast(series: pd.Series) -> pd.Series:
    # Convertir en tableau de chaînes une fois pour toutes
    arr = np.char.asarray(series.astype(str).values)

    # Remplacer "," par "." et supprimer les espaces
    arr = np.char.replace(arr, ',', '.')
    arr = np.char.replace(arr, ' ', '')

    # Mettre NaN pour les simples tirets
    mask_dash = arr == '-'
    arr[mask_dash] = 'nan'

    # Convertir en float de façon vectorisée
    return pd.to_numeric(arr, errors='coerce')

In [32]:
from timescaledb_model import initial_markets_data

def populate_markets():
    # Convert initial_markets_data to a DataFrame for easier manipulation
    initial_data = pd.DataFrame(
        initial_markets_data,
        columns=["id", "name", "alias", "boursorama", "euronext", "sws"]
    )

    # Create the markets DataFrame
    df_markets = pd.DataFrame()
    df_markets["id"] = initial_data["id"]
    df_markets["name"] = initial_data["name"]
    df_markets["alias"] = initial_data["alias"]

    # Map boursorama prefixes to the corresponding markets
    df_markets["boursorama"] = initial_data["boursorama"]

    # Map euronext tickers to the corresponding markets
    df_markets["euronext"] = np.nan

    # Fill the "sws" column with data from initial_markets_data
    df_markets["sws"] = initial_data["sws"]


    return df_markets

In [33]:
def populate_companies(df_boursorama, df_euronext, df_market):
    """
    Merge Boursorama et Euronext pour conserver :
      - name         (issu de Boursorama ou Euronext)
      - isin         (issu d'Euronext, NaN si manquant)
      - symbol       (clé de merge)
      - boursorama   (last price, issu de Boursorama, NaN si manquant)
      - euronext     (price issu d'Euronext, NaN si manquant)
      - mid          (market id de df_market basé sur les 3 premières lettres du code boursorama)
      - pea, sector1, sector2, sector3
    """

    # Côté Boursorama : garder name, symbol et boursorama
    small_bourso = df_boursorama[["name", "symbol", "boursorama"]]
    unique_bourso = (
        small_bourso.drop_duplicates(subset=["name", "symbol", "boursorama"])
        .sort_values(by=["name", "symbol"])
        .rename(columns={"name": "name_bourso"})
    )

    # Côté Euronext : garder name, isin et ticker (rajoute 'euronext') en créant une copie
    small_euronext = df_euronext[["name", "isin", "ticker"]].copy()
    small_euronext["euronext"] = small_euronext["ticker"]
    unique_euronext = (
        small_euronext.drop_duplicates(subset=["name", "isin", "ticker"])
        .sort_values(by=["isin"])
        .rename(columns={"name": "name_euronext", "ticker": "symbol"})
    )

    # Outer merge sur "symbol" pour conserver tous les enregistrements
    merged_df = pd.merge(unique_bourso, unique_euronext, on="symbol", how="outer", indicator=True, suffixes=("", ""))
    
    # Fusionner les noms : on privilégie le nom de Boursorama s'il existe, sinon celui d'Euronext
    merged_df["name"] = merged_df["name_bourso"].combine_first(merged_df["name_euronext"])
    merged_df.drop(columns=["name_bourso", "name_euronext", "_merge"], inplace=True)
    
    # Calculer le préfixe marché à partir du code boursorama et join avec df_market
    merged_df["market_prefix"] = merged_df["boursorama"].str[:3]
    df_market2 = df_market.copy()
    df_market2["market_prefix"] = df_market2["boursorama"].str[:3]
    market_mapping = (
        df_market2[["market_prefix", "id"]]
        .rename(columns={"id": "mid"})
        .drop_duplicates("market_prefix")
    )
    merged_df = merged_df.merge(market_mapping, on="market_prefix", how="left")
    merged_df.drop(columns="market_prefix", inplace=True)
    
    # Colonnes supplémentaires
    merged_df["pea"] = False
    merged_df["sector1"] = ""  # TODO
    merged_df["sector2"] = ""  # TODO
    merged_df["sector3"] = ""  # TODO
    
    # Convertir mid en int en remplaçant les valeurs manquantes par -1
    merged_df["mid"] = merged_df["mid"].fillna(-1).astype(int)
    
    # Optionnel : assigner un nouvel identifiant unique
    merged_df["id"] = np.arange(len(merged_df))
    
    return merged_df

In [None]:
def populate_stocks(df_boursorama: pd.DataFrame, df_companies: pd.DataFrame, save_path: str = "daystocks.parquet"):
    df_stocks = pd.DataFrame()

    # Reset index to avoid ambiguity with 'symbol', without adding the old index as a column
    df_boursorama = df_boursorama.reset_index(drop=True)
    df_companies = df_companies.reset_index(drop=True)

    # Merge df_boursorama with df_companies on the 'symbol' column
    merged_df = df_boursorama.merge(
        df_companies[['symbol', 'id', 'name']],
        how='left',
        left_on='name',
        right_on='name',
        suffixes=('', '_company')  # Avoid suffix conflicts
    )

    # Populate the stocks dataframe
    df_stocks["date"] = merged_df["date"]
    df_stocks["cid"] = merged_df["id"].fillna(-1).astype(int)  # Ensure 'cid' is an integer
    df_stocks["value"] = clean_numeric_column_fast(merged_df["last"])
    df_stocks["volume"] = merged_df["volume"]
    # delete all line where the cid is -1 but print it
    logger.info(f"Number of stocks with cid -1: {len(df_stocks[df_stocks['cid'] == -1])}")
    df_stocks = df_stocks[df_stocks["cid"] != -1]

    return df_stocks


In [35]:
def populate_daystocks(df_euronext: pd.DataFrame, df_companies: pd.DataFrame):
    # Merge des deux DataFrames
    merged_df = df_euronext.merge(
        df_companies[['isin', 'id']],
        how='left',
        on='isin'
    )

    # Initialisation du df final
    df_daystocks = pd.DataFrame()
    df_daystocks["date"] = merged_df["date"]
    df_daystocks["cid"] = merged_df["id"].fillna(-1).astype(int)

    # Colonnes à nettoyer
    numeric_cols = ["open", "close", "high", "low", "volume"]
    tps = time()
    df_daystocks[numeric_cols] = merged_df[numeric_cols].apply(clean_numeric_column_fast)
    print("Time to clean numeric columns:", time() - tps)

    # Calculs dérivés (mean et std après clean)
    df_daystocks["mean"] = (df_daystocks["high"] + df_daystocks["low"]) / 2
    df_daystocks["std"] = df_daystocks["high"] - df_daystocks["low"]
    # delete all nan
    df_daystocks = df_daystocks.dropna(subset=["volume", "open", "close"])

    return df_daystocks


In [36]:
df_markets = populate_markets()
df_markets.head(10)

Unnamed: 0,id,name,alias,boursorama,euronext,sws
0,1,New York,nyse,,,
1,2,London Stock Exchange,lse,1u*.L,,
2,3,Bourse de Milan,milano,1g,,
3,4,Mercados Espanoles,mercados,FF55-,,
4,5,Amsterdam,amsterdam,1rA,,Amsterdam
5,6,Paris,paris,1rP,,Paris
6,7,Deutsche Borse,xetra,1z,,
7,8,Bruxelle,bruxelle,FF11_,,Brussels
8,9,Australie,asx,,,
9,100,International,int,,,


In [37]:
df_companies = populate_companies(df_boursorama, df_euronext, df_markets)
df_companies.head(10)

Unnamed: 0,symbol,boursorama,isin,euronext,name,mid,pea,sector1,sector2,sector3,id
0,AB,1rP,FR0010557264,AB,AB SCIENCE,6,False,,,,0
1,ABCA,1rP,FR0004040608,ABCA,ABC ARBITRAGE,6,False,,,,1
2,ABEO,1rP,FR0013185857,ABEO,ABEO,6,False,,,,2
3,AC,1rP,FR0000120404,AC,ACCOR,6,False,,,,3
4,ATI,1rP,FR0000076655,ATI,ACTIA GROUP,6,False,,,,4
5,ADP,1rP,FR0010340141,ADP,ADP,6,False,,,,5
6,AGTA,1rP,CH0008853209,AGTA,AGTA RECORD I,6,False,,,,6
7,AF,1rP,FR0000031122,AF,AIR FRANCE - KLM,6,False,,,,7
8,AF,1rP,FR001400J770,AF,AIR FRANCE - KLM,6,False,,,,8
9,AI,1rP,FR0000120073,AI,AIR LIQUIDE,6,False,,,,9


Unnamed: 0,symbol,boursorama,isin,euronext,name,mid,pea,sector1,sector2,sector3,id
384,SOLB,,BE0003470755,SOLB,SOLVAY,-1,False,,,,384
385,MONT,,BE0003853703,MONT,MONTEA C.V.A.,-1,False,,,,385
386,MONT,,BE0003853703,MONT,MONTEA,-1,False,,,,386
387,ALCOI,,BE0160342011,ALCOI,COIL,-1,False,,,,387
388,MLPHO,,BE0948608451,MLPHO,PHOTONIKE CAPITAL,-1,False,,,,388


In [45]:
df_daystocks = populate_daystocks(df_euronext, df_companies)
df_daystocks.head()

Time to clean numeric columns: 5.011451244354248


Unnamed: 0,date,cid,open,close,high,low,volume,mean,std
0,2020-05-01,303,16.9,15.5,17.5,15.45,158448.0,16.475,2.05
1,2020-05-04,303,15.5,15.0,15.9,13.85,167582.0,14.875,2.05
2,2020-05-05,303,15.0,15.3,15.9,15.0,123769.0,15.45,0.9
3,2020-05-06,303,15.3,15.1,15.5,14.7,66775.0,15.1,0.8
4,2020-05-07,303,15.1,16.25,15.55,14.8,46711.0,15.175,0.75


In [46]:
df_stocks = populate_stocks(df_boursorama, df_companies)
df_stocks.head(10)

Number of stocks with cid -1: 0


Unnamed: 0,date,cid,value,volume
0,2019-02-28 13:32:01.512834,3,37.29,405421
1,2019-02-28 13:32:01.512834,5,171.7,22674
2,2019-02-28 13:32:01.512834,7,10.86,3512926
3,2019-02-28 13:32:01.512834,8,10.86,3512926
4,2019-02-28 13:32:01.512834,9,109.65,163127
5,2019-02-28 13:32:01.512834,10,113.24,285724
6,2019-02-28 13:32:01.512834,11,54.8,2613
7,2019-02-28 13:32:01.512834,14,12.79,16658
8,2019-02-28 13:32:01.512834,16,37.43,124982
9,2019-02-28 13:32:01.512834,17,37.43,124982


In [236]:
a = df_stocks[df_stocks["cid"] == -1]
a.head()

Unnamed: 0,date,cid,value,volume,name


In [237]:
# print the name when it start with "BNP"
df_euronext[df_euronext["name"].str.startswith("ABB")].head(5)

Unnamed: 0,name,isin,ticker,market,open,high,low,time_zone,volume,turnover,date,currency,price,last_trade_time,pea,close,boursorama,last,symbol


In [169]:
df_boursorama[df_boursorama['name'].str.startswith("AIR FRANCE - KLM")].head(20)

Unnamed: 0,last,volume,symbol,name,date,boursorama
4,10.86,3512926,AF,AIR FRANCE - KLM,2019-02-28 13:32:01.512834,1rP
25,9.612,0,AF,AIR FRANCE - KLM,2019-02-28 13:32:01.512834,1rA
326,10.73,248129,AF,AIR FRANCE - KLM,2019-03-19 10:42:02.024981,1rP
347,9.612,0,AF,AIR FRANCE - KLM,2019-03-19 10:42:02.024981,1rA
932,7.616,56170,AF,AIR FRANCE - KLM,2019-06-25 09:32:01.694746,1rP
953,9.612(c),0,AF,AIR FRANCE - KLM,2019-06-25 09:32:01.694746,1rA
1256,9.002,634619,AF,AIR FRANCE - KLM,2019-07-22 11:02:02.271202,1rP
1277,9.612,0,AF,AIR FRANCE - KLM,2019-07-22 11:02:02.271202,1rA
1579,9.156,1586265,AF,AIR FRANCE - KLM,2019-01-04 16:32:01.355906,1rP
1600,9.612,0,AF,AIR FRANCE - KLM,2019-01-04 16:32:01.355906,1rA
