# Import des librairies

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import zipfile
import os

from pandas import Timestamp
from pandas.core.interchange.dataframe_protocol import DataFrame
from polars.datatypes.group import DATETIME_DTYPES

# Extraction du fichier CSV à partir du fichier .zip brut

In [3]:
# Unpack the archive into data/raw/ only when the expected CSV is not already on disk.
if os.path.exists("data/raw/btcusd_1-min_data.csv"):
    print("Fichier CSV déjà existant, poursuite de l'exécution ...")
else:
    print("Fichier CSV inexistant, extraction à partir du fichier .zip ...")
    with zipfile.ZipFile("data/raw/btcusd_1-min_data_11_30_2025.zip", mode="r") as archive:
        archive.extractall(path="data/raw/")
        # Still inside the `with` block: printed once extractall() has returned.
        print("Fichier CSV créé !")


Fichier CSV déjà existant, poursuite de l'exécution ...


In [4]:
# Load the raw CSV into a DataFrame.
df_bitcoin_raw = pd.read_csv("data/raw/btcusd_1-min_data.csv")

In [5]:
# Print the row count, the dtype of each column and the memory usage.
df_bitcoin_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7317759 entries, 0 to 7317758
Data columns (total 6 columns):
 #   Column     Dtype  
---  ------     -----  
 0   Timestamp  float64
 1   Open       float64
 2   High       float64
 3   Low        float64
 4   Close      float64
 5   Volume     float64
dtypes: float64(6)
memory usage: 335.0 MB


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of each column.
df_bitcoin_raw.describe()

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume
count,7317759.0,7317759.0,7317759.0,7317759.0,7317759.0,7317759.0
mean,1544948000.0,21249.84,21257.47,21242.0,21249.85,5.128479
std,126753200.0,29921.2,29929.35,29912.93,29921.2,22.06739
min,1325412000.0,3.8,3.8,3.8,3.8,0.0
25%,1435178000.0,443.0,443.14,442.88,443.0,0.0195988
50%,1544945000.0,7280.77,7285.0,7276.34,7280.65,0.4481621
75%,1654711000.0,30663.0,30673.0,30653.17,30663.01,2.897836
max,1764547000.0,126202.0,126272.0,126158.0,126202.0,5853.852


In [7]:
# Dtype of each column.
df_bitcoin_raw.dtypes

Timestamp    float64
Open         float64
High         float64
Low          float64
Close        float64
Volume       float64
dtype: object

In [8]:
# Count the missing values in each column.
print(df_bitcoin_raw.isna().sum())

Timestamp    0
Open         0
High         0
Low          0
Close        0
Volume       0
dtype: int64


# Conversion de "Timestamp" (float64, unix epoch time) en DateTime

In [9]:
# Look at the first raw "Timestamp" value before converting the column.
print(df_bitcoin_raw["Timestamp"].iloc[0])

1325412060.0


In [10]:
# Convert the Unix-epoch seconds stored in "Timestamp" into datetime values.
# The dtype guard makes this cell safe to re-run: once the column is already
# datetime, casting it to an integer again would produce nanoseconds and the
# unit="s" conversion would no longer be valid.
if not pd.api.types.is_datetime64_any_dtype(df_bitcoin_raw["Timestamp"]):
    # "int64" is spelled out so the cast does not depend on the platform's
    # default integer width.
    df_bitcoin_raw["Timestamp"] = pd.to_datetime(
        df_bitcoin_raw["Timestamp"].astype("int64"), unit="s"
    )

In [11]:
# Display the frame to check the converted "Timestamp" column.
df_bitcoin_raw

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume
0,2012-01-01 10:01:00,4.58,4.58,4.58,4.58,0.000000
1,2012-01-01 10:02:00,4.58,4.58,4.58,4.58,0.000000
2,2012-01-01 10:03:00,4.58,4.58,4.58,4.58,0.000000
3,2012-01-01 10:04:00,4.58,4.58,4.58,4.58,0.000000
4,2012-01-01 10:05:00,4.58,4.58,4.58,4.58,0.000000
...,...,...,...,...,...,...
7317754,2025-11-30 23:55:00,90405.00,90452.00,90403.00,90452.00,0.531700
7317755,2025-11-30 23:56:00,90452.00,90481.00,90420.00,90420.00,0.055547
7317756,2025-11-30 23:57:00,90412.00,90458.00,90396.00,90435.00,0.301931
7317757,2025-11-30 23:58:00,90428.00,90428.00,90362.00,90362.00,4.591653


In [12]:
# Number of distinct values in each column.
df_bitcoin_raw.nunique()

Timestamp    7317759
Open         1746155
High         1710653
Low          1722000
Close        1737856
Volume       5220555
dtype: int64

In [13]:
# Number of rows that are exact duplicates of an earlier row.
print(df_bitcoin_raw.duplicated().sum())

0


Pas de doublons, pas de NaN, on peut procéder à la suite de l'analyse

# Analyse univariée

In [14]:
# NOTE: set_index returns a new DataFrame and the result is not assigned, so
# this cell only displays a Timestamp-indexed view; df_bitcoin_raw itself
# keeps "Timestamp" as a regular column.
df_bitcoin_raw.set_index("Timestamp")

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2012-01-01 10:01:00,4.58,4.58,4.58,4.58,0.000000
2012-01-01 10:02:00,4.58,4.58,4.58,4.58,0.000000
2012-01-01 10:03:00,4.58,4.58,4.58,4.58,0.000000
2012-01-01 10:04:00,4.58,4.58,4.58,4.58,0.000000
2012-01-01 10:05:00,4.58,4.58,4.58,4.58,0.000000
...,...,...,...,...,...
2025-11-30 23:55:00,90405.00,90452.00,90403.00,90452.00,0.531700
2025-11-30 23:56:00,90452.00,90481.00,90420.00,90420.00,0.055547
2025-11-30 23:57:00,90412.00,90458.00,90396.00,90435.00,0.301931
2025-11-30 23:58:00,90428.00,90428.00,90362.00,90362.00,4.591653


In [15]:
from pandas import DataFrame

In [16]:
def identify_distribution_to_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Summarise the shape of the distribution of every numeric column.

    For each numeric column of ``df`` the sample skewness and kurtosis are
    computed with ``Series.skew()`` and ``Series.kurt()``:

    * skew: asymmetry (0 = symmetric distribution).
    * kurt: excess kurtosis as returned by pandas (Fisher's definition, a
      normal distribution scores 0), so values > 0 indicate heavier tails
      than a normal distribution.

    :param df: input data; non-numeric columns are ignored.
    :type df: pandas.DataFrame
    :return: one row per numeric column, with the columns ``Column``
        (name of the input column), ``skew`` and ``kurt``. The three columns
        are present even when ``df`` has no numeric column.
    :rtype: pandas.DataFrame
    """
    numeric_columns = df.select_dtypes(include=np.number)
    records = [
        {"Column": name, "skew": values.skew(), "kurt": values.kurt()}
        for name, values in numeric_columns.items()
    ]
    # Passing the column names explicitly keeps the schema stable when
    # `records` is empty.
    return pd.DataFrame(records, columns=["Column", "skew", "kurt"])


In [17]:
# Skewness and kurtosis of every numeric column of the raw data.
df_bitcoin_stats = identify_distribution_to_df(df_bitcoin_raw)

In [18]:
# Display the per-column skew / kurt table.
df_bitcoin_stats

Unnamed: 0,Column,skew,kurt
0,Open,1.654101,1.891232
1,High,1.653784,1.88991
2,Low,1.654428,1.892597
3,Close,1.654093,1.891205
4,Volume,27.799998,2479.572929


## Calcul des Z-Scores

In [19]:
def calculate_z_scores(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute the z-score of every value in the numeric columns of ``df``.

    The z-score of a value is ``(value - column mean) / column std``, where
    the standard deviation is the pandas default (sample std, ``ddof=1``).

    :param df: input data; non-numeric columns are ignored.
    :type df: pandas.DataFrame
    :return: z-scores laid out with one ROW per numeric column of ``df`` and
        one COLUMN per row of ``df`` (the row labels are the input column
        names). Transpose the result to get back the layout of ``df``.
    :rtype: pandas.DataFrame
    """
    numeric_columns = df.select_dtypes(include=np.number)
    # mean() and std() return one value per column; pandas aligns them on the
    # column labels, so every column is standardised in a single expression.
    z_scores = (numeric_columns - numeric_columns.mean()) / numeric_columns.std()
    # Keep the historical orientation (variables as rows) that callers expect.
    return z_scores.T



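`calculate_z_scores` renvoie une ligne par variable : on transpose donc le résultat afin de retrouver une colonne par variable, comme dans le DataFrame d'origine.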

In [26]:
# calculate_z_scores returns one row per numeric column; transpose the result
# to get one column per variable.
df_z_score = calculate_z_scores(df_bitcoin_raw).T

In [41]:
# Label every z-score row with the corresponding "Timestamp" value.
# The name is re-bound instead of using inplace=True, so the cell does not
# mutate an object produced by an earlier cell.
# Requires df_z_score to still have exactly one row per row of df_bitcoin_raw
# (i.e. run this before the outlier filter).
df_z_score = df_z_score.set_index(df_bitcoin_raw["Timestamp"])

In [42]:
# Keep only the rows where at least one variable is an outlier, i.e. lies more
# than 3 standard deviations away from its mean. The absolute value is used so
# that abnormally low values (z < -3) are caught as well as abnormally high
# ones.
df_z_score = df_z_score[df_z_score.abs().gt(3).any(axis=1)]

In [43]:
# Show the rows kept by the outlier filter.
print(df_z_score)

                         Open      High       Low     Close    Volume
Timestamp                                                            
2012-02-23 14:48:00 -0.710034 -0.710096 -0.709969 -0.710035  3.966323
2012-02-24 03:09:00 -0.710026 -0.710088 -0.709961 -0.710027  3.392858
2012-03-08 21:46:00 -0.710026 -0.710088 -0.709961 -0.710027  3.911397
2012-03-09 15:45:00 -0.710026 -0.710087 -0.709960 -0.710026  4.334996
2012-03-13 17:02:00 -0.710025 -0.710086 -0.709961 -0.710027  3.995558
...                       ...       ...       ...       ...       ...
2025-11-21 15:13:00  2.105803  2.105643  2.097488  2.100055  3.877852
2025-11-21 18:05:00  2.102494  2.104106  2.096819  2.096011  3.118334
2025-11-21 18:48:00  2.113824  2.128096  2.113835  2.125354  4.109853
2025-11-21 20:29:00  2.082876  2.082054  2.080037  2.079233  3.026075
2025-11-21 21:22:00  2.119406  2.121547  2.120254  2.121979  3.212936

[215636 rows x 5 columns]
