# Import des librairies

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import zipfile
import os

from pandas import Timestamp
from pandas.conftest import datetime64_dtype
from pandas.core.interchange.dataframe_protocol import DataFrame
from polars.datatypes.group import DATETIME_DTYPES

# Extraction du fichier CSV à partir du fichier .zip brut

In [3]:
if not os.path.exists("data/raw/btcusd_1-min_data.csv"):
    print("Fichier CSV inexistant, extraction à partir du fichier .zip ...")
    with zipfile.ZipFile("data/raw/btcusd_1-min_data_11_30_2025.zip","r") as zip_ref:
        zip_ref.extractall("data/raw/")
        print("Fichier CSV créé !")
else:
    print("Fichier CSV déjà existant, poursuite de l'exécution ...")


Fichier CSV déjà existant, poursuite de l'exécution ...


In [4]:
df_bitcoin_raw = pd.read_csv("data/raw/btcusd_1-min_data.csv")

In [5]:
df_bitcoin_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7317759 entries, 0 to 7317758
Data columns (total 6 columns):
 #   Column     Dtype  
---  ------     -----  
 0   Timestamp  float64
 1   Open       float64
 2   High       float64
 3   Low        float64
 4   Close      float64
 5   Volume     float64
dtypes: float64(6)
memory usage: 335.0 MB


In [6]:
df_bitcoin_raw.describe()

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume
count,7317759.0,7317759.0,7317759.0,7317759.0,7317759.0,7317759.0
mean,1544948000.0,21249.84,21257.47,21242.0,21249.85,5.128479
std,126753200.0,29921.2,29929.35,29912.93,29921.2,22.06739
min,1325412000.0,3.8,3.8,3.8,3.8,0.0
25%,1435178000.0,443.0,443.14,442.88,443.0,0.0195988
50%,1544945000.0,7280.77,7285.0,7276.34,7280.65,0.4481621
75%,1654711000.0,30663.0,30673.0,30653.17,30663.01,2.897836
max,1764547000.0,126202.0,126272.0,126158.0,126202.0,5853.852


In [7]:
df_bitcoin_raw.dtypes

Timestamp    float64
Open         float64
High         float64
Low          float64
Close        float64
Volume       float64
dtype: object

In [8]:
print(df_bitcoin_raw.isna().sum())

Timestamp    0
Open         0
High         0
Low          0
Close        0
Volume       0
dtype: int64


# Conversion de "Timestamp" (float64, unix epoch time) en DateTime

In [9]:
df_bitcoin_raw["Timestamp"] = pd.to_datetime(df_bitcoin_raw["Timestamp"], origin="unix")

In [10]:
df_bitcoin_raw.head()

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume
0,1970-01-01 00:00:01.325412060,4.58,4.58,4.58,4.58,0.0
1,1970-01-01 00:00:01.325412120,4.58,4.58,4.58,4.58,0.0
2,1970-01-01 00:00:01.325412180,4.58,4.58,4.58,4.58,0.0
3,1970-01-01 00:00:01.325412240,4.58,4.58,4.58,4.58,0.0
4,1970-01-01 00:00:01.325412300,4.58,4.58,4.58,4.58,0.0


In [13]:
df_bitcoin_raw.nunique()

Timestamp    7317759
Open         1746155
High         1710653
Low          1722000
Close        1737856
Volume       5220555
dtype: int64

In [16]:
print(df_bitcoin_raw.duplicated().sum())

0


Pas de doublons, pas de NaN, on peut procéder à la suite de l'analyse

# Analyse univariée

In [36]:
from pandas import DataFrame

In [44]:
def identify_distribution_to_df(df: DataFrame) -> DataFrame:
    """
    Analyse les colonnes numériques d'un DataFrame et identifie leurs valeurs skew
    et kurt. Retourne les valeurs dans un nouveau DataFrame.\n
    skew: Asymétrie (0 = symétrique)\n
    kurt: Aplatissement (>3 = queues épaisses)
    :param df: DataFrame
    :type df: pandas.DataFrame
    :return: DataFrame contenant les noms des colonnes d'entrée, leurs valeurs
     skew et kurt.
    :rtype: pandas.DataFrame
    """
    distribution_list = []
    for series_name, series in df.select_dtypes(include=np.number).items():
        skew = series.skew()
        kurt = series.kurt()
        column_dict = {"Column": series_name, "skew": skew, "kurt": kurt}
        distribution_list.append(column_dict)

    return pd.DataFrame(distribution_list)


In [45]:
identify_distribution_to_df(df_bitcoin_raw)

Unnamed: 0,Column,skew,kurt
0,Open,1.654101,1.891232
1,High,1.653784,1.88991
2,Low,1.654428,1.892597
3,Close,1.654093,1.891205
4,Volume,27.799998,2479.572929
