In [None]:
import pandas as pd

# Location of the sample CSV, and the DataFrame read from it (comma-separated)
dataFile = "./Data/Echantillon.csv"
data = pd.read_csv(dataFile, sep=",")

In [None]:
# Show the dtype of every column of the DataFrame
print(data.dtypes)

event_time        object
event_type        object
product_id         int64
category_id        int64
category_code     object
brand             object
price            float64
user_id            int64
user_session      object
dtype: object


In [None]:
# Parse 'event_time' into datetimes and reset every timestamp to midnight in a
# single chained step. normalize() keeps the datetime dtype, but note that the
# time of day is removed from the stored values, not merely hidden on display.
data['event_time'] = pd.to_datetime(data['event_time']).dt.normalize()

# Sanity check: column dtypes first, then the first rows
print(data.dtypes, data.head(), sep="\n")

event_time       datetime64[ns, UTC]
event_type                    object
product_id                     int64
category_id                    int64
category_code                 object
brand                         object
price                        float64
user_id                        int64
user_session                  object
dtype: object
                 event_time event_type  product_id          category_id  \
0 2019-10-01 00:00:00+00:00       view    44600062  2103807459595387724   
1 2019-10-01 00:00:00+00:00       view     3900821  2053013552326770905   
2 2019-10-01 00:00:00+00:00       view    17200506  2053013559792632471   
3 2019-10-01 00:00:00+00:00       view     1307067  2053013558920217191   
4 2019-10-01 00:00:00+00:00       view     1004237  2053013555631882655   

                         category_code     brand    price    user_id  \
0                                  NaN  shiseido    35.79  541312140   
1  appliances.environment.water_heater      aqua    33.20 

In [None]:
# Target dtype for each expected column
desired_types = {
    'event_time': 'datetime64[ns]',
    'event_type': 'object',
    'product_id': 'int64',
    'category_id': 'int64',
    'category_code': 'object',
    'brand': 'object',
    'price': 'float64',
    'user_id': 'int64',
    'user_session': 'object'
}

def load_and_check_data(file_path):
    """Load a CSV file and coerce its columns to the dtypes in ``desired_types``.

    Columns listed in ``desired_types`` that are absent from the file are
    skipped; columns of the file that are not listed are left untouched.
    The resulting dtypes are printed before the frame is returned.

    Parameters
    ----------
    file_path
        Location of the CSV file, passed as-is to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded data after conversion.

    Notes
    -----
    * Datetime columns: values that cannot be parsed become ``NaT``. If the
      source strings carry a timezone, the converted column is timezone-aware
      rather than plain ``datetime64[ns]``.
    * Integer columns: values that cannot be parsed (or are missing) are
      replaced with 0, then the column is cast to a fixed 64-bit integer.
    * Float columns: values that cannot be parsed become ``NaN``.
    * Any other dtype is applied with ``astype(..., errors='ignore')``, so a
      failing cast leaves the column as it was.
    """
    # Load the dataset
    new_data = pd.read_csv(file_path)

    # Walk through the desired dtypes and convert each column that is present
    for column, desired_type in desired_types.items():
        if column not in new_data.columns:
            continue

        if desired_type == 'datetime64[ns]':
            new_data[column] = pd.to_datetime(new_data[column], errors='coerce')
        elif desired_type == 'int64':
            # Cast to an explicit 'int64': the builtin `int` maps to the
            # platform default integer, which can be 32-bit (e.g. on Windows
            # with NumPy < 2) and cannot hold the large ids in this dataset.
            # NOTE(review): if a column contains invalid values, to_numeric
            # yields float64 before the fill, so integers above 2**53 may
            # lose precision - confirm whether that matters for the real data.
            numeric = pd.to_numeric(new_data[column], errors='coerce')
            new_data[column] = numeric.fillna(0).astype('int64')
        elif desired_type == 'float64':
            new_data[column] = pd.to_numeric(new_data[column], errors='coerce')
        else:
            new_data[column] = new_data[column].astype(desired_type, errors='ignore')

    # Show the dtypes obtained after conversion
    print(new_data.dtypes)

    return new_data

# Reload the sample file through the dtype-checking loader and keep the result
new_data = load_and_check_data(dataFile)

event_time       datetime64[ns, UTC]
event_type                    object
product_id                     int32
category_id                    int32
category_code                 object
brand                         object
price                        float64
user_id                        int32
user_session                  object
dtype: object


In [None]:
# Split 'category_code' on '.' into at most three parts (n=2 means two splits).
# The expanded result only has as many columns as the longest code actually
# present in the data, so pad it to exactly three columns (0, 1, 2); without
# the padding the three-column assignment raises when no code has three levels.
data[['category_main', 'category_sub', 'category_type']] = (
    data['category_code']
    .str.split('.', n=2, expand=True)
    .reindex(columns=range(3))
)

# Save the enriched DataFrame to CSV, without the index column
data.to_csv('output.csv', index=False)