# Nettoyage et préparation du dataset de vins

In [181]:
import pandas as pd
from pandas import isnull

file_path = "../data/raw/winemag-data-130k-v2.csv"

# Chargement et nettoyage par chunks

In [182]:
chunksize = 20000
clean_chunks = []

for chunk in pd.read_csv(file_path, chunksize=chunksize):
    # Renommage des colonnes
    chunk = chunk.rename(columns={
        "Unnamed: 0": "id",
        "region_1": "primary_region",
        "region_2": "secondary_region",
    })

    # Suppression des lignes complètement vides
    chunk = chunk.dropna(how="all")

    # Suppression des doublons
    chunk = chunk.drop_duplicates()

    clean_chunks.append(chunk)

# Concaténation de tous les chunks
df_clean = pd.concat(clean_chunks, ignore_index=True)

# Suppression finale des doublons sur l'ensemble du dataset
df_clean = df_clean.drop_duplicates()

print(f"Nombre de lignes après nettoyage : {len(df_clean)}")
print(f"\nAperçu des données :")
print(df_clean.head())

Nombre de lignes après nettoyage : 129971

Aperçu des données :
   id   country                                        description  \
0   0     Italy  Aromas include tropical fruit, broom, brimston...   
1   1  Portugal  This is ripe and fruity, a wine that is smooth...   
2   2        US  Tart and snappy, the flavors of lime flesh and...   
3   3        US  Pineapple rind, lemon pith and orange blossom ...   
4   4        US  Much like the regular bottling from 2012, this...   

                          designation  points  price           province  \
0                        Vulkà Bianco      87    NaN  Sicily & Sardinia   
1                            Avidagos      87   15.0              Douro   
2                                 NaN      87   14.0             Oregon   
3                Reserve Late Harvest      87   13.0           Michigan   
4  Vintner's Reserve Wild Child Block      87   65.0             Oregon   

        primary_region   secondary_region         taster_name  \

## Suppression de la colonne `secondary_region` et des lignes avec des valeurs nulles

In [183]:
if "secondary_region" in df_clean.columns:
    df_clean = df_clean.drop("secondary_region", axis=1)
if "taster_twitter_handle" in df_clean.columns:
    df_clean = df_clean.drop("taster_twitter_handle", axis=1)
else:
    print('La colone secondary_region n\'existe pas.')

df_clean = df_clean.drop(df_clean[df_clean["designation"].isnull()].index)
df_clean = df_clean.drop(df_clean[df_clean["primary_region"].isnull()].index)
df_clean = df_clean.drop(df_clean[df_clean["price"].isnull()].index)
df_clean = df_clean.drop(df_clean[df_clean["country"].isnull()].index)
df_clean = df_clean.drop(df_clean[df_clean["province"].isnull()].index)
#df_clean = df_clean.drop(df_clean[df_clean["taster_twitter_handle"].isnull()].index)
df_clean = df_clean.drop(df_clean[df_clean["taster_name"].isnull()].index)

In [184]:
missing_values = df_clean.isnull().sum()
if missing_values.sum() > 0:
    print("\nValeurs manquantes par colonne :")
    missing_percent = (missing_values / len(df_clean)) * 100
    missing_df = pd.DataFrame({
        'Valeurs manquantes': missing_values,
        'Pourcentage': missing_percent
    })
    print(missing_df[missing_df['Valeurs manquantes'] > 0].sort_values('Pourcentage', ascending=False))
else:
    print("Aucune valeurs manquantes.")



Aucune valeurs manquantes.


# Sauvegarde des données nettoyées

In [185]:
df_clean.to_csv("../data/cleaned/wine_data_cleaned.csv", index=False)
print(f"\nDonnées sauvegardées : {len(df_clean)} lignes")



Données sauvegardées : 54170 lignes
