In [None]:
import os
import s3fs
import pandas as pd

# Build the S3 filesystem client from the endpoint exposed in the environment.
S3_ENDPOINT_URL = f"https://{os.environ['AWS_S3_ENDPOINT']}"
fs = s3fs.S3FileSystem(client_kwargs={"endpoint_url": S3_ENDPOINT_URL})

# Location of the training features inside the bucket.
BUCKET = "atoubert-ensae"
FILE_KEY_S3 = "X_train_Hi5.csv"
FILE_PATH_S3 = f"{BUCKET}/Hackathon Hiparis/{FILE_KEY_S3}"

# Read the CSV from S3 into a DataFrame.
with fs.open(FILE_PATH_S3, mode="rb") as file_in:
    x_train = pd.read_csv(file_in, sep=",")

In [None]:
# Display the last 50 rows of the training frame.
x_train.iloc[-50:]

In [None]:
# Summary statistics of the training frame. `describe` has to be called:
# the bare attribute only displays the bound method's repr.
x_train.describe()

In [None]:
# Draw a reproducible 10,000-row random sample (fixed seed) to keep exploration fast.
train_sample = x_train.sample(10_000, random_state=42)

In [None]:
# Display the first five rows of the sample.
train_sample.head(n=5)

In [None]:
# Display the last ten rows of the sample.
train_sample.tail(n=10)

In [None]:
# Flag the columns for which every sampled row is missing.
all_missing_mask = train_sample.isna().all()
nan_columns = train_sample.columns[all_missing_mask]
print("Colonnes contenant uniquement des NaN :", nan_columns.tolist())

In [None]:
# Keep only the columns with at most 8000 missing values in the sample.
missing_per_column = train_sample.isna().sum()
train_sample = train_sample.loc[:, missing_per_column <= 8000]

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
train_sample.describe(include="all")

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Step 1: data preparation.
# Keep the numeric columns only.
train_sample_numeric = train_sample.select_dtypes(include=['number'])

# Fill missing values with the mean of each column.
train_sample_numeric = train_sample_numeric.fillna(train_sample_numeric.mean())

# Standardise the features before the PCA.
scaler = StandardScaler()
train_sample_scaled = scaler.fit_transform(train_sample_numeric)

# Step 2: run the PCA, keeping the first 5 principal components.
pca = PCA(n_components=5)
train_sample_pca = pca.fit_transform(train_sample_scaled)

# Wrap the projected data (scores) in a DataFrame for easier handling.
pc_names = [f'PC{i+1}' for i in range(train_sample_pca.shape[1])]
train_sample_pca_df = pd.DataFrame(train_sample_pca, columns=pc_names)

# Step 3: heatmap of the principal components.
# The scores of a PCA are mutually uncorrelated by construction, so a heatmap
# of `train_sample_pca_df.corr()` is always (numerically) the identity matrix
# and carries no information. Plot the loadings instead: they show how much
# each original feature contributes to each component.
pca_loadings = pd.DataFrame(
    pca.components_,
    index=pc_names,
    columns=train_sample_numeric.columns,
)
sns.heatmap(pca_loadings, cmap='coolwarm', center=0)
plt.title("Heatmap des composantes principales")
plt.show()

In [None]:
# Display the first five rows of the filtered sample.
train_sample.head(n=5)


In [None]:
# Inspect one specific row by position (hard-coded positional index).
x_train.iloc[1268326]

In [None]:
# Isolate the non-numeric (object / category) columns of the sample and preview them.
categorical_data = train_sample.select_dtypes(include=["object", "category"])
categorical_data.head(n=5)

In [None]:
# Names of the columns held in `categorical_data`.
categorical_data.columns

In [None]:
# Number of distinct values taken by the groundwater-level category column in the sample.
train_sample['piezo_groundwater_level_category'].nunique()

In [None]:
# Count the distinct values of every categorical column in the sample.
unique_counts = {
    name: train_sample[name].nunique() for name in categorical_data.columns
}

# Step 3: find the column with the largest number of classes.
# On ties the first column encountered wins.
max_classes_col, max_classes_value = max(
    unique_counts.items(), key=lambda item: item[1]
)

In [None]:
# Name of the categorical column with the most distinct values.
max_classes_col

In [None]:
import copy as cp
# Maximum share of missing values (in percent) tolerated for a column.
proportion_nan_prct = 19

# Share of missing values per column, in percent of the number of rows.
nan_share_prct = x_train.isnull().sum() * 100 / len(x_train)

# Keep every column that does not exceed the threshold. One vectorised column
# selection replaces the deep copy of the whole frame followed by one in-place
# drop per column; `.copy()` keeps `new_df` independent from `x_train`.
new_df = x_train.loc[:, ~(nan_share_prct > proportion_nan_prct)].copy()

new_df.head()


In [None]:
# Work on the column-filtered frame and isolate its non-numeric columns.
df = new_df
categorical_data = df.select_dtypes(
    include=["object", "category"],
)

In [None]:
# Preview the first five rows of the categorical columns.
categorical_data.head(n=5)


In [None]:
# Print a short preview (first five distinct values) of every categorical column.
categorical_columns = df.select_dtypes(include=["object", "category"]).columns
for col in categorical_columns:
    unique_classes = df[col].unique()  # distinct values, in order of appearance
    preview = unique_classes[:5]
    print(f"Variable '{col}': {preview}")

In [None]:
# NOTE: `df` is an alias of `x_train`, not a copy. The assignments below
# therefore modify `x_train` itself.
df = x_train

# Step 1: parse the station update date; unparseable values become NaT.
update_dates = pd.to_datetime(df["piezo_station_update_date"], errors="coerce")
df["piezo_station_update_date"] = update_dates

# Step 2: extract the month number.
df["month"] = update_dates.dt.month

# Step 3: define the seasons.
# Étape 3 : Définir les saisons
def get_season(month):
    """Return the French season name for a month number.

    Months 12, 1, 2 map to "Hiver"; 3-5 to "Printemps"; 6-8 to "Été";
    9-11 to "Automne". Any other value (including NaN) yields None.
    """
    months_by_season = (
        ("Hiver", (12, 1, 2)),
        ("Printemps", (3, 4, 5)),
        ("Été", (6, 7, 8)),
        ("Automne", (9, 10, 11)),
    )
    for season, months in months_by_season:
        if month in months:
            return season
    return None

# Label every row with its season.
season_labels = df["month"].apply(get_season)
df["season"] = season_labels

# Step 4: count the occurrences of each season.
season_counts = df["season"].value_counts()
print(season_counts)


In [None]:
# Inspect the station update date column of the training frame.
x_train["piezo_station_update_date"]

In [None]:
# Distinct values found in the month column.
df["month"].unique()

In [None]:
# Location of the test-set features inside the bucket.
BUCKET = "atoubert-ensae"
FILE_KEY_S3 = "X_test_Hi5.csv"
FILE_PATH_S3 = f"{BUCKET}/Hackathon Hiparis/{FILE_KEY_S3}"

# Read the CSV through the existing S3 filesystem client `fs`.
with fs.open(FILE_PATH_S3, mode="rb") as file_in:
    x_test = pd.read_csv(file_in, sep=",")

In [None]:
# Inspect the measurement date column of the training frame.
x_train["piezo_measurement_date"]

In [None]:
# NOTE: `df` is an alias of `x_train`; the new columns are written to `x_train`.
df = x_train
df['piezo_measurement_date'] = pd.to_datetime(df['piezo_measurement_date'])

# Build the month -> season lookup from a season -> months table.
months_by_season = {
    'Hiver': (1, 2, 12),
    'Printemps': (3, 4, 5),
    'Été': (6, 7, 8),
    'Automne': (9, 10, 11),
}
season_by_month = {
    month: season
    for season, months in months_by_season.items()
    for month in months
}

# Create the 'saison' column from the month of the measurement date.
measurement_month = df['piezo_measurement_date'].dt.month
df['saison'] = measurement_month.map(season_by_month)

# Count the number of observations per season.
count_par_saison = df.groupby('saison').size()

print(count_par_saison)


In [None]:
# Inspect the measurement date column of the current working frame.
df["piezo_measurement_date"]

In [None]:
# Inspect the measurement date column of the training frame.
x_train["piezo_measurement_date"]

In [None]:
# Preview the first rows of the month column.
df["month"].head()

In [None]:
# Inspect the station update date column of the current working frame.
df["piezo_station_update_date"]

In [None]:
# NOTE: `df` is an alias of `x_train`; the new column is written to `x_train`.
df = x_train

# Month number of the station update date.
# The column is 'piezo_station_update_date'; the previous key
# 'ezo_station_update_date' was a typo and raised a KeyError.
# NOTE(review): the explicit format only matters while the column still holds
# raw strings - confirm it matches the raw data.
df['mois'] = pd.to_datetime(
    df['piezo_station_update_date'],
    format='%a %b %d %H:%M:%S %Z %Y',
).dt.month

In [None]:
# Inspect the month column.
df["month"]

In [None]:
# Step 2: extract the month number from the station update date.
# The `.dt` accessor requires the column to already hold datetime values.
df["month"] = df["piezo_station_update_date"].dt.month

# Étape 3 : Définir les saisons
def get_season(month):
    """Map a month number to its French season name.

    Returns "Hiver" for 12, 1, 2; "Printemps" for 3-5; "Été" for 6-8;
    "Automne" for 9-11; and None for anything else (NaN included).
    """
    if month in (12, 1, 2):
        return "Hiver"
    if month in (3, 4, 5):
        return "Printemps"
    if month in (6, 7, 8):
        return "Été"
    if month in (9, 10, 11):
        return "Automne"
    return None

# Label every row with its season.
season_labels = df["month"].apply(get_season)
df["season"] = season_labels

# Step 4: count the occurrences of each season, then display the month column.
season_counts = df["season"].value_counts()
print(season_counts)
df["month"]

In [None]:
# NOTE: `df` is an alias of `x_test`; the new columns are written to `x_test`.
df = x_test
df['piezo_measurement_date'] = pd.to_datetime(df['piezo_measurement_date'])

# Build the month -> season lookup from a season -> months table.
months_by_season = {
    'Hiver': (1, 2, 12),
    'Printemps': (3, 4, 5),
    'Été': (6, 7, 8),
    'Automne': (9, 10, 11),
}
season_by_month = {
    month: season
    for season, months in months_by_season.items()
    for month in months
}

# Create the 'saison' column from the month of the measurement date.
measurement_month = df['piezo_measurement_date'].dt.month
df['saison'] = measurement_month.map(season_by_month)

# Count the number of observations per season.
count_par_saison = df.groupby('saison').size()

print(count_par_saison)
x_test["piezo_measurement_date"]

In [None]:
# Inspect the measurement date column of the test frame.
x_test["piezo_measurement_date"]

In [None]:
# Distinct measurement dates of the test frame (first occurrence kept), first 1000 shown.
unique_measurement_dates = x_test['piezo_measurement_date'].drop_duplicates()
unique_measurement_dates.head(1000)