In [None]:
import os
import s3fs
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder


In [None]:
# Connect to the S3-compatible object store whose host is given by the
# AWS_S3_ENDPOINT environment variable (raises KeyError if it is not set).
S3_ENDPOINT_URL = f"https://{os.environ['AWS_S3_ENDPOINT']}"
fs = s3fs.S3FileSystem(client_kwargs=dict(endpoint_url=S3_ENDPOINT_URL))

# Location of the training features: "<bucket>/<object key>".
BUCKET = "ebahri-ensae"
FILE_KEY_S3 = "X_train_Hi5.csv"
FILE_PATH_S3 = f"{BUCKET}/{FILE_KEY_S3}"

# Stream the CSV straight from the bucket into a DataFrame.
with fs.open(FILE_PATH_S3, mode="rb") as file_in:
    x_train = pd.read_csv(file_in, sep=",")

In [None]:
# Preview the first five rows of the raw training frame.
x_train.head(n=5)

In [None]:
# Parse the station update date as datetime; values that cannot be parsed
# become NaT because of errors='coerce'.
date_col = 'piezo_station_update_date'
x_train[date_col] = pd.to_datetime(x_train[date_col], errors='coerce')

# Show the first few converted values to confirm the conversion.
print(x_train[date_col].head())

In [None]:
# Print the frame summary: column names, non-null counts, dtypes and memory usage.
x_train.info()


In [None]:

# Summary statistics for every column (numeric and non-numeric),
# transposed so that each row describes one column.
summary_stats = x_train.describe(include='all').transpose()
print(summary_stats)

In [None]:
# Split the column names by dtype family: numeric, object (categorical) and datetime.
numeric_cols = x_train.select_dtypes(include='number').columns
categorical_cols = x_train.select_dtypes(include='object').columns
datetime_cols = x_train.select_dtypes(include='datetime').columns

# Report each group with its label.
for label, cols in (
    ("Numeric Columns:", numeric_cols),
    ("Categorical Columns:", categorical_cols),
    ("Datetime Columns:", datetime_cols),
):
    print(label, cols)

In [None]:
# Inspect the distribution of the target column.
target_series = x_train['piezo_groundwater_level_category']

# Frequency of each distinct value.
value_counts = target_series.value_counts()
print("Value counts:", value_counts, sep="\n")

# Distinct values, in order of first appearance.
unique_values = target_series.unique()
print("\nUnique values:", unique_values, sep="\n")

In [None]:
# Keep only the columns whose dtype is not numeric.
non_numeric_columns = x_train.select_dtypes(exclude='number')

# List their names and how many there are.
non_numeric_names = non_numeric_columns.columns
print(non_numeric_names)
print("number", len(non_numeric_names))

In [None]:
import pandas as pd

# Lift every pandas display limit (columns, rows, line width, cell width) so
# the full report below is printed without truncation. These options are
# global and stay in effect for the following cells.
for option_name in (
    'display.max_columns',
    'display.max_rows',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(option_name, None)

# Share of missing values per column, expressed as a percentage.
nan_percentage = x_train.isna().mean().mul(100)

# Most incomplete columns first.
print(nan_percentage.sort_values(ascending=False))

In [None]:
# Inspect the distribution of the station update date column.
update_date_series = x_train['piezo_station_update_date']

# Frequency of each distinct value.
value_counts = update_date_series.value_counts()
print("Value counts:", value_counts, sep="\n")

# Distinct values, in order of first appearance.
unique_values = update_date_series.unique()
print("\nUnique values:", unique_values, sep="\n")

In [None]:
import seaborn as sns

# Heatmap of the pairwise correlations between all numeric columns.
numeric_corr = x_train[numeric_cols].corr()

fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(numeric_corr, annot=False, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

In [None]:
# Keep only the strong correlations (absolute value above 0.7);
# every other cell becomes NaN and is masked out of the plot.
corr_matrix = x_train[numeric_cols].corr()
high_corr = corr_matrix.where(corr_matrix.abs() > 0.7)

fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(
    high_corr,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
    mask=high_corr.isna(),
    ax=ax,
)
ax.set_title("High Correlation Matrix (>|0.7|)")
plt.show()

In [None]:
# Encode the target categories as integer class labels (one label per row of x_train).
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(x_train['piezo_groundwater_level_category'])


# Random Forest to estimate feature importance.
# Only numeric columns are fed to the model, so completeness is checked on
# those columns alone. The same row mask is applied to the features and to the
# labels: dropping incomplete rows from the features only leaves X and y with
# different lengths, which makes `fit` raise.
rf_features = x_train.select_dtypes(include=['number'])
rf_complete_rows = rf_features.notna().all(axis=1).to_numpy()
if not rf_complete_rows.any():
    raise ValueError(
        "No row has all numeric features present; "
        "impute or drop sparse columns before fitting."
    )

rf = RandomForestClassifier(random_state=42)
rf.fit(rf_features.loc[rf_complete_rows], y_train[rf_complete_rows])

# Display feature importance; names come from the matrix actually fitted so
# they always line up with `feature_importances_`.
feature_importance = pd.DataFrame({
    'Feature': rf_features.columns,
    'Importance': rf.feature_importances_
}).sort_values(by='Importance', ascending=False)

print(feature_importance)

In [None]:

# Columns whose distribution should be plotted, one panel per column.
subset_columns = ['piezo_groundwater_level_category']  # replace with actual column names

# DataFrame.hist only accepts numeric/datetime columns and raises when given
# a non-numeric one, so the plot type is picked from the dtype:
# a 30-bin histogram for numeric columns, a bar chart of value counts otherwise.
fig, axes = plt.subplots(1, len(subset_columns), figsize=(15, 5), squeeze=False)
for ax, column_name in zip(axes.ravel(), subset_columns):
    column_values = x_train[column_name]
    if pd.api.types.is_numeric_dtype(column_values):
        column_values.hist(bins=30, ax=ax)
    else:
        column_values.value_counts().plot.bar(ax=ax)
        ax.set_ylabel("Count")
    ax.set_title(column_name)

plt.tight_layout()
plt.show()

In [None]:
# Draw one box per column of the frame (wide-form input) to spot outliers.
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(data=x_train, ax=ax)
plt.show()