In [2]:
## 1. Importations des couches Application et Infrastructure
import pandas as pd
from churn_gym.infrastructure.data_sources.csv_loader_pandas import PandasCSVLoader
from churn_gym.infrastructure.ml.preprocessing.preprocessing_pipeline import BasicPreprocessingPipeline
from churn_gym.infrastructure.ml.preprocessing.robust_preprocessing_pipeline import (RobustPreprocessingPipeline)
from churn_gym.infrastructure.ml.features.advanced_feature_engineering import AdvancedFeatureEngineeringPipeline

from churn_gym.application.use_cases.load_dataset import LoadDatasetUseCase
from churn_gym.application.use_cases.preprocess_dataset import PreprocessDatasetUseCase
from churn_gym.application.use_cases.build_features import BuildFeaturesUseCase


## 2. Chargement du Dataset (Ingestion)
# Initialisation du loader et du cas d'utilisation
loader = PandasCSVLoader("../data/gym_churn.csv")
load_uc = LoadDatasetUseCase(loader)

# Exécution du chargement
raw_records = load_uc.execute()

# Visualisation des deux premiers enregistrements bruts
print(f"Nombre d'enregistrements chargés : {len(raw_records)}")
display(raw_records[:2])


## 3. Initialisation du pipeline de preprocessing et du cas d'utilisation
#preprocess_uc = PreprocessDatasetUseCase(
#    pipeline=BasicPreprocessingPipeline()
#)
preprocess_uc = PreprocessDatasetUseCase(
    pipeline=RobustPreprocessingPipeline()
)

# Transformation en entités du domaine (MemberRecord)
member_records = preprocess_uc.execute(raw_records)

print("Aperçu des données après prétraitement :")
display(member_records[:2])

## 4. Ingénierie des Caractéristiques (Feature Engineering)
# Initialisation de l'ingénierie avancée et du cas d'utilisation
feature_uc = BuildFeaturesUseCase(
    pipeline=AdvancedFeatureEngineeringPipeline()
)

# Génération des vecteurs de caractéristiques (features)
feature_vectors = feature_uc.execute(member_records)

print("Vecteurs de caractéristiques prêts pour l'entraînement :")
display(feature_vectors[:2])

## 5. Analyse de Qualité et Validation
# Conversion en DataFrame pour l'analyse
df = pd.DataFrame([fv.__dict__ for fv in raw_records])

print("--- Types de données détectés ---")
display(df.dtypes)

print("\n--- Analyse des valeurs manquantes ---")
display(df.isna().sum())

print("\n--- Statistiques descriptives ---")
display(df.describe())

Nombre d'enregistrements chargés : 150


[RawMemberRecord(member_id='1', name=nan, age=19.0, gender='Male', address='Street 171, City 39', phone_number='032-51510359', membership_type='Quarterly', join_date=datetime.date(2022, 7, 23), last_visit_date=datetime.date(2022, 11, 3), favorite_exercise='Pull-ups', avg_workout_duration_min=93, avg_calories_burned=214.0, total_weight_lifted_kg=13995.0, visits_per_month=18.0, churn='No'),
 RawMemberRecord(member_id='2', name='Shanza', age=19.0, gender='Female', address='Street 111, City 18', phone_number='039-19243328', membership_type='Monthly', join_date=datetime.date(2023, 12, 4), last_visit_date=datetime.date(2024, 2, 14), favorite_exercise='Squats', avg_workout_duration_min=37, avg_calories_burned=436.0, total_weight_lifted_kg=4612.0, visits_per_month=11.0, churn='No')]

Aperçu des données après prétraitement :


[MemberRecord(member_id='1', age=19.0, gender='male', membership_type='Quarterly', join_date=datetime.date(2022, 7, 23), last_visit_date=datetime.date(2022, 11, 3), favorite_exercise='Pull-ups', avg_workout_duration_min=93.0, avg_calories_burned=214.0, total_weight_lifted_kg=13995.0, visits_per_month=18.0, churn=0),
 MemberRecord(member_id='2', age=19.0, gender='female', membership_type='Monthly', join_date=datetime.date(2023, 12, 4), last_visit_date=datetime.date(2024, 2, 14), favorite_exercise='Squats', avg_workout_duration_min=37.0, avg_calories_burned=436.0, total_weight_lifted_kg=4612.0, visits_per_month=11.0, churn=0)]

Vecteurs de caractéristiques prêts pour l'entraînement :


[FeatureVector(member_id='1', age=19.0, avg_workout_duration_min=93.0, avg_calories_burned=214.0, total_weight_lifted_kg=13995.0, visits_per_month=18.0, gender='male', membership_type='Quarterly', favorite_exercise='Pull-ups', tenure_days=103, days_since_last_visit=1142, visit_recency_bucket='stale', tenure_bucket='medium', calories_per_minute=2.3010752688172045, weight_per_visit=777.5, churn=0),
 FeatureVector(member_id='2', age=19.0, avg_workout_duration_min=37.0, avg_calories_burned=436.0, total_weight_lifted_kg=4612.0, visits_per_month=11.0, gender='female', membership_type='Monthly', favorite_exercise='Squats', tenure_days=72, days_since_last_visit=674, visit_recency_bucket='stale', tenure_bucket='short', calories_per_minute=11.783783783783784, weight_per_visit=419.27272727272725, churn=0)]

--- Types de données détectés ---


member_id                    object
name                         object
age                         float64
gender                       object
address                      object
phone_number                 object
membership_type              object
join_date                    object
last_visit_date              object
favorite_exercise            object
avg_workout_duration_min      int64
avg_calories_burned         float64
total_weight_lifted_kg      float64
visits_per_month            float64
churn                        object
dtype: object


--- Analyse des valeurs manquantes ---


member_id                    0
name                        23
age                         13
gender                       0
address                      0
phone_number                 0
membership_type              0
join_date                    9
last_visit_date              0
favorite_exercise            0
avg_workout_duration_min     0
avg_calories_burned         11
total_weight_lifted_kg       8
visits_per_month            12
churn                        0
dtype: int64


--- Statistiques descriptives ---


Unnamed: 0,age,avg_workout_duration_min,avg_calories_burned,total_weight_lifted_kg,visits_per_month
count,137.0,150.0,139.0,142.0,138.0
mean,35.905109,73.326667,488.309353,9486.978873,14.224638
std,11.654892,25.87956,168.649317,5127.955128,6.385451
min,18.0,30.0,209.0,1009.0,4.0
25%,25.0,53.0,341.5,5274.0,8.0
50%,36.0,74.0,496.0,9072.5,14.0
75%,46.0,94.5,627.5,13620.75,20.0
max,54.0,119.0,788.0,19722.0,24.0


In [4]:
len(member_records[0].__dict__)
len(feature_vectors[0].__dict__)

16

In [23]:
from churn_gym.infrastructure.ml.preprocessing.robust_preprocessing_pipeline import (RobustPreprocessingPipeline)
from churn_gym.infrastructure.data_sources.csv_loader_pandas import PandasCSVLoader
from churn_gym.application.use_cases.load_dataset import LoadDatasetUseCase

# 1. here i'm loading the dataset use pandas 
loader = PandasCSVLoader("../data/gym_churn.csv")
load_uc = LoadDatasetUseCase(loader)
raw_records = load_uc.execute()

# 2. here i'm applying the robust preprocessing
preprocess_uc = PreprocessDatasetUseCase(
    pipeline=RobustPreprocessingPipeline()
)

raw_records_processed = preprocess_uc.execute(raw_records)


import pandas as pd

df = pd.DataFrame([fv.__dict__ for fv in raw_records_processed])

df.isna().sum()

#feature_vectors = feature_uc.execute(member_records)

#display(feature_vectors[:2])




member_id                    0
age                         13
gender                       0
membership_type              0
join_date                    9
last_visit_date              0
favorite_exercise            0
avg_workout_duration_min     0
avg_calories_burned         11
total_weight_lifted_kg       8
visits_per_month            12
churn                        0
dtype: int64

In [24]:
from pandas import DataFrame

data = DataFrame([fv.__dict__ for fv in raw_records])
display(data)

mask_age_null = data.age.isna()

d = data[mask_age_null].copy(deep=True)

def transform(x):
    return str(x).strip().lower() if x else "UNKNOWN"



d["gender"] = d["gender"].apply(lambda x : transform(x))

print(d["gender"].value_counts())

print(print(data["gender"].value_counts()))

Unnamed: 0,member_id,name,age,gender,address,phone_number,membership_type,join_date,last_visit_date,favorite_exercise,avg_workout_duration_min,avg_calories_burned,total_weight_lifted_kg,visits_per_month,churn
0,1,,19.0,Male,"Street 171, City 39",032-51510359,Quarterly,2022-07-23,2022-11-03,Pull-ups,93,214.0,13995.0,18.0,No
1,2,Shanza,19.0,Female,"Street 111, City 18",039-19243328,Monthly,2023-12-04,2024-02-14,Squats,37,436.0,4612.0,11.0,No
2,3,Ubaidullah,52.0,Male,"Street 69, City 21",033-60221501,Quarterly,2024-09-13,2024-12-07,Bench Press,98,523.0,3124.0,4.0,Yes
3,4,Mansoor Ahmed,52.0,Male,"Street 72, City 39",035-55527902,Monthly,2024-06-05,2025-02-24,Pull-ups,66,282.0,4586.0,24.0,No
4,5,Hanzala,32.0,Male,"Street 6, City 12",038-93946322,Monthly,2022-06-15,2022-12-02,Bench Press,32,522.0,16353.0,18.0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,146,Mohammed,54.0,Male,"Street 2, City 15",036-24733357,Quarterly,2023-12-10,2024-04-27,Pull-ups,64,733.0,6082.0,14.0,No
146,147,Haseena,45.0,Female,"Street 31, City 22",038-73797777,Monthly,2023-02-10,2023-04-02,Bench Press,95,641.0,7375.0,12.0,No
147,148,Hassaan,54.0,Male,"Street 121, City 31",032-16612823,Quarterly,2023-03-27,2023-12-07,Deadlift,80,471.0,6138.0,14.0,No
148,149,Najat,26.0,Female,"Street 190, City 9",034-49625944,Monthly,2023-05-05,2024-03-26,Bench Press,50,473.0,18336.0,4.0,Yes


gender
female    8
male      5
Name: count, dtype: int64
gender
Male      75
Female    75
Name: count, dtype: int64
None


In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   member_id                 150 non-null    object 
 1   name                      127 non-null    object 
 2   age                       137 non-null    float64
 3   gender                    150 non-null    object 
 4   address                   150 non-null    object 
 5   phone_number              150 non-null    object 
 6   membership_type           150 non-null    object 
 7   join_date                 141 non-null    object 
 8   last_visit_date           150 non-null    object 
 9   favorite_exercise         150 non-null    object 
 10  avg_workout_duration_min  150 non-null    int64  
 11  avg_calories_burned       139 non-null    float64
 12  total_weight_lifted_kg    142 non-null    float64
 13  visits_per_month          138 non-null    float64
 14  churn     

In [None]:
mask_age_null = df.age.isna()

df[mask_age_null]

In [8]:
mask_age_null = df.age.isna()

df[mask_age_null]

Unnamed: 0,member_id,age,gender,membership_type,join_date,last_visit_date,favorite_exercise,avg_workout_duration_min,avg_calories_burned,total_weight_lifted_kg,visits_per_month,churn
9,10,,female,Monthly,2022-03-24,2022-06-26,Pull-ups,74.0,,2113.0,19.0,0
24,25,,female,Quarterly,2024-05-26,2025-05-22,Squats,30.0,256.0,1009.0,18.0,0
31,32,,male,Quarterly,2022-04-24,2022-11-28,Cycling,109.0,,8684.0,10.0,0
32,33,,female,Monthly,2024-08-19,2024-10-04,Bench Press,33.0,,8167.0,19.0,0
45,46,,male,Quarterly,2022-01-21,2022-08-13,Treadmill,74.0,209.0,,6.0,1
52,53,,male,Monthly,2022-07-18,2022-09-07,Bench Press,57.0,323.0,9151.0,11.0,0
77,78,,female,Monthly,2023-06-18,2023-10-03,Bench Press,61.0,632.0,1492.0,5.0,1
86,87,,male,Monthly,,2023-02-20,Squats,72.0,,8466.0,20.0,0
98,99,,female,Monthly,2022-05-10,2022-12-07,Treadmill,61.0,372.0,5239.0,12.0,0
102,103,,female,Monthly,2022-12-11,2023-01-18,Bench Press,75.0,718.0,3067.0,18.0,0


In [None]:

import pandas as pd

df = pd.DataFrame([fv.__dict__ for fv in feature_vectors])

df.isna().sum()

Unnamed: 0,member_id,age,avg_workout_duration_min,avg_calories_burned,total_weight_lifted_kg,visits_per_month,gender,membership_type,favorite_exercise,tenure_days,days_since_last_visit,visit_recency_bucket,tenure_bucket,calories_per_minute,weight_per_visit,churn
0,1,19.0,93,214.0,13995.0,18.0,Male,Quarterly,Pull-ups,103.0,1141,stale,medium,2.301075,777.500000,False
1,2,19.0,37,436.0,4612.0,11.0,Female,Monthly,Squats,72.0,673,stale,short,11.783784,419.272727,False
2,3,52.0,98,523.0,3124.0,4.0,Male,Quarterly,Bench Press,85.0,376,stale,short,5.336735,781.000000,True
3,4,52.0,66,282.0,4586.0,24.0,Male,Monthly,Pull-ups,264.0,297,stale,medium,4.272727,191.083333,False
4,5,32.0,32,522.0,16353.0,18.0,Male,Monthly,Bench Press,170.0,1112,stale,medium,16.312500,908.500000,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,146,54.0,64,733.0,6082.0,14.0,Male,Quarterly,Pull-ups,139.0,600,stale,medium,11.453125,434.428571,False
146,147,45.0,95,641.0,7375.0,12.0,Female,Monthly,Bench Press,51.0,991,stale,short,6.747368,614.583333,False
147,148,54.0,80,471.0,6138.0,14.0,Male,Quarterly,Deadlift,255.0,742,stale,medium,5.887500,438.428571,False
148,149,26.0,50,473.0,18336.0,4.0,Female,Monthly,Bench Press,326.0,632,stale,medium,9.460000,4584.000000,True


In [1]:
from churn_gym.infrastructure.ml.preprocessing.robust_preprocessing_pipeline import (
    RobustPreprocessingPipeline
)
from churn_gym.application.use_cases.preprocess_dataset import PreprocessDatasetUseCase

from churn_gym.application.use_cases.build_features import BuildFeaturesUseCase
from churn_gym.infrastructure.ml.features.advanced_feature_engineering import (
    AdvancedFeatureEngineeringPipeline
)


from churn_gym.infrastructure.ml.preprocessing.robust_preprocessing_pipeline import (
    RobustPreprocessingPipeline
)


loader = PandasCSVLoader("../data/gym_churn.csv")
load_uc = LoadDatasetUseCase(loader)

raw_records = load_uc.execute()

preprocess_uc = PreprocessDatasetUseCase(
    pipeline=RobustPreprocessingPipeline()
)

member_records = preprocess_uc.execute(raw_records)


import pandas as pd

df = pd.DataFrame([fv.__dict__ for fv in feature_vectors])

df.isna().sum()

NameError: name 'PandasCSVLoader' is not defined