In [1]:
import pandas as pd
import numpy as np
import os

In [8]:
os.makedirs('../data/raw', exist_ok=True)
os.makedirs('../data/processed', exist_ok=True)

In [None]:
np.random.seed(42)

n = 1000

data = pd.DataFrame({

    'age': np.random.randint(18, 60, n),
    'days_since_last_login': np.random.randint(0, 100, n),
    'login_frequency': np.random.randint(1, 50, n),
    'total_time_spent': np.random.randint(10, 500, n),
    'avg_session_time': np.random.uniform(5, 60, n),
    'courses_enrolled': np.random.randint(1, 10, n),
    'assignments_completed': np.random.randint(0, 20, n),
    'videos_watched': np.random.randint(1, 300, n),
    'forum_posts': np.random.randint(0, 50, n),
    'subscription_length': np.random.randint(1, 365, n),
    'preferred_time': np.random.choice(['morning', 'evening', 'night'], n),
    'student_type': np.random.choice(['active', 'passive'], n),
    'country': np.random.choice(['France','Germany','Spain','Italy'], n),
    'dropout': np.random.choice([0,1], n)
})

# Ajouter des valeurs manquantes
data.loc[np.random.choice(data.index, 100), 'age'] = np.nan

# Ajouter des valeurs aberrantes
data.loc[np.random.choice(data.index, 50), 'total_time_spent'] = -999

data.to_csv('../data/raw/student_data.csv', index=False)

data.head()

In [10]:
df = pd.read_csv('../data/raw/student_data.csv')

In [11]:
df.head()
df.info()
df.describe()
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   age          0 non-null      object
 1   study_hours  0 non-null      object
 2   attendance   0 non-null      object
 3   dropout      0 non-null      object
dtypes: object(4)
memory usage: 132.0+ bytes


Unnamed: 0,0
age,0
study_hours,0
attendance,0
dropout,0


In [14]:
# 1. Compter les valeurs manquantes
print("AVANT NETTOYAGE:")
print(df.isnull().sum())
print(f"Valeurs n√©gatives dans total_time_spent: {(df['total_time_spent'] < 0).sum()}")


AVANT NETTOYAGE:
age            0
study_hours    0
attendance     0
dropout        0
dtype: int64


KeyError: 'total_time_spent'

In [6]:
# 2. Cr√©er une copie pour le nettoyage
df_clean = df.copy()


In [9]:
# 3. Traiter les valeurs aberrantes de total_time_spent
# Remplacer -999 par NaN (pour les traiter comme manquantes)
df_clean.loc[df_clean['total_time_spent'] < 0, 'total_time_spent'] = np.nan
print(f"\nApr√®s remplacement des n√©gatifs: {df_clean['total_time_spent'].isnull().sum()} NaN dans total_time_spent")




Apr√®s remplacement des n√©gatifs: 47 NaN dans total_time_spent


In [10]:
# 4. Imputer les valeurs manquantes
# Pour age : remplacer par la m√©diane (moins sensible aux outliers que la moyenne)
median_age = df_clean['age'].median()
df_clean['age'].fillna(median_age, inplace=True)

# Pour total_time_spent : remplacer par la moyenne
mean_time = df_clean['total_time_spent'].mean()
df_clean['total_time_spent'].fillna(mean_time, inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean['age'].fillna(median_age, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean['total_time_spent'].fillna(mean_time, inplace=True)


In [11]:
# 5. V√©rifier qu'il n'y a plus de valeurs manquantes
print("\nAPR√àS NETTOYAGE:")
print(df_clean.isnull().sum())
print(f"Min total_time_spent: {df_clean['total_time_spent'].min()}")


APR√àS NETTOYAGE:
age                      0
days_since_last_login    0
login_frequency          0
total_time_spent         0
avg_session_time         0
courses_enrolled         0
assignments_completed    0
videos_watched           0
forum_posts              0
subscription_length      0
preferred_time           0
student_type             0
country                  0
dropout                  0
dtype: int64
Min total_time_spent: 10.0


In [14]:
# ============================================
# ENCODAGE DES VARIABLES CAT√âGORIELLES
# ============================================

print("="*50)
print("ENCODAGE DES DONN√âES")
print("="*50)


ENCODAGE DES DONN√âES


In [15]:
# 1. Identifier les types de variables
print("\nüîç Variables cat√©gorielles dans nos donn√©es:")
categorical_cols = df_clean.select_dtypes(include=['object']).columns.tolist()
for col in categorical_cols:
    print(f"   - {col}: {df_clean[col].unique().tolist()}")




üîç Variables cat√©gorielles dans nos donn√©es:
   - preferred_time: ['evening', 'night', 'morning']
   - student_type: ['active', 'passive']
   - country: ['Germany', 'Spain', 'Italy', 'France']


In [16]:
# 2. One-Hot Encoding (pour variables sans ordre)
print("\n‚úÖ Application de One-Hot Encoding...")
df_encoded = pd.get_dummies(
    df_clean, 
    columns=['preferred_time', 'student_type', 'country'],  # variables √† encoder
    drop_first=True,  # pour √©viter la redondance
    dtype=int
)




‚úÖ Application de One-Hot Encoding...


In [17]:
# 3. V√©rification
print(f"\nüìä Shape avant encodage: {df_clean.shape}")
print(f"üìä Shape apr√®s encodage: {df_encoded.shape}")
print(f"\nüìã Nouvelles colonnes cr√©√©es:")
new_cols = set(df_encoded.columns) - set(df_clean.columns)
for col in sorted(new_cols):
    print(f"   - {col}")




üìä Shape avant encodage: (1000, 14)
üìä Shape apr√®s encodage: (1000, 17)

üìã Nouvelles colonnes cr√©√©es:
   - country_Germany
   - country_Italy
   - country_Spain
   - preferred_time_morning
   - preferred_time_night
   - student_type_passive


In [18]:
# 4. Aper√ßu du r√©sultat
print("\nüëÄ Aper√ßu des donn√©es encod√©es (5 premi√®res lignes):")
print(df_encoded.head())




üëÄ Aper√ßu des donn√©es encod√©es (5 premi√®res lignes):
    age  days_since_last_login  login_frequency  total_time_spent  \
0  56.0                     28               49         107.00000   
1  46.0                     46               49          74.00000   
2  32.0                     67               44         256.87723   
3  25.0                     75               28         350.00000   
4  38.0                     44               47          14.00000   

   avg_session_time  courses_enrolled  assignments_completed  videos_watched  \
0         35.193925                 8                      1             266   
1         17.060068                 4                     17              95   
2          6.851805                 5                      8             262   
3         16.804544                 2                      5              71   
4         27.886679                 9                     10             219   

   forum_posts  subscription_length  dropout

In [22]:
# 5. Sauvegarde
df_encoded.to_csv('../data/processed/student_data_encoded.csv', index=False)
print("\nüíæ Donn√©es encod√©es sauvegard√©es dans data/processed/")


üíæ Donn√©es encod√©es sauvegard√©es dans data/processed/
