### Importation des Librairies et des fichiers : 

In [5]:
import pandas as pd

In [7]:
LFB2 = pd.read_csv('LFB2.csv')

In [8]:
LFB2.head()

Unnamed: 0.1,Unnamed: 0,CalYear,HourOfCall,StopCodeDescription,PropertyCategory,Easting_rounded,Northing_rounded,FirstPumpArriving_AttendanceTime,DeployedFromStation_Name,Meteo,Visibility
0,0,2009,0,Special Service,Road Vehicle,528650,176850,319,Battersea,météo très défavorable,moyenne
1,1,2009,0,Secondary Fire,Outdoor,533750,194450,308,Edmonton,météo très défavorable,moyenne
2,2,2009,0,Secondary Fire,Outdoor,507750,182850,210,Hillingdon,météo très défavorable,moyenne
3,3,2009,0,Secondary Fire,Outdoor,531050,185350,233,Holloway,météo très défavorable,moyenne
4,4,2009,0,AFA,Dwelling,529450,185250,172,Kentish Town,météo très défavorable,moyenne


In [9]:
LFB2.drop(columns = ['Unnamed: 0'], inplace = True)

In [17]:
LFB2.head()

Unnamed: 0,CalYear,HourOfCall,StopCodeDescription,PropertyCategory,Easting_rounded,Northing_rounded,FirstPumpArriving_AttendanceTime,DeployedFromStation_Name,Meteo,Visibility
0,2009,0,Special Service,Road Vehicle,528650,176850,319,Battersea,météo très défavorable,moyenne
1,2009,0,Secondary Fire,Outdoor,533750,194450,308,Edmonton,météo très défavorable,moyenne
2,2009,0,Secondary Fire,Outdoor,507750,182850,210,Hillingdon,météo très défavorable,moyenne
3,2009,0,Secondary Fire,Outdoor,531050,185350,233,Holloway,météo très défavorable,moyenne
4,2009,0,AFA,Dwelling,529450,185250,172,Kentish Town,météo très défavorable,moyenne


### Descriptif de la variable cible afin de vérifier la suppression correcte des outliers :

In [25]:
LFB2['FirstPumpArriving_AttendanceTime'].describe()

count    1.541635e+06
mean     3.117042e+02
std      1.217369e+02
min      1.000000e+00
25%      2.310000e+02
50%      2.960000e+02
75%      3.760000e+02
max      7.800000e+02
Name: FirstPumpArriving_AttendanceTime, dtype: float64

### Split entre la variable cible et les variables explicatives :

In [27]:
feats =  LFB2.drop(columns = ['FirstPumpArriving_AttendanceTime'], axis = 1)
target = LFB2['FirstPumpArriving_AttendanceTime']

In [28]:
# Compte des valeurs manquantes de chaque variables explicatives 
feats.isna().sum()

CalYear                       0
HourOfCall                    0
StopCodeDescription           0
PropertyCategory              0
Easting_rounded               0
Northing_rounded              0
DeployedFromStation_Name      7
Meteo                       238
Visibility                  238
dtype: int64

### Création d'un simple imputer pour remplacer les valeurs manquantes par leur mode :

In [30]:
from sklearn.impute import SimpleImputer

In [31]:
import numpy as np 
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

### Séparation des variables en un jeu d'entraînement et un jeu de test :

In [33]:
from sklearn.model_selection import train_test_split

In [36]:
X_train, X_test, y_train, y_test = train_test_split(feats, target, test_size = 0.2,
                                                    random_state = 42)
if isinstance(X_train, np.ndarray):
    X_train = pd.DataFrame(X_train, columns=feats.columns)
    X_test = pd.DataFrame(X_test, columns=feats.columns)

#### Remplacement des valeurs manquantes :

In [40]:
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

### Encodage des Variables catégorielles avec Frequency Encoding :

In [42]:
columns = ["DeployedFromStation_Name","Easting_rounded","Northing_rounded"]

In [43]:
X_train = pd.DataFrame(X_train, columns= ['CalYear','HourOfCall','StopCodeDescription',
                                          'PropertyCategory','Easting_rounded','Northing_rounded',
                                          'DeployedFromStation_Name','Meteo','Visibility'])
X_test = pd.DataFrame(X_test, columns=['CalYear','HourOfCall','StopCodeDescription',
                                          'PropertyCategory','Easting_rounded','Northing_rounded',
                                          'DeployedFromStation_Name','Meteo','Visibility'])

In [47]:
frequencies = {}

for col in columns:
    frequencies[col] = X_train[col].value_counts(normalize=True).to_dict()
    frequencies[col]["Unknown"] = 0 

X_train_encoded = X_train.copy()
X_test_encoded = X_test.copy()

for col in columns:
    X_train_encoded[col] = X_train_encoded[col].map(frequencies[col])
    X_test_encoded[col] = X_test_encoded[col].map(frequencies[col]).fillna(frequencies[col]["Unknown"])

In [48]:
print(type(X_train))
print(X_train.shape)  
print(X_test.shape)

<class 'pandas.core.frame.DataFrame'>
(1233308, 9)
(308327, 9)


In [50]:
type(X_train)

pandas.core.frame.DataFrame

### Encodage des variables ordinales avec Ordinal Encoding :

In [52]:
from sklearn.preprocessing import OrdinalEncoder

In [53]:
encoder1 = OrdinalEncoder(categories = [['météo très défavorable', 'météo défavorable',
                                         'météo correcte', 'météo favorable', 'météo idéale']])
encoder2 = OrdinalEncoder(categories = [['très mauvaise', 'moyenne', 'très bonne']])

In [54]:
X_train_encoded['Meteo_encoded'] = encoder1.fit_transform(X_train_encoded[['Meteo']])
X_test_encoded['Meteo_encoded'] = encoder1.transform(X_test_encoded[['Meteo']])

In [55]:
X_train_encoded['Visibility_encoded'] = encoder2.fit_transform(X_train_encoded[['Visibility']])
X_test_encoded['Visibility_encoded'] = encoder2.transform(X_test_encoded[['Visibility']])

### Encodage des variables catégorielles à moins de 15 valeurs avec OneHotEncoding 

In [57]:
from sklearn.preprocessing import OneHotEncoder
col_cat = ['StopCodeDescription','PropertyCategory']
encoder = OneHotEncoder(drop='first', sparse_output = False, handle_unknown = 'ignore')
X_train_code = encoder.fit_transform(X_train_encoded[col_cat])
X_test_code = encoder.transform(X_test_encoded[col_cat])
encoded_columns = encoder.get_feature_names_out(col_cat)
X_train_code_df = pd.DataFrame(X_train_code, columns = encoded_columns)
X_test_code_df = pd.DataFrame(X_test_code, columns = encoded_columns)
X_train_encoded = X_train_encoded.reset_index(drop = True)
X_train_code_df = X_train_code_df.reset_index(drop = True)

X_test_encoded = X_test_encoded.reset_index(drop = True)
X_test_code_df = X_test_code_df.reset_index(drop = True)

### Standardisation des données numériques :

In [62]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
X_train_scaled = scaler.fit_transform(X_train_scaled[['HourOfCall', 'CalYear']])
X_test_scaled = scaler.transform(X_test_scaled[['HourOfCall','CalYear']])
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns = ['Hour','Year'])
X_train_scaled_df = X_train_scaled_df.reset_index(drop = True)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns = ['Hour','Year'])
X_test_scaled_df = X_test_scaled_df.reset_index(drop = True)

### Concaténation des DataFrames :

In [67]:
X_train = pd.concat([X_train_encoded, X_train_code_df, X_train_scaled_df], axis = 1)
X_test = pd.concat([X_test_encoded, X_test_code_df, X_test_scaled_df], axis = 1)

In [75]:
X_train.head()

Unnamed: 0,CalYear,HourOfCall,StopCodeDescription,PropertyCategory,Easting_rounded,Northing_rounded,DeployedFromStation_Name,Meteo,Visibility,Meteo_encoded,...,PropertyCategory_Boat,PropertyCategory_Dwelling,PropertyCategory_Non Residential,PropertyCategory_Other Residential,PropertyCategory_Outdoor,PropertyCategory_Outdoor Structure,PropertyCategory_Rail Vehicle,PropertyCategory_Road Vehicle,Hour,Year
0,2019,20,AFA,Other Residential,0.001883,0.003372,0.006311,météo correcte,moyenne,2.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.047141,0.530967
1,2014,18,False alarm - Good intent,Dwelling,0.004413,0.001692,0.007733,météo favorable,moyenne,3.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.728869,-0.501276
2,2017,16,Special Service,Dwelling,0.003442,0.00753,0.016697,météo défavorable,moyenne,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.410597,0.11807
3,2010,0,AFA,Non Residential,0.003616,0.002784,0.007116,météo favorable,moyenne,3.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-2.13558,-1.32707
4,2024,15,Secondary Fire,Outdoor,0.001578,0.002732,0.009881,météo favorable,moyenne,3.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.251461,1.56321


In [77]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1233308 entries, 0 to 1233307
Data columns (total 29 columns):
 #   Column                                              Non-Null Count    Dtype  
---  ------                                              --------------    -----  
 0   CalYear                                             1233308 non-null  object 
 1   HourOfCall                                          1233308 non-null  object 
 2   StopCodeDescription                                 1233308 non-null  object 
 3   PropertyCategory                                    1233308 non-null  object 
 4   Easting_rounded                                     1233308 non-null  float64
 5   Northing_rounded                                    1233308 non-null  float64
 6   DeployedFromStation_Name                            1233308 non-null  float64
 7   Meteo                                               1233308 non-null  object 
 8   Visibility                                          

In [79]:
X_train['Meteo_encoded'].unique()

array([2., 3., 1., 4., 0.])

### Suppression des colonnes originale des variables encodées : 

In [82]:
X_train_enc = X_train.drop(['StopCodeDescription','PropertyCategory', 'Meteo', 'Visibility', 'HourOfCall','CalYear'], axis = 1)
X_test_enc = X_test.drop(['StopCodeDescription','PropertyCategory', 'Meteo', 'Visibility', 'HourOfCall','CalYear'], axis = 1)

In [84]:
display(X_train_enc.head())
display(X_train_enc.info())

Unnamed: 0,Easting_rounded,Northing_rounded,DeployedFromStation_Name,Meteo_encoded,Visibility_encoded,StopCodeDescription_Chimney Fire,StopCodeDescription_False alarm - Good intent,StopCodeDescription_False alarm - Malicious,StopCodeDescription_Late Call,StopCodeDescription_Primary Fire,...,PropertyCategory_Boat,PropertyCategory_Dwelling,PropertyCategory_Non Residential,PropertyCategory_Other Residential,PropertyCategory_Outdoor,PropertyCategory_Outdoor Structure,PropertyCategory_Rail Vehicle,PropertyCategory_Road Vehicle,Hour,Year
0,0.001883,0.003372,0.006311,2.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.047141,0.530967
1,0.004413,0.001692,0.007733,3.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.728869,-0.501276
2,0.003442,0.00753,0.016697,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.410597,0.11807
3,0.003616,0.002784,0.007116,3.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-2.13558,-1.32707
4,0.001578,0.002732,0.009881,3.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.251461,1.56321


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1233308 entries, 0 to 1233307
Data columns (total 23 columns):
 #   Column                                              Non-Null Count    Dtype  
---  ------                                              --------------    -----  
 0   Easting_rounded                                     1233308 non-null  float64
 1   Northing_rounded                                    1233308 non-null  float64
 2   DeployedFromStation_Name                            1233308 non-null  float64
 3   Meteo_encoded                                       1233308 non-null  float64
 4   Visibility_encoded                                  1233308 non-null  float64
 5   StopCodeDescription_Chimney Fire                    1233308 non-null  float64
 6   StopCodeDescription_False alarm - Good intent       1233308 non-null  float64
 7   StopCodeDescription_False alarm - Malicious         1233308 non-null  float64
 8   StopCodeDescription_Late Call                       

None

### Importation des fichiers en format csv (objectif : partir des mêmes données) 

In [100]:
X_train_enc.to_csv('X_train_enc.csv')
X_test_enc.to_csv('X_test_enc.csv')

In [101]:
y_train.to_csv('y_train_reg.csv')
y_test.to_csv('y_test_reg.csv')