In [1]:
# DataFrame Preprocess
import pandas as pd
import numpy as np

# Data Split
from sklearn.model_selection import train_test_split

# Pipeline processing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

#Visualization
import matplotlib.pyplot as plt

# Regression
import pyearth

In [2]:
df = pd.read_csv('boston.csv')
print(df.head())

      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296.0   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242.0   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242.0   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222.0   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222.0   

   PTRATIO       B  LSTAT  MEDV  
0     15.3  396.90   4.98  24.0  
1     17.8  396.90   9.14  21.6  
2     17.8  392.83   4.03  34.7  
3     18.7  394.63   2.94  33.4  
4     18.7  396.90   5.33  36.2  


In [3]:
print(df.describe())

             CRIM          ZN       INDUS        CHAS         NOX          RM  \
count  506.000000  506.000000  506.000000  506.000000  506.000000  506.000000   
mean     3.613524   11.363636   11.136779    0.069170    0.554695    6.284634   
std      8.601545   23.322453    6.860353    0.253994    0.115878    0.702617   
min      0.006320    0.000000    0.460000    0.000000    0.385000    3.561000   
25%      0.082045    0.000000    5.190000    0.000000    0.449000    5.885500   
50%      0.256510    0.000000    9.690000    0.000000    0.538000    6.208500   
75%      3.677083   12.500000   18.100000    0.000000    0.624000    6.623500   
max     88.976200  100.000000   27.740000    1.000000    0.871000    8.780000   

              AGE         DIS         RAD         TAX     PTRATIO           B  \
count  506.000000  506.000000  506.000000  506.000000  506.000000  506.000000   
mean    68.574901    3.795043    9.549407  408.237154   18.455534  356.674032   
std     28.148861    2.1057

In [4]:
# Compte le nombre de valeurs manquantes
print(df.isnull().sum())

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64


In [5]:
df.columns = df.columns.str.strip()
print(df.columns)

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'MEDV'],
      dtype='object')


In [6]:
X = df.drop('MEDV', axis=1)
y = df['MEDV']

In [7]:
# Select numerical columns only
X_num = X.select_dtypes(include=[np.number]).columns.to_list()

# Select categorical columns only
X_cat = X.select_dtypes(include=[object]).columns.to_list()

print(X_num)
print(X_cat)

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
[]


In [14]:
# Définir les transformations spécifiques pour les colonnes numériques et catégorielles
numeric_features = X_num
categorical_features = X_cat

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')), # Remplace les valeurs manquantes par la moyenne
    ('scaler', StandardScaler())]) # Standardise les valeurs (Moyenne = 0, Ecart-type = 1)

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # Remplace les valeurs manquantes par la valeur la plus fréquente
    ('onehot', OneHotEncoder(handle_unknown='ignore'))]) # Encodage One-Hot, ignore les valeurs inconnues qui n'ont pas été vues lors de l'entraînement

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features), # Applique les transformations numériques
        ('cat', categorical_transformer, categorical_features)]) # Applique les transformations catégorielles



In [None]:
model = pyearth.Mars(max_degree=2, penalty=3.0, endspan=5, feature_importance_type='rss'

AttributeError: module 'pyearth' has no attribute 'Mars'

In [None]:

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)])