In [None]:
# Data Manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Model Selection
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

# Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OrdinalEncoder, 
    StandardScaler, 
    OneHotEncoder, 
    MinMaxScaler
)

# Models
from sklearn.linear_model import (
    LinearRegression, 
    Ridge
)

from sklearn.ensemble import (
    RandomForestRegressor, 
    GradientBoostingRegressor, 
    VotingRegressor, 
    StackingRegressor
)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Model Selection
from sklearn.model_selection import train_test_split


# Data Analysis

In [None]:
# Load the training and test CSV files (paths are relative to the working directory).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'home-data-for-ml-course/train.csv',
        'home-data-for-ml-course/test.csv',
    )
)

In [None]:
# Names of the numeric columns of the training frame
X_num = list(train_df.select_dtypes(include=[np.number]).columns)

# Names of the object-dtype columns of the training frame
X_cat = list(train_df.select_dtypes(include=[object]).columns)

# Show both lists, one per line
print(X_num, X_cat, sep='\n')

In [None]:
# Input for the outlier detector: only the int/float columns of the training frame.
train_df_numeric = train_df.select_dtypes(include=[int, float])

# Two-step preparation: replace NaN with the column mean, then rescale every
# column to the [0, 1] range.
# NOTE(review): the name `pipeline` is re-bound to a different Pipeline further
# down in this notebook.
numeric_prep_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", MinMaxScaler()),
]
pipeline = Pipeline(numeric_prep_steps)

# Fit the pipeline on the numeric columns and transform them in the same call.
data_processed = pipeline.fit_transform(train_df_numeric)

# DataFrame view of the processed array, labelled with the original column names.
data_processed_df = pd.DataFrame(data=data_processed, columns=train_df_numeric.columns)


In [None]:
# The autoencoder's input and output width equals the number of processed columns.
input_dim = data_processed.shape[1]
print(f'nombre de variables : {input_dim}')

# Symmetric tanh stack (32 -> 16 -> 12 -> 16 -> 32); the 12-unit layer is the
# compression bottleneck. The sigmoid output layer has one unit per input column
# so the reconstruction has the same dimension as the input.
autoencoder = Sequential()
autoencoder.add(Dense(32, activation="tanh", input_shape=(input_dim,)))
for units in (16, 12, 16, 32):
    autoencoder.add(Dense(units, activation="tanh"))
autoencoder.add(Dense(input_dim, activation="sigmoid"))

# Compile with the Adam optimizer and a mean-squared-error loss.
autoencoder.compile(optimizer="adam", loss='mse')

autoencoder.summary()

In [None]:
# Train the autoencoder to reproduce its own input: the same array is passed as
# both input and target.
# NOTE(review): no random seed is set in the visible cells, so the fitted weights
# (and therefore the rows later flagged as outliers) may differ between runs.
history = autoencoder.fit(
    x=data_processed,
    y=data_processed,
    batch_size=32,
    epochs=30,
)

In [None]:
# Reconstruct every row with the trained autoencoder
predictions = autoencoder.predict(data_processed)

# Per-row reconstruction error: mean of the squared differences across columns
reconstruction_error = ((data_processed - predictions) ** 2).mean(axis=1)

In [None]:
# Outlier cut-off: mean reconstruction error plus `threshold_multiplier`
# standard deviations
threshold_multiplier = 1
threshold = reconstruction_error.mean() + reconstruction_error.std() * threshold_multiplier
threshold

In [None]:
# Histogram of the reconstruction errors, with the cut-off drawn as a dashed red line
fig, ax = plt.subplots()
ax.hist(reconstruction_error, bins=50)
ax.axvline(x=threshold, color='red', linestyle='--')
ax.set_xlabel('Reconstruction Error')
ax.set_ylabel('Number of samples')
plt.show()

# Number of rows whose error exceeds the cut-off
print(f'Le nombre de valeurs aberrantes est de {np.sum(reconstruction_error > threshold)}')

In [None]:
# Boolean mask: True where a row's reconstruction error exceeds the cut-off
outliers = reconstruction_error > threshold

# Rows of the original training frame that were not flagged
train_df_no_outliers = train_df.loc[~outliers]
train_df_no_outliers.shape[0]

In [None]:
# Ids of training rows to exclude.
# NOTE(review): the origin of this list is not documented here (the `_video`
# suffix suggests an external source) — confirm. A few Ids appear twice, which
# has no effect on the `isin` filter below.
values = [
    598, 955, 935, 1299, 250, 314, 336, 707,
    379, 1183, 692, 186, 441, 186, 524, 739,
    598, 955, 636, 1062, 1191, 496, 198, 1338,
]

# Keep every row whose Id is not in the exclusion list
train_df_video = train_df[~train_df.Id.isin(values)]
len(train_df_video)

In [None]:
# Intersection of the two filtered frames: rows kept by BOTH the autoencoder
# filter (`train_df_no_outliers`) and the Id exclusion list (`train_df_video`).
#
# Both frames are row subsets of `train_df`, so the intersection is taken on the
# `Id` key rather than with an inner merge on every shared column (which would
# compare all columns, floats and NaN included, just to match rows).
# Assumes `Id` identifies a row uniquely — TODO confirm.
# The index is reset so the result is a fresh 0..n-1 indexed frame.
train_df_target = (
    train_df_no_outliers[train_df_no_outliers['Id'].isin(train_df_video['Id'])]
    .reset_index(drop=True)
)
len(train_df_target)

# Nous allons regarder les colonnes qui ont des valeurs manquantes

In [None]:
# 20 columns with the most missing values (the count is shown in column 0)
train_df_target.isnull().sum().to_frame().sort_values(by=0, ascending=False).head(20)

In [None]:
# Columns removed from both the training and the test frame:
#   PoolQC      -> pool quality
#   MiscFeature -> miscellaneous feature not covered in other categories
#   Alley       -> type of alley access to property
#   Fence       -> fence quality
dropped_columns = ['PoolQC', 'MiscFeature', 'Alley', 'Fence']
train_df_target = train_df_target.drop(columns=dropped_columns)
test_df = test_df.drop(columns=dropped_columns)

# MasVnrType -> masonry veneer type.
# Missing values get an explicit 'Unknown' category so it shows up in the box
# plot. The filled column is assigned back instead of calling
# `fillna(..., inplace=True)` on the selected column: that chained in-place form
# is deprecated in pandas and does not update the frame under Copy-on-Write.
train_df_target['MasVnrType'] = train_df_target['MasVnrType'].fillna('Unknown')
sns.catplot(data=train_df_target, x="MasVnrType", y="SalePrice", kind="box")

In [None]:
# Author's observation from the box plot: 'BrkCmn' and 'Unknown' have similar
# sale prices, so 'Unknown' is folded into 'BrkCmn' in the training frame and
# missing test values are filled with 'BrkCmn' directly.
# The filled column is assigned back rather than using chained
# `fillna(..., inplace=True)`, which does not update the frame under pandas
# Copy-on-Write.
train_df_target['MasVnrType'] = train_df_target['MasVnrType'].replace('Unknown', 'BrkCmn')
test_df['MasVnrType'] = test_df['MasVnrType'].fillna('BrkCmn')

In [None]:
# FireplaceQu -> fireplace quality.
# Missing values get an explicit 'Unknown' category (assigned back rather than
# chained `fillna(..., inplace=True)`, which does not update the frame under
# pandas Copy-on-Write).
train_df_target['FireplaceQu'] = train_df_target['FireplaceQu'].fillna('Unknown')

# Plot the frame that was just filled, so the 'Unknown' category is visible
# next to the real quality levels (the unfiltered `train_df` has no such category).
sns.catplot(data=train_df_target, x="FireplaceQu", y="SalePrice", kind="box")

In [None]:
# All fills below assign the result back to the column. The chained
# `df[col].fillna(..., inplace=True)` form is deprecated in pandas and does not
# update the frame under Copy-on-Write.

# FireplaceQu: fold the 'Unknown' placeholder into 'Po' in the training frame and
# fill missing test values with 'Po'.
train_df_target['FireplaceQu'] = train_df_target['FireplaceQu'].replace('Unknown', 'Po')
test_df['FireplaceQu'] = test_df['FireplaceQu'].fillna('Po')

# LotFrontage -> linear feet of street connected to property.
# NaN is replaced by the median of the training frame; the same training median
# is applied to the test frame so both frames are treated consistently.
lot_frontage_median = train_df_target['LotFrontage'].median()
train_df_target['LotFrontage'] = train_df_target['LotFrontage'].fillna(lot_frontage_median)
test_df['LotFrontage'] = test_df['LotFrontage'].fillna(lot_frontage_median)

# GarageYrBlt -> year garage was built: print its correlation with the target,
# then drop it from both frames.
print(f'Correlation between GarageYrBlt and SalePrice: {train_df_target["GarageYrBlt"].corr(train_df_target["SalePrice"])}')
train_df_target = train_df_target.drop(columns='GarageYrBlt')
test_df = test_df.drop(columns='GarageYrBlt')

# GarageCond -> garage condition: dropped from both frames.
train_df_target = train_df_target.drop(columns='GarageCond')
test_df = test_df.drop(columns='GarageCond')

# Remaining garage columns: fill missing values with a fixed category per column.
#   GarageType   -> garage location               -> 'Unknown'
#   GarageFinish -> interior finish of the garage -> 'Unf'
#   GarageQual   -> garage quality                -> 'TA'
garage_fill_values = {'GarageType': 'Unknown', 'GarageFinish': 'Unf', 'GarageQual': 'TA'}
for garage_column, fill_value in garage_fill_values.items():
    train_df_target[garage_column] = train_df_target[garage_column].fillna(fill_value)
    test_df[garage_column] = test_df[garage_column].fillna(fill_value)

In [None]:
# BsmtFinType2: dropped from both frames.
train_df_target = train_df_target.drop(columns='BsmtFinType2')
test_df = test_df.drop(columns='BsmtFinType2')

# BsmtExposure, BsmtQual, BsmtCond: missing values become an explicit
# 'NoBasement' category in both frames. Results are assigned back rather than
# using chained `fillna(..., inplace=True)`, which does not update the frame
# under pandas Copy-on-Write.
for basement_column in ('BsmtExposure', 'BsmtQual', 'BsmtCond'):
    train_df_target[basement_column] = train_df_target[basement_column].fillna('NoBasement')
    test_df[basement_column] = test_df[basement_column].fillna('NoBasement')

# BsmtFinType1: fill the training frame only, then inspect how the
# 'NoBasement' group compares with the other categories.
train_df_target['BsmtFinType1'] = train_df_target['BsmtFinType1'].fillna('NoBasement')
sns.catplot(data=train_df_target, x="BsmtFinType1", y="SalePrice", kind="box")

In [None]:
# Results are assigned back rather than using chained
# `fillna(..., inplace=True)`, which is deprecated in pandas and does not update
# the frame under Copy-on-Write.

# BsmtFinType1: fold 'NoBasement' into 'Unf' in the training frame; missing test
# values are filled with 'Unf' directly.
train_df_target['BsmtFinType1'] = train_df_target['BsmtFinType1'].replace('NoBasement', 'Unf')
test_df['BsmtFinType1'] = test_df['BsmtFinType1'].fillna('Unf')

# MasVnrArea: missing values become 0 in both frames.
train_df_target['MasVnrArea'] = train_df_target['MasVnrArea'].fillna(0)
test_df['MasVnrArea'] = test_df['MasVnrArea'].fillna(0)

# Electrical: missing values become 'SBrkr' in both frames.
train_df_target['Electrical'] = train_df_target['Electrical'].fillna('SBrkr')
test_df['Electrical'] = test_df['Electrical'].fillna('SBrkr')

In [None]:
# Remove the GarageArea column from both the training and the test frame
train_df_target = train_df_target.drop('GarageArea', axis=1)
test_df = test_df.drop('GarageArea', axis=1)

In [None]:
# Author's note: modelling works better on the log of the target, so SalePrice
# is replaced by log(1 + SalePrice).
# NOTE(review): this overwrites the column it reads, so running the cell a second
# time applies the logarithm twice.
train_df_target = train_df_target.assign(
    SalePrice=np.log1p(train_df_target['SalePrice'])
)

In [None]:
# Distribution of the (log-transformed) target
sns.histplot(data=train_df_target, x='SalePrice')

In [None]:
# Columns routed to the ordinal encoder.
# The order of the names is kept as-is because it determines the order of the
# encoded output columns.
ordinal_columns = [
    'LotShape', 'LandContour', 'Utilities', 'LandSlope',
    'BsmtQual', 'BsmtFinType1', 'CentralAir', 'Functional',
    'FireplaceQu', 'GarageFinish', 'GarageQual', 'PavedDrive',
    'ExterCond', 'KitchenQual', 'BsmtExposure', 'HeatingQC',
    'ExterQual', 'BsmtCond',
]

# Columns routed to the one-hot encoder (order kept as-is for the same reason).
categorical_columns = [
    'Street', 'LotConfig', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'Exterior1st', 'Exterior2nd',
    'MasVnrType', 'Foundation', 'Electrical', 'SaleType', 'MSZoning',
    'SaleCondition', 'Heating', 'GarageType', 'RoofMatl',
]

# Every numeric column of the cleaned training frame, minus the target.
# NOTE(review): `Id` is presumably numeric and therefore stays in this list —
# confirm whether it should be treated as a feature.
numerical_columns = list(train_df_target.select_dtypes(include=[np.number]).columns)
numerical_columns.remove('SalePrice')

In [None]:
# Numeric columns: fill missing values with the column mean, then standardise
# (zero mean, unit standard deviation).
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Nominal columns: fill missing values with the most frequent category, then
# one-hot encode; categories unseen at fit time are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Ordinal columns: fill missing values with the most frequent category, then
# map categories to integer codes; categories unseen at fit time are encoded as -1.
# NOTE(review): no explicit `categories` order is given, so the integer codes
# follow the encoder's default ordering rather than a quality scale — confirm
# this is intended.
ordinal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

In [None]:
# Route each column group to its transformer.
column_routes = [
    ('num', numeric_transformer, numerical_columns),
    ('cat', categorical_transformer, categorical_columns),
    ('ord', ordinal_transformer, ordinal_columns),
]

preprocessor = ColumnTransformer(
    transformers=column_routes,
    # Columns not listed in any group are kept unchanged in the output
    # (they are passed through, not discarded).
    remainder='passthrough',
    # Run the transformers in parallel on all available CPU cores.
    n_jobs=-1,
)

In [None]:
# Single-step pipeline wrapping the column preprocessor.
# This re-binds the name `pipeline`, which earlier in the notebook referred to
# the impute + MinMax pipeline used for the autoencoder input.
pipeline = Pipeline([('preprocessor', preprocessor)])

In [None]:
# Target: the SalePrice column of the cleaned training frame.
y = train_df_target['SalePrice']

# Features: everything except the target and the `Id` row identifier.
# The previous `X.drop(columns='Id')` discarded its result, so `Id` silently
# stayed in the feature matrix; dropping it in the assignment actually removes it.
X = train_df_target.drop(columns=['SalePrice', 'Id'])

# `numerical_columns` is the same list object that was handed to the
# ColumnTransformer, and the transformer looks its columns up when it is fitted.
# `Id` must therefore leave that list too, otherwise fitting would ask for a
# column that X no longer has.
if 'Id' in numerical_columns:
    numerical_columns.remove('Id')

X.head()

In [None]:
# Fit the preprocessing pipeline on the full feature frame X and transform it
# in the same call.
X_preprocessed = pipeline.fit_transform(X)

In [None]:
# Hold out 20% of the rows for evaluation.
#
# The raw rows are split FIRST and the preprocessing pipeline is then fitted on
# the training part only. Fitting the imputers, scaler and encoders on all rows
# before splitting lets statistics from the held-out rows leak into training and
# makes the test score optimistic.
# The split depends only on the number of rows and `random_state`, so the same
# rows land in train/test as when splitting the already-transformed matrix.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=25
)
X_train = pipeline.fit_transform(X_train_raw)
X_test = pipeline.transform(X_test_raw)

# Data Science

In [None]:
# Models
from sklearn.linear_model import (
    LinearRegression, 
    Ridge
)

from sklearn.ensemble import (
    RandomForestRegressor, 
    GradientBoostingRegressor, 
    VotingRegressor, 
    StackingRegressor
)

from xgboost import XGBRegressor

# Metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

### Linear Regression

In [None]:
# Baseline: ordinary least squares fitted on the training split, scored by
# mean squared error on the held-out split.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)
print(f'Linear Regression: {mean_squared_error(y_test, y_pred)}')

In [None]:
# Random forest tuned by exhaustive grid search (5-fold CV, negative-MSE scoring,
# all CPU cores).
# A fixed `random_state` (same value as the train/test split) makes the forest,
# and therefore the selected hyper-parameters and scores, reproducible between runs.
rfr = RandomForestRegressor(random_state=25)

# 3 x 3 x 3 = 27 candidate settings
rfr_param_grid = {
    'max_depth': [5, 10, 15],
    'n_estimators': [100, 250, 500],
    'min_samples_split': [3, 5, 10]
}

rfr_grid = GridSearchCV(rfr, rfr_param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
rfr_grid.fit(X_train, y_train)

In [None]:
# Best cross-validated RMSE: the grid score is a negated MSE, so flip the sign
# before taking the square root.
np.sqrt(-rfr_grid.best_score_)

In [None]:
# Hyper-parameter combination that achieved the best cross-validated score
rfr_grid.best_params_

### XGBoost Regression

In [None]:
# XGBoost regressor tuned by exhaustive grid search (3 x 2 x 3 = 18 settings).
xgb = XGBRegressor()
xgb_param_grid = dict(
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5],
    n_estimators=[100, 250, 500],
)

# 5-fold CV, negative-MSE scoring, all CPU cores
xgb_grid = GridSearchCV(
    estimator=xgb,
    param_grid=xgb_param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

xgb_grid.fit(X_train, y_train)

In [None]:
# Best cross-validated RMSE: the grid score is a negated MSE, so flip the sign
# before taking the square root.
np.sqrt(-xgb_grid.best_score_)

### Ridge Regression

In [None]:
# Ridge regression tuned by exhaustive grid search over the regularisation
# strength and the solver (5 x 7 = 35 settings).
# The 'sag' and 'saga' solvers shuffle the data, so a fixed `random_state`
# (same value as the train/test split) keeps their results reproducible;
# the other solvers ignore it.
ridge = Ridge(random_state=25)
ridge_param_grid = {
    'alpha': [0.05, 0.1, 1, 3, 5],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}

# 5-fold CV, negative-MSE scoring, all CPU cores
ridge_grid = GridSearchCV(
    ridge,
    ridge_param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

ridge_grid.fit(X_train, y_train)

In [None]:
# Best cross-validated RMSE: the grid score is a negated MSE, so flip the sign
# before taking the square root.
np.sqrt(-ridge_grid.best_score_)

### Gradient Boosting Regressor

In [None]:
# Gradient boosting tuned by exhaustive grid search.
# With `max_features` below 1.0 each split considers a random subset of the
# features, so a fixed `random_state` (same value as the train/test split) is
# needed for reproducible fits and a reproducible best setting.
gdr = GradientBoostingRegressor(random_state=25)

# 3 x 3 x 4 x 3 x 3 = 324 candidate settings, each fitted 5 times by the CV below
gdr_param_grid = {
    'learning_rate': [0.01, 0.1, 0.001],
    'max_depth': [10,15,20],
    'n_estimators': [100, 250, 500, 1000],
    'min_samples_split': [10, 25, 50],
    'max_features': [0.01, 0.1, 0.5]
}

# 5-fold CV, negative-MSE scoring, all CPU cores
gdr_grid = GridSearchCV(
    gdr,
    gdr_param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

gdr_grid.fit(X_train, y_train)

### Voting ensemble

In [None]:
# Ensemble of the four tuned models: each entry is the best estimator found by
# the corresponding grid search.
tuned_models = [
    ('rfr', rfr_grid.best_estimator_),
    ('xgb', xgb_grid.best_estimator_),
    ('ridge', ridge_grid.best_estimator_),
    ('gdr', gdr_grid.best_estimator_),
]
vr = VotingRegressor(estimators=tuned_models)

# Fit the ensemble on the training split
vr.fit(X_train, y_train)

In [None]:
# Predictions of the voting ensemble on the held-out split
y_pred_vr = vr.predict(X_test)

In [None]:
# Held-out metrics for the voting ensemble: mean squared error, then R².
vr_mse = mean_squared_error(y_test, y_pred_vr)
vr_r2 = r2_score(y_test, y_pred_vr)

print("__Erreur quadratique moyenne__", vr_mse, sep="\n")
print("__Coefficient de détermination__", vr_r2, sep="\n")