# 1) Libraries
---

In [1]:
import plotly.figure_factory as ff
import plotly.express as px
import pandas as pd

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import   StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score

# 2) Import files
---

In [None]:
# Load the dataset from the CSV file and preview its first rows
data = pd.read_csv('dataset_from_kaggle.csv')
data.head()


Unnamed: 0,ID,Square_Feet,Num_Bedrooms,Num_Bathrooms,Num_Floors,Year_Built,Has_Garden,Has_Pool,Garage_Size,Location_Score,Distance_to_Center,Price
0,1,143.63503,1,3,3,1967,1,1,48,8.297631,5.935734,602134.816747
1,2,287.678577,1,2,1,1949,0,1,37,6.061466,10.827392,591425.135386
2,3,232.998485,1,3,2,1923,1,0,14,2.911442,6.904599,464478.69688
3,4,199.664621,5,2,2,1918,0,0,17,2.070949,8.284019,583105.655996
4,5,89.00466,4,3,3,1999,1,0,34,1.523278,14.648277,619879.142523


# 3) EDA
---

In [3]:
# Basic stats: dataset dimensions, summary statistics and missing values
n_rows, n_cols = data.shape
print(f"Number of rows : {n_rows}")
print(f"Number of columns : {n_cols}")

print("Basics statistics: ")
display(data.describe(include='all'))

# isna().mean() is the fraction of missing values per column; it is scaled by
# 100 so the printed figures really are percentages, as the label announces.
print("Percentage of missing values: ")
print(data.isna().mean() * 100)

Number of rows : 500
Number of columns : 12
Basics statistics: 


Unnamed: 0,ID,Square_Feet,Num_Bedrooms,Num_Bathrooms,Num_Floors,Year_Built,Has_Garden,Has_Pool,Garage_Size,Location_Score,Distance_to_Center,Price
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,250.5,174.640428,2.958,1.976,1.964,1957.604,0.536,0.492,30.174,5.16441,10.469641,582209.629529
std,144.481833,74.672102,1.440968,0.820225,0.802491,35.491781,0.499202,0.500437,11.582575,2.853489,5.588197,122273.390345
min,1.0,51.265396,1.0,1.0,1.0,1900.0,0.0,0.0,10.0,0.004428,0.062818,276892.470136
25%,125.75,110.319923,2.0,1.0,1.0,1926.0,0.0,0.0,20.0,2.76065,6.066754,503080.34414
50%,250.5,178.290937,3.0,2.0,2.0,1959.0,1.0,0.0,30.0,5.206518,10.886066,574724.113347
75%,375.25,239.03122,4.0,3.0,3.0,1988.0,1.0,1.0,41.0,7.732933,15.07259,665942.301274
max,500.0,298.241199,5.0,3.0,3.0,2022.0,1.0,1.0,49.0,9.995439,19.927966,960678.274291


Percentage of missing values: 
ID                    0.0
Square_Feet           0.0
Num_Bedrooms          0.0
Num_Bathrooms         0.0
Num_Floors            0.0
Year_Built            0.0
Has_Garden            0.0
Has_Pool              0.0
Garage_Size           0.0
Location_Score        0.0
Distance_to_Center    0.0
Price                 0.0
dtype: float64


In [4]:
# Distribution of each numeric variable: one histogram per column
num_features = ['Square_Feet','Garage_Size','Location_Score','Distance_to_Center', 'Price']
for feature in num_features:
    fig = px.histogram(data[feature])
    fig.show()

In [None]:
# Bar plot of each qualitative variable

qual_features = ['Num_Bedrooms','Num_Bathrooms','Num_Floors','Year_Built', 'Has_Garden', 'Has_Pool']

for feature in qual_features:
    # value_counts() gives the number of rows per distinct value; reset_index()
    # turns it into a two-column frame whose columns are relabelled below.
    # The temporary frame is named `counts` so it does not reuse the name `df`,
    # which the preprocessing cells use for the modelling data.
    counts = pd.DataFrame(data[feature].value_counts()).reset_index()
    counts.columns = [feature, 'Count']
    fig = px.bar(counts, x=feature, y='Count', title=feature)
    fig.show()

In [None]:
# Pairwise correlations between all columns, rounded to 2 decimals,
# rendered as an annotated heatmap
corr_matrix = data.corr().round(2)
axis_x = corr_matrix.columns.tolist()
axis_y = corr_matrix.index.tolist()

fig = ff.create_annotated_heatmap(corr_matrix.values, x=axis_x, y=axis_y)
fig.show()

# 4) Preprocessing
---

In [7]:
#Drop lines containing outliers :
def drop_outlier(dataset, col, n_std=3):
    """Return the rows of ``dataset`` whose ``col`` value is not an outlier.

    A value is an outlier when it lies more than ``n_std`` standard
    deviations above or below the column mean. Mean and standard deviation
    are computed with pandas' ``.mean()`` and ``.std()`` on the column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to filter; it is not modified.
    col : str
        Name of the column tested for outliers.
    n_std : float, default 3
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    pandas.DataFrame
        The rows of ``dataset`` that are not flagged, with their original index.
    """
    values = dataset[col]
    center = values.mean()
    half_width = n_std * values.std()

    too_high = values > center + half_width
    too_low = values < center - half_width

    # Rows are removed only when explicitly flagged, so a missing value
    # (which compares False on both sides) is kept.
    return dataset[~(too_high | too_low)]

Columns_to_clean = ['Price']

# Start from the full dataset and filter cumulatively: each pass works on the
# result of the previous one, so every listed column is cleaned (not only the
# last one), and `df` is always defined even if the list is empty.
df = data
for col in Columns_to_clean:
    df = drop_outlier(df, col)

In [8]:
# Separate the target from the explanatory variables ('ID' is dropped too)
target_variable = "Price"
y = df[target_variable]
X = df.drop(columns=[target_variable, 'ID'])

my_features_list = list(X.columns)  # for later


# Split: 20% of the rows are held out for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


# Numeric/categorical columns: every feature not listed as categorical is numeric
categorical_features = []
numeric_features = [name for name in my_features_list if name not in categorical_features]


# Transformer: standardize the numeric columns
numerical_transformer = Pipeline(steps=[('scaler', StandardScaler())])

preprocessor = ColumnTransformer(
    transformers=[('num', numerical_transformer, numeric_features)]
)

# Preprocess: fit on the training split only, then apply to the test split
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# 5) Baseline model (linear regression)
---

In [9]:
# Accumulates one 'Train' and one 'Test' row per evaluated model
scoring_df = pd.DataFrame()


def log_model_score(df, model_name, R2_train, R2_test):
    """Return ``df`` with two rows appended for ``model_name``.

    One row holds the train score and one the test score, in the columns
    'model_name', 'series' and 'r2_score'. ``df`` itself is not modified and
    the result is re-indexed from 0.
    """
    series_labels = ['Train', 'Test']
    new_rows = pd.DataFrame({
        'model_name': [model_name] * len(series_labels),
        'series': series_labels,
        'r2_score': [R2_train, R2_test],
    })
    return pd.concat([df, new_rows], ignore_index=True)


In [10]:
# Fit the baseline model on the preprocessed training data
regressor = LinearRegression().fit(X_train, y_train)


# Predictions on both splits
y_train_pred, y_test_pred = regressor.predict(X_train), regressor.predict(X_test)

# Print and save R2 scores
R2_train, R2_test = r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)
print("R2 score on training set : ", R2_train)
print("R2 score on test set : ", R2_test)

model_name = "Linear Regression"
scoring_df = log_model_score(scoring_df, model_name, R2_train, R2_test)


R2 score on training set :  0.9762006396979073
R2 score on test set :  0.9713328388699407


In [11]:
# Collect the feature names in the order the preprocessor outputs them
column_names = []
for name, pipeline, features_list in preprocessor.transformers_:
    if isinstance(pipeline, str) and pipeline == 'drop':
        # A dropped entry (e.g. the 'remainder' tuple scikit-learn appends for
        # unassigned columns) produces no output column and is not a Pipeline,
        # so it has no 'encoder' step to query.
        continue
    if name == 'num':
        # Scaling keeps one output column per input column, in the same order
        features = features_list
    else:
        features = pipeline.named_steps['encoder'].get_feature_names_out()
    column_names.extend(features)

# One row per feature; the coefficients come from the `regressor` fitted just before
coefs = pd.DataFrame(index = column_names, data = regressor.coef_.transpose(), columns=["coefficients"])
# Rank features by coefficient magnitude (sign is discarded)
feature_importance = abs(coefs).sort_values(by = 'coefficients')


# Plot coefficients
fig = px.bar(feature_importance, orientation = 'h')
fig.update_layout(showlegend = False,
                  margin = {'l': 120}  # left margin widened to 120
                 )
fig.show()

# 6) Ridge & Lasso
---

In [12]:
# Ridge with its default hyperparameters
regressor = Ridge().fit(X_train, y_train)

# Print and save R2 scores (score() returns R2 for regressors)
R2_train, R2_test = regressor.score(X_train, y_train), regressor.score(X_test, y_test)
print("R2 score on training set : ", R2_train)
print("R2 score on test set : ", R2_test)

model_name = "Ridge"
scoring_df = log_model_score(scoring_df, model_name, R2_train, R2_test)


R2 score on training set :  0.9761931103461313
R2 score on test set :  0.9712738486387501


In [13]:
# Lasso with its default hyperparameters
regressor = Lasso().fit(X_train, y_train)

# Print and save R2 scores (score() returns R2 for regressors)
R2_train, R2_test = regressor.score(X_train, y_train), regressor.score(X_test, y_test)
print("R2 score on training set : ", R2_train)
print("R2 score on test set : ", R2_test)

model_name = "Lasso"
scoring_df = log_model_score(scoring_df, model_name, R2_train, R2_test)

R2 score on training set :  0.9762006388681125
R2 score on test set :  0.9713317552439752


**==> With their default settings, Ridge and Lasso give essentially the same results as the linear regression in this case**

# 7) GridSearch
---

In [14]:
# Ridge with gridsearch: tune the regularization strength alpha by cross-validation

regressor = Ridge()
# Candidate alphas (alpha = 0 would correspond to no regularization).
# NOTE(review): the best alpha reported for this run (0.01) is the smallest
# value of the grid - consider extending the grid towards smaller values.
params = {'alpha': [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100]}
gridsearch_ridge = GridSearchCV(estimator=regressor, param_grid=params, cv=3)  # cv: number of folds
gridsearch_ridge.fit(X_train, y_train)

print("Best hyperparameters : ", gridsearch_ridge.best_params_)
print("Best R2 score : ", gridsearch_ridge.best_score_)

# Print and save R2 scores
R2_train, R2_test = gridsearch_ridge.score(X_train, y_train), gridsearch_ridge.score(X_test, y_test)
print("R2 score on training set : ", R2_train)
print("R2 score on test set : ", R2_test)

model_name = "Ridge with gridsearch"
scoring_df = log_model_score(scoring_df, model_name, R2_train, R2_test)

Best hyperparameters :  {'alpha': 0.01}
Best R2 score :  0.9737930165057401
R2 score on training set :  0.9762006389407066
R2 score on test set :  0.9713323217447846


In [15]:
# Lasso with gridsearch: tune the regularization strength alpha by cross-validation

regressor = Lasso()
# Candidate alphas (alpha = 0 would correspond to no regularization).
# NOTE(review): the best alpha reported for this run (100) is the smallest
# value of the grid - consider extending the grid towards smaller values.
params = {'alpha': [100, 500, 1000, 1500, 2000]}
gridsearch_lasso = GridSearchCV(estimator=regressor, param_grid=params, cv=3)  # cv: number of folds
gridsearch_lasso.fit(X_train, y_train)

print("Best hyperparameters : ", gridsearch_lasso.best_params_)
print("Best R2 score : ", gridsearch_lasso.best_score_)

# Print and save R2 scores
R2_train, R2_test = gridsearch_lasso.score(X_train, y_train), gridsearch_lasso.score(X_test, y_test)
print("R2 score on training set : ", R2_train)
print("R2 score on test set : ", R2_test)

model_name = "Lasso with gridsearch"
scoring_df = log_model_score(scoring_df, model_name, R2_train, R2_test)

Best hyperparameters :  {'alpha': 100}
Best R2 score :  0.9737728453677504
R2 score on training set :  0.9761924093378729
R2 score on test set :  0.97121537977537


In [16]:
# Compare, feature by feature, the coefficients of the best Ridge and the
# best Lasso found by the grid searches
data_dict = {'Feature': column_names}
data_dict['Best_Ridge'] = gridsearch_ridge.best_estimator_.coef_
data_dict['Best_Lasso'] = gridsearch_lasso.best_estimator_.coef_

coefficients_ridge_lasso = pd.DataFrame(data=data_dict)

fig = px.line(
    coefficients_ridge_lasso,
    x='Feature',
    y=['Best_Ridge', 'Best_Lasso'],
)
fig.show()

# 8) Conclusion
---

In [17]:

# Train and test R2 of every logged model, one line per series
fig = px.line(
    scoring_df,
    x='model_name',
    y='r2_score',
    color='series',
    markers=True,
    title="Scores R² pour chaque modèle",
)
fig.show()


**==> We will keep the baseline linear regression as our model: Ridge and Lasso, with or without grid search, give essentially the same results**