* [1.Load Data From CSV File](#ld)
* [2.Build a pipeline](#bp)
* [3.Optimisation du modèle](#om)


In [1]:
#pip install category-encoders

In [2]:
import pandas as pd
import numpy as np

import missingno as msno
import seaborn as sns

import matplotlib.pyplot as plt 
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import OneHotEncoder,StandardScaler

from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import metrics

### Load Data From CSV File<a class="anchor" id="ld"></a>

In [3]:
df = pd.read_csv('prepared_data.csv', sep='\t')
df.head()

Unnamed: 0,OSEBuildingID,groups_building,groups_type,NumberofBuildings,NumberofFloors,ENERGYSTARScore,SteamUse(kBtu),Electricity(kBtu),NaturalGas(kBtu),groups_ComplianceStatus,tranche_date,BuildingAge,SiteEnergyUse(kBtu),TotalGHGEmissions
0,1,NonResidential,Various,1.0,12,60.0,2003882.0,3946027.0,1276453.0,Compliant,tranche_date1,89,7226362.5,249.98
1,2,NonResidential,Various,1.0,11,61.0,0.0,3242851.0,5145082.0,Compliant,tranche_date2,20,8387933.0,295.86
2,3,NonResidential,Various,1.0,41,43.0,21566554.0,49526664.0,1493800.0,Compliant,tranche_date2,47,72587024.0,2089.28
3,5,NonResidential,Various,1.0,10,56.0,2214446.25,2768924.0,1811213.0,Compliant,tranche_date1,90,6794584.0,286.43
4,8,NonResidential,Various,1.0,18,75.0,0.0,5368607.0,8803998.0,Compliant,tranche_date2,36,14172606.0,505.01


In [4]:
df.shape

(1533, 14)

In [5]:
df.columns

Index(['OSEBuildingID', 'groups_building', 'groups_type', 'NumberofBuildings',
       'NumberofFloors', 'ENERGYSTARScore', 'SteamUse(kBtu)',
       'Electricity(kBtu)', 'NaturalGas(kBtu)', 'groups_ComplianceStatus',
       'tranche_date', 'BuildingAge', 'SiteEnergyUse(kBtu)',
       'TotalGHGEmissions'],
      dtype='object')

In [6]:
df=df[['groups_building', 'groups_type', 'NumberofBuildings',
       'NumberofFloors', 'ENERGYSTARScore', 'SteamUse(kBtu)',
       'Electricity(kBtu)', 'NaturalGas(kBtu)', 'groups_ComplianceStatus',
       'tranche_date', 'BuildingAge', 
       'TotalGHGEmissions']]

In [7]:
numercial_cols = [col for col in df.columns if df[col].dtype!="O"]
print(numercial_cols)

['NumberofBuildings', 'NumberofFloors', 'ENERGYSTARScore', 'SteamUse(kBtu)', 'Electricity(kBtu)', 'NaturalGas(kBtu)', 'BuildingAge', 'TotalGHGEmissions']


In [8]:
categorical_cols = [col for col in df.columns if df[col].dtype=="O"]
print(categorical_cols)

['groups_building', 'groups_type', 'groups_ComplianceStatus', 'tranche_date']


In [9]:
df['groups_building'].unique()

array(['NonResidential', 'Other'], dtype=object)

In [10]:
df['groups_ComplianceStatus'].unique()

array(['Compliant', 'Other'], dtype=object)

In [11]:
df['groups_type'].unique()

array(['Various', 'Commerce', 'Health'], dtype=object)

In [12]:
df['tranche_date'].unique()

array(['tranche_date1', 'tranche_date2', 'tranche_date3'], dtype=object)

In [13]:
df.shape

(1533, 12)

**Note**
<br>
en se basant sur lamatrice de corrélation on va enlever dans la suite les variables:
'Electricity(kBtu)','SiteEnergyUse(kBtu)','NaturalGas(kBtu)'

In [14]:
numercial_cols= list(set(numercial_cols) - set([ 'Electricity(kBtu)','SiteEnergyUse(kBtu)','NaturalGas(kBtu)']))

In [15]:
numercial_cols_for_transformer=list(set(numercial_cols)-set(['TotalGHGEmissions']))

In [16]:
retained_columns=categorical_cols+numercial_cols

In [17]:
df1=df[retained_columns]

In [28]:
numercial_cols_for_transformer

['NumberofFloors',
 'NumberofBuildings',
 'ENERGYSTARScore',
 'BuildingAge',
 'SteamUse(kBtu)']

## Build a pipeline<a class="anchor" id="bp"></a>

In [19]:

numeric_transformer = Pipeline(
    steps=[("scaler", StandardScaler())]
)

categorical_transformer = OneHotEncoder(handle_unknown="ignore")
 #when using sklearn.compose.ColumnTransformer You need to make sure the output column is not included in the data you create your Preprocessing pipeline with

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numercial_cols_for_transformer),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

In [20]:

clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LinearRegression())]
)


In [21]:

y=df1[['TotalGHGEmissions']]
y= StandardScaler().fit_transform(y)
features=['groups_building', 'groups_type', 'groups_ComplianceStatus',
       'tranche_date', 'SteamUse(kBtu)', 'NumberofBuildings', 'NumberofFloors',
       'ENERGYSTARScore', 'BuildingAge']
X=df1[features]

In [23]:
df1.columns

Index(['groups_building', 'groups_type', 'groups_ComplianceStatus',
       'tranche_date', 'TotalGHGEmissions', 'NumberofFloors',
       'NumberofBuildings', 'ENERGYSTARScore', 'BuildingAge',
       'SteamUse(kBtu)'],
      dtype='object')

In [29]:
X.columns

Index(['groups_building', 'groups_type', 'groups_ComplianceStatus',
       'tranche_date', 'SteamUse(kBtu)', 'NumberofBuildings', 'NumberofFloors',
       'ENERGYSTARScore', 'BuildingAge'],
      dtype='object')

In [31]:
y

array([[ 0.13581128],
       [ 0.21029077],
       [ 3.12164693],
       ...,
       [ 0.09288979],
       [-0.23410248],
       [-0.20299901]])

In [24]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [25]:
X_train.columns

Index(['groups_building', 'groups_type', 'groups_ComplianceStatus',
       'tranche_date', 'SteamUse(kBtu)', 'NumberofBuildings', 'NumberofFloors',
       'ENERGYSTARScore', 'BuildingAge'],
      dtype='object')

In [26]:
clf

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['NumberofFloors',
                                                   'NumberofBuildings',
                                                   'ENERGYSTARScore',
                                                   'BuildingAge',
                                                   'SteamUse(kBtu)']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['groups_building',
                                                   'groups_type',
                                                   'groups_ComplianceStatus',
                                    

### Accurancy

In [27]:
train_score = clf.score(X_train,y_train)
test_score = clf.score(X_test,y_test)

print(f'Train Accuracy : {train_score:.3f}')
print(f'Test Accuracy : {test_score:.3f}')

NotFittedError: This ColumnTransformer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
#Check RMSE

from sklearn.metrics import mean_squared_error
y_pred = clf.predict(X_test)
print(f'RMSE : {np.sqrt(mean_squared_error(y_test, y_pred)):.3f}')

### Paramètres du modèle

In [None]:

clf.named_steps['classifier'].intercept_

In [None]:
clf.named_steps['classifier'].coef_

## Optimisation du modèle <a class="anchor" id="om"></a>

#### cross validation

In [None]:
from sklearn.model_selection import cross_val_score
import sklearn
scores = cross_val_score(clf, X, y, cv=10,scoring='r2')#coefficient of determination (pr défaut). how well the regression model fits the observed data.a higher coefficient indicates a better fit for the model.
scores

In [None]:
print(" mean of  coefficient of determination= %0.2f" % scores.mean())

In [None]:
# can tune other metrics, such as MSE
scores = cross_val_score(clf, X, y, scoring='neg_mean_squared_error', cv=10)# Thus a big neg_mean_squared_error is better than a low one.
scores


In [None]:
print(" mean of  neg_mean_squared_error= %0.2f" % scores.mean())

#### ajustement des paramètres en utilisant gridsearch

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
#LinearRegression().get_params().keys()
param_grid = {
    
    "classifier__fit_intercept": [True, False],
    "classifier__copy_X": [True, False],
    "classifier": [LinearRegression()]
}

grid_search = GridSearchCV(clf, param_grid, cv=10)
grid_search

In [None]:
# cv results
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results

In [None]:
LinearRegression().get_params().keys()

##### 