# Importation des modules utiles à la construction de notre modèle de machine learning

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import os

# Lecture de la donnée

In [8]:
data = pd.read_csv('/home/apprenant/Documents/Brief-9-Rachid-Karbiche/data/02_intermediate/cleaned_data.csv')

In [9]:
data.select_dtypes('object')

Unnamed: 0,transactiondate,hashottuborspa,propertycountylandusecode,propertyzoningdesc,fireplaceflag,taxdelinquencyflag
0,2016-01-01,,0100,LARS,,
1,2016-01-01,,1,0,,
2,2016-01-01,,0100,PSR6,,
3,2016-01-02,,010C,LAR3,,
4,2016-01-02,,122,0,,
...,...,...,...,...,...,...
167883,2017-09-20,,010C,LARD3,,
167884,2017-09-20,,0100,LARE40,,
167885,2017-09-21,,1111,0,,
167886,2017-09-21,,0100,SER1*,,


In [10]:
data.shape

(167888, 45)

In [11]:
xgboost_data = data.drop(['transactiondate'], axis=1)
one_hot_encoding_X = pd.get_dummies(xgboost_data)

In [12]:
one_hot_encoding_X.head()

Unnamed: 0,parcelid,logerror,airconditioningtypeid,bathroomcnt,bedroomcnt,buildingqualitytypeid,calculatedbathnbr,calculatedfinishedsquarefeet,finishedsquarefeet12,fips,...,propertyzoningdesc_WVRPD18U*,propertyzoningdesc_WVRPD40000,propertyzoningdesc_WVRPD4OOOO,propertyzoningdesc_WVRPD56*,propertyzoningdesc_WVRR,propertyzoningdesc_WVRR1-RPD1,fireplaceflag_None,fireplaceflag_True,taxdelinquencyflag_None,taxdelinquencyflag_Y
0,11016594,0.0276,1,2.0,3,4,2.0,1684,1684,6037,...,0,0,0,0,0,0,1,0,1,0
1,14366692,-0.1684,0,3.5,4,0,3.5,2263,2263,6059,...,0,0,0,0,0,0,1,0,1,0
2,12098116,-0.004,1,3.0,2,4,3.0,2217,2217,6037,...,0,0,0,0,0,0,1,0,1,0
3,12643413,0.0218,1,2.0,2,4,2.0,839,839,6037,...,0,0,0,0,0,0,1,0,1,0
4,14432541,-0.005,0,2.5,4,0,2.5,2283,2283,6059,...,0,0,0,0,0,0,1,0,1,0


In [13]:
data = data.drop(['transactiondate', 'hashottuborspa', 'propertycountylandusecode', 'propertyzoningdesc', 'fireplaceflag', 'taxdelinquencyflag'], axis=1)

# Premier aperçu d'un modèle : Decision Tree Regressor

In [22]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
model = DecisionTreeRegressor()

## Choix des variables explicatives et de la variable cible

In [57]:
y = data["logerror"].values.astype(np.float32)
X = data.drop(['parcelid', 'logerror', 'transaction_month', 'transaction_year'], axis=1)
train_X, val_X, train_y, val_y = train_test_split(X, y,random_state = 0)

## Entraînement sur nos données d'entraînement

In [10]:
model.fit(train_X, train_y)

DecisionTreeRegressor()

## Prédictions à réaliser

In [18]:
predictions = model.predict(val_X)

NotFittedError: This DecisionTreeRegressor instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [13]:
precision = r2_score(val_y, predictions)
error = mean_absolute_error(val_y, predictions)
print(precision)
print(error)

-1.288696106629561
0.11671748701895412


# Deuxième aperçu d'un modèle : Random Forest Regressor

In [21]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [20]:
forest_model = RandomForestRegressor(n_estimators = 60, random_state=1, max_features = 'sqrt', max_depth = 14, min_samples_split = 5, min_samples_leaf = 2, bootstrap = True, criterion = 'mae')
forest_model.fit(train_X, train_y)
forest_preds = forest_model.predict(val_X)
print(mean_absolute_error(val_y, forest_preds))

0.06862986269569013


In [28]:
def modelizing_GridSearchCV(target_value, evaluated_values, defined_parameters, model_name):
    y = target_value
    X = evaluated_values
    train_X, val_X, train_y, val_y = train_test_split(X, y,random_state = 1)
    parameters = defined_parameters
    model = model_name
    grid = GridSearchCV(model, parameters)
    grid.fit(X, y)
    return grid, train_X, val_X, train_y, val_y

In [17]:
random_forest_grid, train_X, val_X, train_y, val_y = modelizing_GridSearchCV(y, X, {'random_state':[1]}, RandomForestRegressor())

In [18]:
ytrain_pred = random_forest_grid.predict(train_X)
ytest_pred = random_forest_grid.predict(val_X)

In [22]:
print(mean_absolute_error(val_y, ytest_pred))

0.028479520650336595


# Troisième aperçu d'un modèle : XGBoost

In [1]:
from xgboost import XGBRegressor

In [10]:
xgboost_y = xgboost_data["logerror"].values.astype(np.float32)
xgboost_X = one_hot_encoding_X.drop(['parcelid', 'logerror'], axis=1)
X_train, X_val, y_train, y_val = train_test_split(xgboost_X, xgboost_y, random_state = 1)

In [11]:
my_model = XGBRegressor(n_estimators = 1000, learning_rate = 0.05)
my_model.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_val, y_val)], verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.05, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [12]:
predictions = my_model.predict(X_val)
print("Mean Absolute Error: " + str(mean_absolute_error(predictions, y_val)))

Mean Absolute Error: 0.06933563


# Quatrième aperçu d'un modèle : régression linéaire

## 1. Régression linéaire classique

In [27]:
from sklearn import linear_model

def modelizing_linear_regression(target_value, evaluated_values):
    y = target_value
    X = evaluated_values
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, train_size=0.8)
    regr = linear_model.LinearRegression()
    regr.fit(xtrain, ytrain)
    return regr, xtrain, ytrain, xtest, ytest

defined_regr, x_train, y_train, x_test, y_test = modelizing_linear_regression(y, X)

In [18]:
def get_model_metrics(model, X, y, b1=True, b0=True, title=None):
    """
        Separate data in train and test sets,
        fit the model,
        make predictions on train and test datas,
        print metrics

        params:
            model(function): model used with params (ie: Lasso(alpha=x)) 
            X(DataFrame): DataFrame subset with selected features,
            y(Series): variable to predict

        returns: 
                print β1, β0, R2 and RMSE
    """
    # Split des datas
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=1
    )
    model.fit(x_train, y_train)
    if title:
        print(f"{title} : ")
    if b1:
        # Affichage des β1 pour chaque variable
        for idx, name in enumerate(X_train.columns):
            print(f"β1 de {name} : {round(model.coef_[idx], 3)}")
    if b0:
        print(f"β0 (intercept_) : {round(model.intercept_, 3)}\n")

    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    y_list = [y_train, y_train_pred, y_test, y_test_pred]
    get_r2_rmse(y_list)


def get_r2_rmse(y_list):
    """
        calculate R2 and RMSE for each sets (train and test)
        and format output

        param:
            y_list = [
                y_train, 
                y_train_pred, 
                y_test, 
                y_test_pred
            ]
    """

    sets = ["Training", "Testing "]
    i = 0
    for set in sets:
        r2 = round(r2_score(y_list[i], y_list[i+1]), 3)
        rmse = round(mean_squared_error(
            y_list[i], y_list[i+1], squared=False), 3)
        print(
            "{} set : R2 = {}, RMSE = {}".format(set, r2, rmse)
        )
        i += 2
    print("\n")

In [19]:
ytrain_pred = defined_regr.predict(x_train)
ytest_pred = defined_regr.predict(x_test)

In [23]:
print ("Pour les données d'entrainement le R2 vaut {} alors que pour les données de test, il est de {}" 
      .format(round(r2_score(y_train, ytrain_pred),3), round(r2_score(y_test, ytest_pred),3)))

Pour les données d'entrainement le R2 vaut 0.005 alors que pour les données de test, il est de 0.003


In [24]:
print ("Pour les données d'entrainement le RMSE vaut {} alors que pour les données de test, il est de {}" 
      .format(round(mean_squared_error(y_train, ytrain_pred, squared=False),3), 
              round(mean_squared_error(y_test, ytest_pred, squared=False),3)))

Pour les données d'entrainement le RMSE vaut 0.166 alors que pour les données de test, il est de 0.164


In [25]:
print(mean_absolute_error(y_train, ytrain_pred))
print(mean_absolute_error(y_test, ytest_pred))

0.06942737287429818
0.0695109694233321


## 2. Régression Ridge

### GridSearchCV

In [29]:
ridge_grid, train_X, val_X, train_y, val_y = modelizing_GridSearchCV(y, X, {'alpha':np.arange(0, 10, 1).tolist(), 'normalize':[True,False]}, linear_model.Ridge())

  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return li

In [30]:
print(ridge_grid.best_estimator_)

Ridge(alpha=9)


In [32]:
get_model_metrics(linear_model.Ridge(alpha=9), X, y, b1=False, b0=False)

Training set : R2 = 0.006, RMSE = 0.166
Testing  set : R2 = 0.001, RMSE = 0.162




In [33]:
ytest_pred_ridge = ridge_grid.predict(val_X)
print(mean_absolute_error(val_y, ytest_pred_ridge))

0.06918224215241829


### RandomizedSearchCV

In [62]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

def modelizing_RandomizedSearchCV(target_value, evaluated_values, defined_parameters, model_name, n_iter=10, cv=5, random_state=None):
    y = target_value
    x = evaluated_values
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, train_size=0.8)
    parameters = defined_parameters
    model = model_name
    regr = RandomizedSearchCV(model, parameters, n_iter=n_iter, cv=cv, random_state=random_state)
    regr.fit(x, y)
    return regr, xtrain, ytrain, xtest, ytest

In [63]:
ridge_randomized, xtrain, ytrain, xtest, ytest = modelizing_RandomizedSearchCV(y, X, {'alpha':uniform(loc=0, scale=10), 'normalize':[True, False]}, linear_model.Ridge(), random_state=1)

  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,


In [64]:
print(ridge_randomized.best_estimator_)

Ridge(alpha=9.325573593386588)


In [65]:
get_model_metrics(linear_model.Ridge(alpha=9.325573593386588), X, y, b1=False, b0=False)

Training set : R2 = 0.005, RMSE = 0.166
Testing  set : R2 = 0.0, RMSE = 0.162




In [66]:
ytest_pred_ridge_randomized = ridge_randomized.predict(xtest)
print(mean_absolute_error(ytest, ytest_pred_ridge_randomized))

0.06863828428555449


## 3. Régression Lasso

### GridSearchCV

In [39]:
lasso_grid, train_X, val_X, train_y, val_y = modelizing_GridSearchCV(y, X, {'alpha':np.arange(100, 110, 1).tolist(), 'normalize':[True,False]}, linear_model.Lasso())

In [40]:
print(lasso_grid.best_estimator_)

Lasso(alpha=100, normalize=True)


In [41]:
get_model_metrics(linear_model.Ridge(alpha=100), X, y, b1=False, b0=False)

Training set : R2 = 0.006, RMSE = 0.166
Testing  set : R2 = 0.001, RMSE = 0.162


  return linalg.solve(A, Xy, sym_pos=True,


In [42]:
ytest_pred_lasso_grid = lasso_grid.predict(val_X)
print(mean_absolute_error(val_y, ytest_pred_lasso_grid))

0.069164405191441


### RandomizedSearchCV

In [43]:
lasso_randomized, xtrain, ytrain, xtest, ytest = modelizing_RandomizedSearchCV(y, X, {'alpha':uniform(loc=100, scale=300), 'normalize':[True, False]}, linear_model.Lasso(), random_state=1)

In [44]:
print(lasso_randomized.best_estimator_)

Lasso(alpha=225.1066014107722, normalize=True)


In [46]:
get_model_metrics(linear_model.Lasso(alpha=225.1066014107722), X, y, b1=False, b0=False)

Training set : R2 = 0.0, RMSE = 0.167
Testing  set : R2 = -0.0, RMSE = 0.162




In [47]:
ytest_pred_lasso_randomized = lasso_randomized.predict(xtest)
print(mean_absolute_error(ytest, ytest_pred_lasso_randomized))

0.0693746914802827


# Sample submission

In [67]:
df_samples = pd.read_csv('/home/apprenant/Documents/Brief-9-Rachid-Karbiche/data/01_raw/sample_submission.csv')
df_samples.head()

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,0,0,0,0,0,0
1,10759547,0,0,0,0,0,0
2,10843547,0,0,0,0,0,0
3,10859147,0,0,0,0,0,0
4,10879947,0,0,0,0,0,0


In [68]:
prop_2016_df = pd.read_csv("/home/apprenant/Documents/Brief-9-Rachid-Karbiche/data/01_raw/properties_2016.csv", index_col='parcelid', low_memory=False)

  mask |= (ar1 == a)


In [69]:
df_samples['parcelid'] = df_samples['ParcelId']
sub = df_samples.merge(prop_2016_df, on='parcelid', how='left')

In [70]:
X_test = prop_2016_df[X.columns].copy()

In [72]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Select numerical columns
numerical_cols = [cname for cname in train_X.columns if 
                train_X[cname].dtype in ['int64', 'float64']]

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in train_X.columns if
                    train_X[cname].nunique() < 10 and 
                    train_X[cname].dtype == "object"]

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


# Bundle preprocessing and modeling code in a pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', ridge_randomized)
                     ])

# Preprocessing of training data, fit model 
clf.fit(xtrain, ytrain)

  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  SimpleImputer(strategy='constant'),
                                                  ['airconditioningtypeid',
                                                   'bathroomcnt', 'bedroomcnt',
                                                   'buildingqualitytypeid',
                                                   'calculatedbathnbr',
                                                   'calculatedfinishedsquarefeet',
                                                   'finishedsquarefeet12',
                                                   'fips', 'fireplacecnt',
                                                   'fullbathcnt',
                                                   'garagecarcnt',
                                                   'garagetotalsqft',
                                                   'heatingorsystemtypeid',
        

In [73]:
preds_full = clf.predict(X_test)

In [74]:
sub['201610'] = preds_full
sub['201611'] = preds_full
sub['201612'] = preds_full
sub['201710'] = preds_full
sub['201711'] = preds_full
sub['201712'] = preds_full

In [75]:
sub.head()

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712,parcelid,airconditioningtypeid,architecturalstyletypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,10754147,0.014027,0.014027,0.014027,0.014027,0.014027,0.014027,10754147,,,...,,,,9.0,2015.0,9.0,,,,
1,10759547,0.013393,0.013393,0.013393,0.013393,0.013393,0.013393,10759547,,,...,,,,27516.0,2015.0,27516.0,,,,
2,10843547,0.420356,0.420356,0.420356,0.420356,0.420356,0.420356,10843547,,,...,,,650756.0,1413387.0,2015.0,762631.0,20800.37,,,
3,10859147,0.003041,0.003041,0.003041,0.003041,0.003041,0.003041,10859147,,,...,1.0,,571346.0,1156834.0,2015.0,585488.0,14557.57,,,
4,10879947,-0.002567,-0.002567,-0.002567,-0.002567,-0.002567,-0.002567,10879947,,,...,,,193796.0,433491.0,2015.0,239695.0,5725.17,,,


In [76]:
sub = sub[['ParcelId', '201610', '201611', '201612', '201710', '201711', '201712']]
sub.head()

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,0.014027,0.014027,0.014027,0.014027,0.014027,0.014027
1,10759547,0.013393,0.013393,0.013393,0.013393,0.013393,0.013393
2,10843547,0.420356,0.420356,0.420356,0.420356,0.420356,0.420356
3,10859147,0.003041,0.003041,0.003041,0.003041,0.003041,0.003041
4,10879947,-0.002567,-0.002567,-0.002567,-0.002567,-0.002567,-0.002567


In [78]:
sub.to_csv('/home/apprenant/Documents/Brief-9-Rachid-Karbiche/data/02_intermediate/random_forest2.csv', index=False, float_format='%.4f')