# BUILDING A PIPELINE

### loading the data

In [38]:
import pandas as pd

#____loading data____
df = pd.read_csv('./data/clean_house.csv')
# df = pd.read_csv('./data/clean_app.csv')

### defining target and features

In [39]:
# _____defining target and features____
target_column_to_drop = ['price']
X = df.drop(columns=target_column_to_drop, axis=1)
y = df['price']

### Preprocessing

In [40]:
# _____Define the columns you want to drop______
columns_to_drop = ['property_id', 'latitude', 'longitude', 'property_type', 'type_of_sale', 'fully_equipped_kitchen', 'locality_name', 'main_city']
X = X.drop(columns=columns_to_drop, axis=1)

In [15]:
# for app only

app_columns_to_drop = ['surface_of_good']

# X_train = X_train.drop(columns=app_columns_to_drop, axis=1)
# y_train = y_train.drop(columns=app_columns_to_drop, axis=1)

X = X.drop(columns=app_columns_to_drop, axis=1)

### Handling missing values

In [41]:
# _____imputing missing values for swimmingpool____
from sklearn.impute import SimpleImputer
import numpy as np

# Define the imputer to replace missing values with 0.0
constant_imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0.0)

# Define the column containing missing values
columns_with_missing_values = ['swimming_pool']

# Impute missing values in X_train
X[columns_with_missing_values] = constant_imputer.fit_transform(X[columns_with_missing_values])

In [42]:
# _____one-hot encoding for kitchen_type____
X = pd.get_dummies(X, columns=["kitchen_type"], prefix="kitchen_type")

# _____one-hot encoding for state_of_building____
X = pd.get_dummies(X, columns=["state_of_building"], prefix="state_of_building")

# _____one-hot encoding for property_subtype____
X = pd.get_dummies(X, columns=["property_subtype"], prefix="property_subtype")

# _____one-hot encoding for province____
X = pd.get_dummies(X, columns=["province"], prefix="province")

### defining X_train, X_test, y_train, y_test

In [43]:
# _____creating training and testing sets____
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [44]:
# _____imputing -1 for missing values for number_of_rooms, terrace_area, garden_area, furnished, garden, terrace, number_of_facades____

constant_imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=-1)

colums_to_impute = ['number_of_rooms', 'terrace_area', 'garden_area', 'furnished', 'garden', 'terrace', 'number_of_facades']


X_train[colums_to_impute] = constant_imputer.fit_transform(X_train[colums_to_impute])
X_test[colums_to_impute] = constant_imputer.fit_transform(X_test[colums_to_impute])

# Fitting and predicting the model

```python	
# latest results for houses
Fitting 3 folds for each of 12 candidates, totalling 36 fits
randomforest choosen max_depth:  20
randomforest choosen n_estimators:  100
randomforest training score:  0.9591154789033177
randomforest testscore:  0.7096580474996514
rmse:  156079.94095185387
cross val score:  0.7267999915259765

# latest results for apartments
Fitting 3 folds for each of 12 candidates, totalling 36 fits
randomforest choosen max_depth:  20
randomforest choosen n_estimators:  200
randomforest training score:  0.9657029558400106
randomforest testscore:  0.7043570191397667
rmse:  191241.85796628642
cross val score:  0.77146716370121
```

In [45]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as MSE
import sklearn.metrics  
import math

# cv =3 for 3fold cross validation

param_grid = {
    'max_depth': [10, 15, 20, 25, 30, 40],
    'n_estimators': [100, 200]}
grid = GridSearchCV(RandomForestRegressor(), param_grid=param_grid, cv=3, verbose=1)
grid.fit(X, y)
dept = grid.best_params_['max_depth']
estimators = grid.best_params_['n_estimators']

randomforest = RandomForestRegressor(n_estimators = estimators, max_depth = dept)

#____train the model____
randomforest.fit(X_train, y_train)

#____show the choosen max_depth____
print("randomforest choosen max_depth: ", dept)

#____show the choosen n_estimators____
print("randomforest choosen n_estimators: ", estimators)

#____show the score____
print("randomforest training score: ", randomforest.score(X_train, y_train))

#____test the model____
y_prediction = randomforest.predict(X_test)

#____show the score____
print("randomforest testscore: ", randomforest.score(X_test, y_test))

mse = MSE(y_test, y_prediction)
rmse = math.sqrt(mse)
print("rmse: ",rmse)

print("cross val score: ", cross_val_score(randomforest, X_train, y_train).mean())

Fitting 3 folds for each of 12 candidates, totalling 36 fits
randomforest choosen max_depth:  20
randomforest choosen n_estimators:  100
randomforest training score:  0.9591154789033177
randomforest testscore:  0.7096580474996514
rmse:  156079.94095185387
cross val score:  0.7267999915259765


In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('regressor', RandomForestRegressor())
])

params = dict(
    regressor__max_depth = [10, 15, 20, 25, 30],
    regressor__n_estimators = [100, 200]
)

grid_search = GridSearchCV(pipe, param_grid=params)
gs = grid_search.fit(X_train, y_train).best_params_