<a href="https://colab.research.google.com/github/Mrrohit0806/Capstone-project-01/blob/main/Model_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [160]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler,OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

from xgboost import XGBRegressor

In [161]:
df = pd.read_csv('/content/gurgaon_properties_post_feature_selection.csv')

In [162]:
df.head()

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category,Price_in_Crore
0,0.0,168.0,2.0,2,2.0,4.0,1260.0,0,0,0,1.0,0.0,1.99
1,0.0,221.0,1.0,1,0.0,3.0,484.0,0,0,0,1.0,1.0,0.21
2,0.0,201.0,2.0,2,3.0,0.0,1000.0,0,0,0,1.0,1.0,0.5
3,0.0,221.0,2.0,2,2.0,1.0,623.0,0,0,0,2.0,1.0,0.42
4,0.0,193.0,2.0,2,3.0,4.0,1016.0,0,0,0,2.0,0.0,1.3


In [261]:
df['furnishing_type'].value_counts()

Unnamed: 0_level_0,count
furnishing_type,Unnamed: 1_level_1
unfurnished,2458
furnished,1010
semifurnished,189


In [262]:
# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished
df['furnishing_type'] = df['furnishing_type'].replace({0.0:'unfurnished',1.0:'semifurnished',2.0:'furnished'})

In [263]:
df.head()

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category,Price_in_Crore
0,0.0,168.0,2.0,2,2.0,4.0,1260.0,0,0,unfurnished,1.0,0.0,1.99
1,0.0,221.0,1.0,1,0.0,3.0,484.0,0,0,unfurnished,1.0,1.0,0.21
2,0.0,201.0,2.0,2,3.0,0.0,1000.0,0,0,unfurnished,1.0,1.0,0.5
3,0.0,221.0,2.0,2,2.0,1.0,623.0,0,0,unfurnished,2.0,1.0,0.42
4,0.0,193.0,2.0,2,3.0,4.0,1016.0,0,0,unfurnished,2.0,0.0,1.3


In [264]:
X = df.drop(columns=['Price_in_Crore'])
y = df['Price_in_Crore']

In [265]:
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

**Ordinal Encoding**

In [271]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode)
    ],
    remainder='passthrough'
)

In [272]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [273]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [274]:
scores.mean(),scores.std()

(np.float64(0.386485175077041), np.float64(0.04504679835315389))

In [275]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [276]:
pipeline.fit(X_train,y_train)

In [277]:
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num', StandardScaler(),
                                                  ['bedRoom', 'bathroom',
                                                   'built_up_area',
                                                   'servant room',
                                                   'store room']),
                                                 ('cat', OrdinalEncoder(),
                                                  ['property_type', 'sector',
                                                   'balcony', 'agePossession',
                                                   'furnishing_type',
                                                   'luxury_category',
                                                   'floor_category'])])),
                ('regressor', LinearRegression())])

In [278]:
y_pred = pipeline.predict(X_test)

In [279]:
mean_absolute_error(np.expm1(y_test),y_pred)

1.0137775370538264

In [280]:
def scorer(model_name, model):

    output = []

    output.append(model_name)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

    output.append(scores.mean())

    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

    pipeline.fit(X_train,y_train)

    y_pred = pipeline.predict(X_test)

    y_pred = np.expm1(y_pred)

    output.append(mean_absolute_error(np.expm1(y_test),y_pred))

    return output


In [281]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [282]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [283]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [284]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.866177,0.308875
5,random forest,0.813392,0.356077
7,gradient boosting,0.77865,0.431612
6,extra trees,0.71578,0.460689
4,decision tree,0.65718,0.461791
8,adaboost,0.576809,0.646
9,mlp,0.486717,0.652189
1,svr,0.415286,0.752842
0,linear_reg,0.386485,0.755474
2,ridge,0.386488,0.755481


xgboost	0.866177(r2)	0.308875 cr (mean absolute error)

**OneHotEncoding**

In [293]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first', handle_unknown='ignore'),['sector','agePossession','furnishing_type'])
    ],
    remainder='passthrough'
)

In [294]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [295]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



In [296]:
scores.mean()

np.float64(0.7569223356025343)

In [297]:
scores.std()

np.float64(0.021871242339611505)

In [298]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [299]:
pipeline.fit(X_train,y_train)

In [300]:
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num', StandardScaler(),
                                                  ['bedRoom', 'bathroom',
                                                   'built_up_area',
                                                   'servant room',
                                                   'store room']),
                                                 ('cat', OrdinalEncoder(),
                                                  ['property_type', 'sector',
                                                   'balcony', 'agePossession',
                                                   'furnishing_type',
                                                   'luxury_category',
                                                   'floor_category']),
                                                 ('cat1',
                                                  OneHotEncoder(drop='first'),
                                                  ['sector', 'agePossession',
                                                   'furnishing_type'])])),
                ('regressor', LinearRegression())])

In [301]:
y_pred = pipeline.predict(X_test)



In [302]:
y_pred = np.expm1(y_pred)

In [303]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.46790451507292147

In [304]:
def scorer(model_name, model):

    output = []

    output.append(model_name)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

    output.append(scores.mean())

    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

    pipeline.fit(X_train,y_train)

    y_pred = pipeline.predict(X_test)

    y_pred = np.expm1(y_pred)

    output.append(mean_absolute_error(np.expm1(y_test),y_pred))

    return output

In [305]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [306]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [307]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [308]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.88198,0.283303
5,random forest,0.865998,0.283966
6,extra trees,0.873449,0.298368
4,decision tree,0.776229,0.336569
9,mlp,0.840028,0.340661
7,gradient boosting,0.801106,0.42162
0,linear_reg,0.756922,0.467905
2,ridge,0.756867,0.468721
8,adaboost,0.571321,0.637153
1,svr,0.415581,0.753566


xgboost	0.881980(r2 score)

0.283303 cr ( mean absolute error )

**OneHotEncoding With SVD**

In [320]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first', handle_unknown='ignore'),['sector','agePossession','furnishing_type'])
    ],
    remainder='passthrough'
)

In [324]:
from sklearn.decomposition import TruncatedSVD

# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('svd', TruncatedSVD(n_components=100)), # Set n_components to an integer
    ('regressor', LinearRegression())
])

In [325]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



### Determining the optimal `n_components` for TruncatedSVD

To choose an optimal integer value for `n_components` in `TruncatedSVD` to retain a certain percentage of variance, you can follow these steps:

1.  Apply the preprocessing steps to your data.
2.  Fit `TruncatedSVD` to the preprocessed data without specifying `n_components` initially.
3.  Examine the `explained_variance_ratio_` attribute of the fitted `TruncatedSVD` object. This attribute is an array where each value represents the percentage of variance explained by each component.
4.  Calculate the cumulative explained variance and determine the number of components needed to reach your desired percentage (e.g., 95%).
5.  Use this number as the integer value for `n_components` in your `TruncatedSVD` step within the pipeline.

In [326]:
scores.mean()

np.float64(0.7166366345710775)

In [327]:
scores.std()

np.float64(0.02224804792030666)

In [332]:
def scorer(model_name, model):

    output = []

    output.append(model_name)

    pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('svd', TruncatedSVD(n_components=100)), # Set n_components to an integer
    ('regressor', LinearRegression())
])

    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

    output.append(scores.mean())

    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

    pipeline.fit(X_train,y_train)

    y_pred = pipeline.predict(X_test)

    y_pred = np.expm1(y_pred)

    output.append(mean_absolute_error(np.expm1(y_test),y_pred))

    return output

In [333]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [334]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [335]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [336]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
9,mlp,0.71707,0.49082
10,xgboost,0.717471,0.492117
7,gradient boosting,0.716351,0.492812
0,linear_reg,0.716906,0.492833
2,ridge,0.713636,0.492834
1,svr,0.714896,0.492875
6,extra trees,0.714538,0.492947
5,random forest,0.712438,0.494613
4,decision tree,0.715941,0.49465
8,adaboost,0.712929,0.494781


Not give good result

**Target Encoder**

In [346]:
import category_encoders as ce

# Update columns to encode, removing 'sector' from ordinal encoding
columns_to_encode = ['property_type', 'balcony', 'agePossession', 'furnishing_type', 'sector','luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ],
    remainder='passthrough'
)

In [338]:
!pip install category_encoders



In [347]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [348]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



In [349]:
scores.mean(),scores.std()

(np.float64(-18.53329811238031), np.float64(23.372018697675564))

In [350]:
def scorer(model_name, model):

    output = []

    output.append(model_name)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

    output.append(scores.mean())

    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

    pipeline.fit(X_train,y_train)

    y_pred = pipeline.predict(X_test)

    y_pred = np.expm1(y_pred)

    output.append(mean_absolute_error(np.expm1(y_test),y_pred))

    return output


In [351]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [352]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [353]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [354]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.866052,0.31911
5,random forest,0.815324,0.356994
6,extra trees,0.753981,0.425342
7,gradient boosting,0.777222,0.434552
4,decision tree,0.63288,0.462503
8,adaboost,0.581768,0.63762
2,ridge,-18.421633,0.752463
0,linear_reg,-18.533298,0.752475
1,svr,0.365229,0.782992
3,LASSO,0.020734,0.941574


xgboost	0.866052(r2 score)

0.319110 cr ( mean ansolute error)

Hyperparameter Tuning

# Task
Tune the hyperparameters of the XGBoost model using GridSearchCV to minimize the Mean Absolute Error and improve the R2 score.

## Define the hyperparameter grid for xgboost

### Subtask:
Specify the range of hyperparameters to tune for the `XGBRegressor`.


**Reasoning**:
Define the hyperparameter grid for XGBoost.



In [362]:
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__learning_rate': [0.01, 0.1, 0.2],
    'regressor__max_depth': [3, 4, 5]
}

## Set up gridsearchcv

### Subtask:
Create a `GridSearchCV` object with the pipeline (including the preprocessor and XGBoost regressor), the hyperparameter grid, and cross-validation settings. We will use MAE as the scoring metric to minimize.


**Reasoning**:
Create a pipeline with the preprocessor and XGBoost regressor, then instantiate GridSearchCV to tune the hyperparameters using the defined parameter grid and cross-validation settings, using negative mean absolute error as the scoring metric.



In [369]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
import numpy as np

# Assuming columns_to_encode was intended for ordinal features,
# but we will explicitly list all features to be handled.

# Define the columns for different types of encoding
numerical_features = ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']

# Columns to be ordinally encoded (including previously unhandled ones)
ordinal_features = ['property_type', 'balcony', 'luxury_category', 'floor_category']

# Columns to be one-hot encoded
onehot_features = ['sector', 'agePossession', 'furnishing_type']

# Creating a column transformer for preprocessing
# Explicitly handling all columns and removing remainder='passthrough'
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat_ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ordinal_features),
        ('cat_onehot', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False), onehot_features)
    ]
    # Removed remainder='passthrough'
)


pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor())
])

param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__learning_rate': [0.01, 0.1, 0.2],
    'regressor__max_depth': [3, 4, 5]
}

grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=param_grid,
                           cv=5,
                           scoring='neg_mean_absolute_error',
                           n_jobs=-1)

## Fit gridsearchcv

### Subtask:
Fit the `GridSearchCV` object to the training data to find the best hyperparameters.


**Reasoning**:
Fit the GridSearchCV object to the training data to find the best hyperparameters.



In [370]:
# List of categorical columns to cast to object
categorical_cols_to_cast = ['property_type', 'balcony', 'luxury_category', 'floor_category', 'sector', 'agePossession', 'furnishing_type']

# Cast categorical columns to object dtype in X_train and X_test
X_train[categorical_cols_to_cast] = X_train[categorical_cols_to_cast].astype(object)
X_test[categorical_cols_to_cast] = X_test[categorical_cols_to_cast].astype(object)


grid_search.fit(X_train, y_train)

## Evaluate the tuned XGBoost model

### Subtask:
Evaluate the performance of the XGBoost model with the best hyperparameters found by `GridSearchCV` using the test data and calculate the Mean Absolute Error and R2 score.

**Reasoning**:
Evaluate the tuned XGBoost model.

In [372]:
# Get the best model from the grid search
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred_tuned = best_model.predict(X_test)

# Since the model was trained on the log-transformed target,
# we need to inverse transform the predictions to the original scale
y_pred_tuned_original_scale = np.expm1(y_pred_tuned)

# Inverse transform the actual test target values for evaluation on the original scale
y_test_original_scale = np.expm1(y_test)

# Calculate and display the Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test_original_scale, y_pred_tuned_original_scale)
print(f"Mean Absolute Error (MAE) of the tuned XGBoost model: {mae}")

# Calculate and display the R2 score
from sklearn.metrics import r2_score
r2 = r2_score(y_test_original_scale, y_pred_tuned_original_scale)
print(f"R2 Score of the tuned XGBoost model: {r2}")

Mean Absolute Error (MAE) of the tuned XGBoost model: 0.2749116463145302
R2 Score of the tuned XGBoost model: 0.8760835623408937




**Exporting the model**

In [373]:
# Define the columns for different types of encoding
numerical_features = ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']

# Columns to be ordinally encoded (including previously unhandled ones)
ordinal_features = ['property_type', 'balcony', 'luxury_category', 'floor_category']

# Columns to be one-hot encoded
onehot_features = ['sector', 'agePossession', 'furnishing_type']

# Creating a column transformer for preprocessing
# Explicitly handling all columns and removing remainder='passthrough'
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat_ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ordinal_features),
        ('cat_onehot', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False), onehot_features)
    ]
    # Removed remainder='passthrough'
)


In [374]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(n_estimators=500))
])

In [375]:
pipeline.fit(X,y_transformed)

In [376]:
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num', StandardScaler(),
                                                  ['bedRoom', 'bathroom',
                                                   'built_up_area',
                                                   'servant room',
                                                   'store room']),
                                                 ('cat', OrdinalEncoder(),
                                                  ['property_type', 'sector',
                                                   'balcony', 'agePossession',
                                                   'furnishing_type',
                                                   'luxury_category',
                                                   'floor_category']),
                                                 ('cat1',
                                                  OneHotEncoder(drop='first',
                                                                sparse_output=False),
                                                  ['sector',
                                                   'agePossession'])])),
                ('regressor', RandomForestRegressor(n_estimators=500))])

In [378]:
import pickle

with open('pipeline_01.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [380]:
with open('df_01.pkl', 'wb') as file:
    pickle.dump(X, file)

**Trying out the predictio**ns

In [381]:
X.columns

Index(['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category'],
      dtype='object')

In [383]:
X.iloc[0].values

array([np.float64(0.0), np.float64(168.0), np.float64(2.0), np.int64(2),
       np.float64(2.0), np.float64(4.0), np.float64(1260.0), np.int64(0),
       np.int64(0), 'unfurnished', np.float64(1.0), np.float64(0.0)],
      dtype=object)

In [385]:
data = [['house', 'sector 102', 4, 3, '3+', 'New Property', 2750, 0, 0, 'unfurnished', 'Low', 'Low Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

In [386]:
# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)


In [387]:
one_df

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,house,sector 102,4,3,3+,New Property,2750,0,0,unfurnished,Low,Low Floor


In [388]:
np.expm1(pipeline.predict(one_df))



array([2.2426345], dtype=float32)

In [389]:
X.dtypes

Unnamed: 0,0
property_type,float64
sector,float64
bedRoom,float64
bathroom,int64
balcony,float64
agePossession,float64
built_up_area,float64
servant room,int64
store room,int64
furnishing_type,object


In [390]:
sorted(X['sector'].unique().tolist())

[0.0,
 1.0,
 2.0,
 3.0,
 4.0,
 5.0,
 6.0,
 7.0,
 8.0,
 9.0,
 10.0,
 11.0,
 12.0,
 13.0,
 14.0,
 15.0,
 16.0,
 17.0,
 18.0,
 19.0,
 20.0,
 21.0,
 22.0,
 23.0,
 24.0,
 25.0,
 26.0,
 27.0,
 28.0,
 29.0,
 30.0,
 31.0,
 32.0,
 33.0,
 34.0,
 35.0,
 36.0,
 37.0,
 38.0,
 39.0,
 40.0,
 41.0,
 42.0,
 43.0,
 44.0,
 45.0,
 46.0,
 47.0,
 48.0,
 49.0,
 50.0,
 51.0,
 52.0,
 53.0,
 54.0,
 55.0,
 56.0,
 57.0,
 58.0,
 59.0,
 60.0,
 61.0,
 62.0,
 63.0,
 64.0,
 65.0,
 66.0,
 67.0,
 68.0,
 69.0,
 70.0,
 71.0,
 72.0,
 73.0,
 74.0,
 75.0,
 76.0,
 77.0,
 78.0,
 79.0,
 80.0,
 81.0,
 82.0,
 83.0,
 84.0,
 85.0,
 86.0,
 87.0,
 88.0,
 89.0,
 90.0,
 91.0,
 92.0,
 93.0,
 94.0,
 95.0,
 96.0,
 97.0,
 98.0,
 99.0,
 100.0,
 101.0,
 102.0,
 103.0,
 104.0,
 105.0,
 106.0,
 107.0,
 108.0,
 109.0,
 110.0,
 111.0,
 112.0,
 113.0,
 114.0,
 115.0,
 116.0,
 117.0,
 118.0,
 119.0,
 120.0,
 121.0,
 122.0,
 123.0,
 124.0,
 125.0,
 126.0,
 127.0,
 128.0,
 129.0,
 130.0,
 131.0,
 132.0,
 133.0,
 134.0,
 135.0,
 136.0,
 137.0,
 138.0