In [1]:
# Dataset being imported from preprocessing_and_feature_engineering file
%store -r dataset_processed

In [2]:
dataset_final = dataset_processed.copy()
dataset_final

Unnamed: 0,BHK,Rent,Size,Floor,Area Type,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact,Day,Month
0,2.0,10000.0,1100.0,0,Super Area,Kolkata,Unfurnished,Bachelors/Family,2.0,Contact Owner,18,5
1,2.0,20000.0,800.0,1,Super Area,Kolkata,Semi-Furnished,Bachelors/Family,1.0,Contact Owner,13,5
2,2.0,17000.0,1000.0,1,Super Area,Kolkata,Semi-Furnished,Bachelors/Family,1.0,Contact Owner,16,5
3,2.0,10000.0,800.0,1,Super Area,Kolkata,Unfurnished,Bachelors/Family,1.0,Contact Owner,4,7
4,2.0,7500.0,850.0,1,Carpet Area,Kolkata,Unfurnished,Bachelors,1.0,Contact Owner,9,5
...,...,...,...,...,...,...,...,...,...,...,...,...
4741,2.0,15000.0,1000.0,3,Carpet Area,Hyderabad,Semi-Furnished,Bachelors/Family,2.0,Contact Owner,18,5
4742,3.0,29000.0,2000.0,1,Super Area,Hyderabad,Semi-Furnished,Bachelors/Family,3.0,Contact Owner,15,5
4743,3.0,35000.0,1750.0,3,Carpet Area,Hyderabad,Semi-Furnished,Bachelors/Family,3.0,Contact Agent,10,7
4744,3.0,45000.0,1500.0,2,Carpet Area,Hyderabad,Semi-Furnished,Family,2.0,Contact Agent,6,7


In [3]:
dataset_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4746 entries, 0 to 4745
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   BHK                4746 non-null   float64
 1   Rent               4746 non-null   float64
 2   Size               4746 non-null   float64
 3   Floor              4746 non-null   int64  
 4   Area Type          4746 non-null   object 
 5   City               4746 non-null   object 
 6   Furnishing Status  4746 non-null   object 
 7   Tenant Preferred   4746 non-null   object 
 8   Bathroom           4746 non-null   float64
 9   Point of Contact   4746 non-null   object 
 10  Day                4746 non-null   int32  
 11  Month              4746 non-null   int32  
dtypes: float64(4), int32(2), int64(1), object(5)
memory usage: 408.0+ KB


In [4]:
# Checking for null values
dataset_final.isna().any()

BHK                  False
Rent                 False
Size                 False
Floor                False
Area Type            False
City                 False
Furnishing Status    False
Tenant Preferred     False
Bathroom             False
Point of Contact     False
Day                  False
Month                False
dtype: bool

In [5]:
import numpy as np
import pandas as pd

In [6]:
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [7]:
categorical_columns = dataset_final.select_dtypes(include=object).columns.to_list()
categorical_columns

['Area Type',
 'City',
 'Furnishing Status',
 'Tenant Preferred',
 'Point of Contact']

In [8]:
numerical_columns = dataset_final.select_dtypes(include=np.number).drop("Rent", axis=1).columns.to_list()
numerical_columns

['BHK', 'Size', 'Floor', 'Bathroom', 'Day', 'Month']

### One hot encoding the categorical variables

In [9]:
# Making pipeline for preprocessing of data
scaler_transformer = StandardScaler()

categorical_transformer = Pipeline(
    steps=[
        ("One_hot_encoder", OneHotEncoder())
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_transformer, categorical_columns),
        ("scaler", scaler_transformer, numerical_columns)
    ]
)

In [10]:
# Independent Variables
X = dataset_final.drop("Rent", axis=1)

# Dependent Variables
y = dataset_final["Rent"]

In [11]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [12]:
# For Comparision between two regression model
Random_forest_Regressor = RandomForestRegressor()
Linear_Regressor = LinearRegression()

In [13]:
# Function to choose between two regression model
def model_fit(model: str="Random_forest"):
    if model == "Random_forest":
        model = Random_forest_Regressor
    
    elif model == "Linear_Regression":
        model = Linear_Regressor
    
    else:
        raise Exception("Only have option of two: \n i. Random_forest \n ii. Linear_Regression")

    pipeline = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("model", model)
        ]
    )

    return pipeline.fit(X_train, y_train)

In [14]:
def evaluation_metric(pipeline):
    from sklearn.model_selection import cross_val_score
    from sklearn.metrics import mean_absolute_error, mean_squared_error

    print(f"The Accuracy of the model is: {round(pipeline.score(X_test, y_test) * 100, 3)} %")
    print("----------------------------------------------------------------")
    scores = cross_val_score(pipeline, X_test, y_test, scoring='r2', cv=5)
    print(f"Cross Validation score of 5 kfold: {scores}")
    print(f"Average cross validation score: {np.mean(scores)}")
    print("----------------------------------------------------------------")
    y_pred = pipeline.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    print(f"Mean Absolute Error: {mae}")
    mse = mean_squared_error(y_test, y_pred)
    print(f"Mean Squared Error: {mse}")

In [15]:
# For Random Forest
random_forest_pipeline = model_fit("Random_forest")
random_forest_pipeline

In [16]:
# Evaluation metric of Random Forest
evaluation_metric(random_forest_pipeline)

The Accuracy of the model is: 70.494 %
----------------------------------------------------------------
Cross Validation score of 5 kfold: [0.67760295 0.67752317 0.638323   0.6273291  0.63842672]
Average cross validation score: 0.6518409887152586
----------------------------------------------------------------
Mean Absolute Error: 5189.117307645222
Mean Squared Error: 58690627.04218715


In [17]:
# For Linear Regression
linear_regression_pipeline = model_fit("Linear_Regression")
linear_regression_pipeline

In [18]:
# Evaluation metric of Linear Regression
evaluation_metric(linear_regression_pipeline)

The Accuracy of the model is: 63.868 %
----------------------------------------------------------------
Cross Validation score of 5 kfold: [0.66891036 0.6046956  0.61882724 0.58083814 0.64796175]
Average cross validation score: 0.6242466174231003
----------------------------------------------------------------
Mean Absolute Error: 6120.254308337215
Mean Squared Error: 71868871.35276999


Random Forest gave better evaluation metric than Linear Regression. So, choosing Random Forest model would be better.

In [19]:
model_pipeline = random_forest_pipeline
model_pipeline

### Real Time Prediction

In [20]:
# User Input
def user_input():
    from datetime import datetime

    def check_valid_input(input_number: int, v_range: int, option: list):
        valid_range = [i for i in range(1, v_range)]

        if input_number in valid_range:
            return option[input_number-1]
        
        else:
            raise Exception("Invalid choice. Please choose among the option")
            

    month = int(datetime.today().strftime('%m'))
    day = int(datetime.today().strftime('%d'))

    bhk = int(input("What BHK would you prefer:"))
    size = int(input("What size of house would you prefer:"))
    bathroom = int(input("What number of bathroom would you prefer:"))
    floor = int(input("Which floor would you prefer:"))

    area_type = int(input("Choose amongst the following area type: \n 1. Super Area \n 2. Carpet Area \n 3. Built Area"))
    area_type = check_valid_input(input_number=area_type, v_range=4, option=["Super Area", "Carpet Area", "Built Area"])

    city = int(input("Choose amongst the following City: \n 1. Kolkata \n 2. Mumbai \n 3. Bangalore \n 4. Delhi \n 5. Chennai \n 6. Hyderbad"))
    city = check_valid_input(input_number=city, v_range=7, option=["Kolkata", "Mumbai", "Bangalore", "Delhi", "Chennai", "Hyderabad"])

    furnishing = int(input("Choose amongst the following furnishing status: \n 1. Unfurnished \n 2. Semi-Furnished \n 3. Furnished"))
    furnishing = check_valid_input(input_number=furnishing, v_range=4, option=["Unfurnished", "Semi-Furnished", "Furnished"])

    tenant = int(input("Choose amongst the following tenant preferred: \n 1. Bachelors \n 2. Family \n 3. Either"))
    tenant = check_valid_input(input_number=tenant, v_range=4, option=["Bachelors", "Family", "Bachelors/Family"])

    point_of_contact = int(input("Choose amongst the following point of contact preferred: \n 1. Contact Owner \n 2. Contact Agent \n 3. Contact Builder"))
    point_of_contact = check_valid_input(input_number=point_of_contact, v_range=4, option=["Contact Owner", "Contact Agent", "Contact Builder"])

    return pd.DataFrame(
                        data=[
                            [bhk, size, bathroom, day, month, floor, area_type, city, furnishing, tenant, point_of_contact]
                        ], 
                        columns=[
                            "BHK", "Size", "Bathroom", "Day", "Month", "Floor", 
                            "Area Type", "City", "Furnishing Status", "Tenant Preferred", "Point of Contact"
                        ]
                )

In [21]:
input_to_be_predicted = user_input()
input_to_be_predicted

Unnamed: 0,BHK,Size,Bathroom,Day,Month,Floor,Area Type,City,Furnishing Status,Tenant Preferred,Point of Contact
0,5,850,3,29,9,3,Super Area,Delhi,Furnished,Bachelors/Family,Contact Agent


In [22]:
def predict_rent(data: pd.DataFrame, model) -> int:
    return model.predict(data)[0]

In [23]:
print(f"The Rent for your preferred choice is Rs {round(predict_rent(data=input_to_be_predicted, model=model_pipeline), 3)}")

The Rent for your preferred choice is Rs 37448.517
