In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings(action='ignore')
# Input data files are available in the read-only "../input/" directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/delhi-house-price-prediction/MagicBricks.csv


In [2]:
data = pd.read_csv('/kaggle/input/delhi-house-price-prediction/MagicBricks.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Area         1259 non-null   float64
 1   BHK          1259 non-null   int64  
 2   Bathroom     1257 non-null   float64
 3   Furnishing   1254 non-null   object 
 4   Locality     1259 non-null   object 
 5   Parking      1226 non-null   float64
 6   Price        1259 non-null   int64  
 7   Status       1259 non-null   object 
 8   Transaction  1259 non-null   object 
 9   Type         1254 non-null   object 
 10  Per_Sqft     1018 non-null   float64
dtypes: float64(4), int64(2), object(5)
memory usage: 108.3+ KB


In [4]:
def onehot_encode(df,column,rename=False):
    df=df.copy()
    if rename==True:
        df[column]=df[column].replace({x:i for i,x in enumerate(df[column].unique())})
    dummies = pd.get_dummies(df[column], prefix=column)
    for c in dummies.columns:
        dummies[c]= dummies[c].replace({False:0,True:1})
    df=pd.concat([df,dummies],axis=1)    
    df=df.drop(column,axis=1)
    
    return df
    

In [5]:
def preprocess_inputs(df):
    df=df.copy()
    df=df.drop("Per_Sqft",axis=1)
    
    #dealing with missing values
    #i chose to round the mean of parking vallues since mode was 1 but mean was 1.67 so roundng
    #the mean made more sense
    df['Type'] = df['Type'].fillna(df['Type'].mode()[0])
    df['Parking'] = df['Parking'].fillna(df['Parking'].mode()[0]);
    df['Bathroom'] = df['Bathroom'].fillna(df['Bathroom'].mode()[0])
    
    #binary encoding the transaction/status/Type columns since they have only two values
    df['Status']=df['Status'].replace({
        'Almost_ready':0,
        'Ready_to_move':1
    })
    df['Transaction']=df['Transaction'].replace({
        'New_Property':0,
        'Resale':1
    })
    df['Type']=df['Type'].replace({
        'Apartment':1,
        'Builder_Floor':0
    })   
    
    #one_hot_encoding multi-categorical values
    df=onehot_encode(df,column="Furnishing")
    df=onehot_encode(df,column="Locality",rename=True)  
    
    #splitting df int X and y
    y=df['Price']
    X=df.drop('Price',axis=1)
    
    #train_test_split
    X_train, X_test,y_train,y_test = train_test_split(X,y,train_size=0.7,shuffle=True,random_state=1)
    
    #scale X
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train),index=X_train.index,columns= X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test),index=X_test.index,columns= X_test.columns)
    return X_train, X_test,y_train,y_test



In [6]:
X_train, X_test,y_train,y_test = preprocess_inputs(data)

# **TRAINING**


In [7]:
models = {
    "                     Linear Regression": LinearRegression(),
    " Linear Regression (L2 Regularization)": Ridge(),
    " Linear Regression (L1 Regularization)": Lasso(),
    "                   K-Nearest Neighbors": KNeighborsRegressor(),
    "                        Neural Network": MLPRegressor(),
    "Support Vector Machine (Linear Kernel)": LinearSVR(),
    "   Support Vector Machine (RBF Kernel)": SVR(),
    "                         Decision Tree": DecisionTreeRegressor(),
    "                         Random Forest": RandomForestRegressor(),
    "                     Gradient Boosting": GradientBoostingRegressor(),
    "                               XGBoost": XGBRegressor(),
    "                              LightGBM": LGBMRegressor(),
    "                              CatBoost": CatBoostRegressor(verbose=0)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

                     Linear Regression trained.
 Linear Regression (L2 Regularization) trained.
 Linear Regression (L1 Regularization) trained.
                   K-Nearest Neighbors trained.
                        Neural Network trained.
Support Vector Machine (Linear Kernel) trained.
   Support Vector Machine (RBF Kernel) trained.
                         Decision Tree trained.
                         Random Forest trained.
                     Gradient Boosting trained.
                               XGBoost trained.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002645 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 170
[LightGBM] [Info] Number of data points in the train set: 881, number of used features: 13
[LightGBM] [Info] Start training from score 20935561.861521
                              LightGBM trained.
              

In [8]:
for name, model in models.items():
    print(name + "R^2 score {:.5f}".format(model.score(X_test,y_test)))

                     Linear RegressionR^2 score -17785645789648935461060608.00000
 Linear Regression (L2 Regularization)R^2 score 0.67672
 Linear Regression (L1 Regularization)R^2 score 0.67647
                   K-Nearest NeighborsR^2 score 0.59440
                        Neural NetworkR^2 score -0.62240
Support Vector Machine (Linear Kernel)R^2 score -0.62248
   Support Vector Machine (RBF Kernel)R^2 score -0.07453
                         Decision TreeR^2 score 0.70969
                         Random ForestR^2 score 0.80716
                     Gradient BoostingR^2 score 0.84460
                               XGBoostR^2 score 0.87506
                              LightGBMR^2 score 0.77550
                              CatBoostR^2 score 0.85818


In [9]:
model = XGBRegressor(n_estimators=200)
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.8769676510321107