## Importing libraries 

In [None]:
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

## Importing the dataset

In [None]:
# Load the training set; the relative path is resolved against the current working directory.
train_csv_path = "house_prices_train_data.csv"
train_data = pd.read_csv(train_csv_path)

In [None]:
# Data overview: (number of rows, number of columns) of the loaded frame.
train_data.shape

## Understanding the data


In [None]:
# Preview the first five rows to see what the raw columns look like.
train_data.head()

In [None]:
# Column dtypes, non-null counts and memory usage.
train_data.info()

In [None]:
# Per-column flag: True when the column contains at least one missing value.
train_data.isna().any()

In [None]:
# List the column labels; later cells refer to columns by these names.
train_data.columns

In [None]:
# Print a frequency table for each of the listed input columns.
independent_attributes = [
    'POSTED_BY',
    'UNDER_CONSTRUCTION',
    'RERA',
    'BHK_NO.',
    'BHK_OR_RK',
    'SQUARE_FT',
    'READY_TO_MOVE',
    'RESALE',
]
for attribute in independent_attributes:
    counts = train_data[attribute].value_counts()
    print(counts)

In [None]:
# Preview again; the value-count loop only reads train_data, so the rows are unchanged.
train_data.head()

## Visualizing the features

In [None]:
# Visualizing the features: bar chart of row counts per POSTED_BY value.
train_data['POSTED_BY'].value_counts().plot.bar()

In [None]:
# Bar chart of row counts per BHK_NO. value.
bhk_no_counts = train_data['BHK_NO.'].value_counts()
bhk_no_counts.plot.bar()

In [None]:
# Bar chart of row counts per RESALE value.
train_data['RESALE'].value_counts().plot.bar()

In [None]:
# Bar chart of row counts per READY_TO_MOVE value.
train_data['READY_TO_MOVE'].value_counts().plot.bar()

In [None]:
# Bar chart of row counts per RERA value.
train_data['RERA'].value_counts().plot.bar()

In [None]:
# Bar chart of row counts per UNDER_CONSTRUCTION value.
train_data['UNDER_CONSTRUCTION'].value_counts().plot.bar()

In [None]:
# Summary of the BHK_OR_RK column.
train_data['BHK_OR_RK'].describe()

In [None]:
# Summary of the POSTED_BY column.
train_data['POSTED_BY'].describe()

In [None]:
# Summary statistics of the price column that is used as the prediction target later on.
train_data['TARGET(PRICE_IN_LACS)'].describe()

In [None]:
# Summary statistics for the frame (describe() with its default column selection).
train_data.describe()

In [None]:
# Preview the rows before the ADDRESS column is reduced below.
train_data.head()

In [None]:
# Keep only the last comma-separated component of each address.
# The .str accessor is vectorized and propagates missing values as NaN
# (the per-row lambda raised AttributeError on them). Surrounding whitespace
# is stripped so that "X, Place" and "Y,Place" collapse to the same value.
train_data['ADDRESS'] = (
    train_data['ADDRESS']
    .str.rsplit(',', n=1)
    .str[-1]
    .str.strip()
)
train_data.head()

## Correlation matrix

In [None]:
# Pairwise correlation (DataFrame.corr default method) between the numeric columns.
# Numeric columns are selected explicitly: with pandas >= 2.0, corr() raises on
# columns that cannot be converted to float instead of silently skipping them,
# and the categorical columns are only label-encoded further down.
corr_matrix = train_data.select_dtypes(include='number').corr()

# Explicit figure/axes so the heatmap is drawn on a known, correctly sized canvas.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap=sns.diverging_palette(220, 20, as_cmap=True),
    ax=ax,
)
ax.set_title('Correlation matrix of numeric columns')
plt.show()

In [None]:
# Disabled: dropping the coordinate columns. While these lines stay commented out,
# LONGITUDE and LATITUDE remain in train_data.
# train_data = train_data.drop("LONGITUDE", axis=1)
# train_data = train_data.drop("LATITUDE", axis =1)

In [None]:
# Check the frame after the ADDRESS column was shortened.
train_data.head()

In [None]:
# Number of distinct values left in the ADDRESS column.
train_data['ADDRESS'].nunique()

## Converting categorical values to numerical

In [None]:
#converting categorical values to numerical 
from sklearn.preprocessing import LabelEncoder
lr = LabelEncoder()
train_data.POSTED_BY =lr.fit_transform(train_data.POSTED_BY)
train_data.BHK_OR_RK =lr.fit_transform(train_data.BHK_OR_RK)
train_data.ADDRESS =lr.fit_transform(train_data.ADDRESS)

In [None]:
# Inspect the frame after POSTED_BY, BHK_OR_RK and ADDRESS were label-encoded.
train_data.head()

## Separating the features and target 

In [None]:
# Column-wise split: the price column is the prediction target,
# every remaining column is an input feature.
target = train_data['TARGET(PRICE_IN_LACS)']
features = train_data.drop(columns=['TARGET(PRICE_IN_LACS)'])

## Standardizing the features

In [None]:
#scaling the features
from sklearn.preprocessing import StandardScaler

# Standardize every feature column with a StandardScaler.
# NOTE(review): the scaler is fit on all rows before the train/test split below,
# so held-out rows contribute to the scaling statistics — confirm this is intended.
x_features = features
feature_scaler = StandardScaler()
feature_attr = feature_scaler.fit_transform(x_features)

In [None]:
# Print the shape of the feature frame, then of the target series.
for part in (features, target):
    print(part.shape)

## Building the model

In [None]:
from sklearn.model_selection import train_test_split

# 80% of the rows go to training, the rest is held out; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    feature_attr,
    target,
    train_size=0.8,
    random_state=69,
)

# Shapes, in the order: train features, train target, test features, test target.
for split_part in (x_train, y_train, x_test, y_test):
    print(split_part.shape)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [None]:
# Sanity check: show which container type train_test_split returned for the scaled features.
type(x_test)

## Random Forest Regressor

In [None]:
# Fit a random forest on the training split and score it on the held-out split.
# random_state pins the forest's internal randomness so the reported score is
# the same on every Restart & Run All.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(x_train, y_train)
predictions = rf_model.predict(x_test)
print("R-score of RandomForestRegressor: ",r2_score(y_test,predictions))

# Actual (x) vs. predicted (y) target values with a fitted regression line.
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
sns.regplot(x=y_test, y=predictions)

## Decision Tree Regressor

In [None]:
# Fit a single decision tree on the training split and score it on the held-out split.
# random_state fixes the tree's internal randomness so the score is reproducible,
# in line with the seeded split and the other seeded models in this notebook.
# Note: this rebinds `predictions`, replacing the random-forest predictions.
model = DecisionTreeRegressor(random_state=42)
model.fit(x_train, y_train)
predictions = model.predict(x_test)
print("R-score of DecisionTreeRegressor: ", r2_score(y_test,predictions))

## XGBRegressor

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees: 100 estimators, learning rate 0.05, fixed seed.
xgboost_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.05,
    random_state=42,
)
xgboost_model.fit(x_train, y_train)

# Score on the held-out split with the same R^2 metric as the other models.
preds = xgboost_model.predict(x_test)
xgb_r2 = r2_score(y_test, preds)
print('XG Boost: ', xgb_r2)