Data Loading

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

# Read the rental listings CSV from the working directory into a DataFrame
data = pd.read_csv(filepath_or_buffer="House_Rent_Dataset.csv")

Exploratory Data Analysis (EDA) and Visualization

In [None]:
# Number of listings recorded under each area type
area_type_counts = data['Area Type'].value_counts()
print('Distribution of Area Types: \n{}'.format(area_type_counts))

# Bar chart of the listing count per area type (one bar per category)
sns.set_style('darkgrid')
fig, axes = plt.subplots(figsize=(12, 8))
ax = sns.countplot(
    data=data,
    x='Area Type',
    palette=['#b644e3', '#6ecc64', '#e3784d'],
    ax=axes,
)
ax.set_title('Distribution of Area Type', fontsize=15)

# Number of listings available in each city
city_counts = data['City'].value_counts()
print('Distribution of Houses per cities: \n{}'.format(city_counts))

# Bar chart of the listing count per city
sns.set_style('darkgrid')
fig, axes = plt.subplots(figsize=(12, 8))
ax = sns.countplot(data=data, x='City', palette='coolwarm', ax=axes)
ax.set_title('Houses in Cities available for Rent', fontsize=15)

# Number of listings per furnishing status
furnishing_counts = data['Furnishing Status'].value_counts()
print('Housing Status: \n{}'.format(furnishing_counts))

# Bar chart of the listing count per furnishing status
sns.set_style('darkgrid')
fig, axes = plt.subplots(figsize=(12, 8))
ax = sns.countplot(
    data=data,
    x='Furnishing Status',
    palette=['#87ace8', '#6ecc64', '#EAE509'],
    ax=axes,
)
ax.set_title('Furnishing Status Distribution for House Renting', fontsize=15)

# Share (in percent) of listings per preferred tenant type.
# The counts are computed once and reused for both the printout and the chart.
tenant_counts = data['Tenant Preferred'].value_counts()
tenant_share_pct = tenant_counts / len(data['Tenant Preferred']) * 100
print("Percentage of Tenancy Types Preferred for rent: \n{}".format(tenant_share_pct))

# Pie chart of the preferred tenant types, drawn on an explicitly created axes
colors = ['#87ace8', '#6ecc64', '#EAE509']
fig, ax = plt.subplots(figsize=(12, 8))
tenant_counts.plot.pie(
    ax=ax,
    # One offset per slice: matplotlib raises ValueError when the length of
    # `explode` differs from the number of wedges, so derive it from the data
    # instead of hard-coding three entries.
    explode=[0.05] * len(tenant_counts),
    textprops={'fontsize': 12},
    autopct='%1.2f%%',
    colors=colors,
    shadow=True,
)
ax.set_title('Pie Chart for Tenant Preferred', fontsize=15)

# Number of listings for each bathroom count
bathroom_counts = data['Bathroom'].value_counts()
print('Frequency of Bathrooms available in houses for rent: \n{}'.format(bathroom_counts))

# Bar chart of how often each bathroom count occurs
sns.set_style('darkgrid')
fig, axes = plt.subplots(figsize=(12, 8))
ax = sns.countplot(data=data, x='Bathroom', palette='coolwarm', ax=axes)
ax.set_title('Frequency of Bathrooms available in house for rent:', fontsize=15)

# Share (in percent) of listings per point of contact.
# The counts are computed once and reused for both the printout and the chart.
contact_counts = data['Point of Contact'].value_counts()
contact_share_pct = contact_counts / len(data['Point of Contact']) * 100
print('Proportion of Middlemen agents Renters should contact:\n{}'.format(contact_share_pct))

# Pie chart of the point-of-contact categories, drawn on an explicitly created axes
colors = ['#87ace8', '#b644e3', '#EAE509']
fig, ax = plt.subplots(figsize=(12, 8))
contact_counts.plot.pie(
    ax=ax,
    # One offset per slice: matplotlib raises ValueError when the length of
    # `explode` differs from the number of wedges, so derive it from the data
    # instead of hard-coding three entries.
    explode=[0.05] * len(contact_counts),
    textprops={'fontsize': 12},
    autopct='%1.2f%%',
    colors=colors,
    shadow=True,
)
# The title names the column actually plotted (it previously read "Area Type").
ax.set_title('Pie Chart for Point of Contact', fontsize=15)


Data Transformation and Model Building

In [None]:
# Drop the columns that are not used as model features. The result is stored
# under a new name instead of overwriting `data`, so the loaded frame keeps all of
# its columns and this cell can be re-run without a KeyError. errors='ignore'
# additionally tolerates a `data` frame from which the columns are already gone.
model_data = data.drop(
    columns=['Posted On', 'Area Locality', 'Floor'],
    errors='ignore',
)

# One-hot encode the categorical string columns
housing_data = pd.get_dummies(
    model_data,
    columns=['Area Type', 'City', 'Furnishing Status', 'Tenant Preferred', 'Point of Contact'],
)

# Features are every column except the target; the target is 'Rent'
x = housing_data.drop(columns=['Rent'])
y = housing_data['Rent']

# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.10, random_state=2
)

# Fit an ordinary least-squares linear regression and predict the held-out rows
lr = LinearRegression()
lr.fit(x_train, y_train)
lr_prediction = lr.predict(x_test)

# LinearRegression.score returns the coefficient of determination (R^2), which
# is not a classification accuracy, so the printed label says what it is.
score = lr.score(x_test, y_test)
print('Test R^2 score (%):', score * 100)


Model Evaluation and Metrics

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Predictions of the fitted model on the held-out rows
y_preds = lr.predict(x_test)

# Fitted weight of each feature column
print("Coefficients: \n", lr.coef_)

# Mean squared error between the true and the predicted target values
mse = mean_squared_error(y_test, y_preds)
print("Mean squared error: %.2f" % mse)

# Coefficient of determination (R-squared) on the held-out rows
r_squared = r2_score(y_test, y_preds)
print("Coefficient of determination: %.2f" % r_squared)


Preprocessing

In [None]:
# Data Preprocessing

# Parse 'Posted On' into datetimes. The conversion only runs when the column is
# present: indexing a `data` frame that no longer carries it raises a KeyError and
# stops a top-to-bottom run of the notebook. `assign` rebinds `data` to a new frame
# rather than mutating the existing one in place, and re-running it is harmless.
if 'Posted On' in data.columns:
    data = data.assign(**{'Posted On': pd.to_datetime(data['Posted On'])})

# One-hot encode the categorical string columns
housing_data = pd.get_dummies(
    data,
    columns=['Area Type', 'City', 'Furnishing Status', 'Tenant Preferred', 'Point of Contact'],
)

# Features are every column except the target; the target is 'Rent'.
# NOTE(review): this rebinds housing_data, x, y and the train/test splits. A model
# fitted earlier under these names may no longer match the new feature columns;
# confirm the intended cell order.
x = housing_data.drop(columns=['Rent'])
y = housing_data['Rent']

# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.10, random_state=2
)