# Import Libraries and Data 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import os
import sklearn
from sklearn.cluster import KMeans 
import pylab as pl 

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
# Directory that holds the input data for this notebook.
# The HOUSING_DATA_DIR environment variable overrides the default, so the notebook
# can run on another machine without editing this cell.
path = os.environ.get('HOUSING_DATA_DIR', r'/Users/james/Desktop/Task6')

In [4]:
# Display the configured data directory to confirm it.
path

'/Users/james/Desktop/Task6'

In [5]:
# Load the housing data set from the data directory held in `path`,
# rather than repeating the absolute location here.
df = pd.read_csv(os.path.join(path, 'Housing.New.csv'))

# Data Check and Clean

In [6]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266383 entries, 0 to 266382
Data columns (total 21 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         266383 non-null  int64  
 1   State              266383 non-null  object 
 2   City               266383 non-null  object 
 3   Postal_Code        266383 non-null  int64  
 4   Year_Constructed   266383 non-null  float64
 5   Total_Rent         266383 non-null  float64
 6   Base_Rent          266383 non-null  float64
 7   Service_Charge     266383 non-null  float64
 8   Living_Space_(m2)  266383 non-null  float64
 9   N_Rooms            266383 non-null  float64
 10  Heating_Type       222093 non-null  object 
 11  Condition          198642 non-null  object 
 12  Balcony            266383 non-null  bool   
 13  Price_Trend        266383 non-null  float64
 14  Has_Kitchen        266383 non-null  bool   
 15  Lift               266383 non-null  bool   
 16  Fl

In [7]:
# Remove the columns that are not used for clustering.
columns_to_drop = [
    'Unnamed: 0',
    'State',
    'City',
    'Postal_Code',
    'Heating_Type',
    'Condition',
    'Has_Kitchen',
    'Balcony',
    'Lift',
    'Flat_Type',
    'Garden',
    'Rent_range',
]
df = df.drop(columns=columns_to_drop)

In [8]:
# Preview the first rows to check which columns remain after the drop.
df.head()

Unnamed: 0,Year_Constructed,Total_Rent,Base_Rent,Service_Charge,Living_Space_(m2),N_Rooms,Price_Trend,Floor,Price_per_m2
0,1965.0,840.0,595.0,245.0,86.0,4.0,4.62,1.0,6.918605
1,1871.0,895.331385,800.0,134.0,89.0,3.0,3.47,0.0,8.988764
2,2019.0,1300.0,965.0,255.0,83.8,3.0,2.72,3.0,11.515513
3,1964.0,895.331385,343.0,58.15,58.15,3.0,1.53,3.0,5.898538
4,1950.0,903.0,765.0,138.0,84.97,3.0,2.46,1.0,9.003178


In [9]:
# Row and column counts of the reduced frame.
df.shape

(266383, 9)

# Elbow Technique 

In [10]:
# Candidate cluster counts for the elbow analysis (1 through 9).
num_cl = range(1, 10)

# One k-means model per candidate count. n_init is set explicitly, matching the
# model built later in this notebook, and random_state is fixed so the elbow
# curve is the same on every run.
kmeans = [KMeans(n_clusters=i, n_init=10, random_state=42) for i in num_cl]

In [None]:
# Fit each candidate model on df and record its score. KMeans.score returns the
# negative of the k-means objective, so values closer to zero mean tighter clusters.
score = [model.fit(df).score(df) for model in kmeans]

score



In [None]:
# Draw the elbow curve (score against number of clusters) with PyLab.

fig, ax = pl.subplots()
ax.plot(num_cl, score)
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Score')
ax.set_title('Elbow Curve')
pl.show()

In [None]:
# The score rises noticeably between 2 and 3 clusters on the x-axis, which marks a clear change in how well the clusters describe the data. This indicates that the most suitable number of clusters is likely to be 3.

# K - Means Clustering 

In [None]:
# Create the k-means object.
# n_init is set explicitly and random_state is fixed so repeated runs give the same clusters.
# NOTE(review): the elbow commentary above points to 3 clusters, but 2 are used here — confirm.

kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)

In [None]:
# Fit the k-means object to every column currently in df.
# NOTE(review): the next cell calls fit_predict on this same object, which fits it
# again, so this separate fit looks redundant — confirm before removing.

kmeans.fit(df)

In [None]:
# Store each row's cluster label in df['clusters'].
# Fit on the feature columns only: if this cell is re-run, the label columns added
# by this notebook ('clusters', and later 'cluster') must not be fed back into the model.
cluster_features = df.drop(columns=['clusters', 'cluster'], errors='ignore')
df['clusters'] = kmeans.fit_predict(cluster_features)

In [None]:
# Rebuild the model with an explicit n_init and a fixed random_state for reproducibility.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)

# Fit on the feature columns only. df already holds a 'clusters' column at this point,
# and fitting on it would leak the earlier labels into the model.
cluster_features = df.drop(columns=['clusters', 'cluster'], errors='ignore')

# Keep df['clusters'] in step with this model, so that kmeans.labels_ (used to colour
# the plots) and df['clusters'] (used for the summary) refer to the same assignment.
df['clusters'] = kmeans.fit_predict(cluster_features)

In [None]:
# Preview the first rows to confirm the new 'clusters' column.
df.head()

In [None]:
# Number of rows assigned to each cluster label.
df['clusters'].value_counts()

In [None]:
# Scatter "Total_Rent" against "Living_Space_(m2)", coloured by k-means cluster label.

fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    x=df['Total_Rent'],
    y=df['Living_Space_(m2)'],
    hue=kmeans.labels_,
    s=50,
    alpha=0.7,
    ax=ax,
)
ax.grid(False)
ax.set_xlabel('Total_Rent')
ax.set_ylabel('Living_Space_(m2)')
ax.set_title('Clusters of Rental Properties')

# Limit the visible range: living space from 0 to 600, total rent from 0 to 10000.
ax.set_ylim(0, 600)
ax.set_xlim(0, 10000)

plt.show()

In [None]:
# Scatter "Total_Rent" against "Year_Constructed", coloured by k-means cluster label.

fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    x=df['Total_Rent'],
    y=df['Year_Constructed'],
    hue=kmeans.labels_,
    s=100,
    ax=ax,
)

ax.grid(False)
ax.set_xlabel('Total_Rent')
ax.set_ylabel('Year_constructed')
plt.show()

In [None]:
# Scatter "Total_Rent" against "Base_Rent", coloured by k-means cluster label.

fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    x=df['Total_Rent'],
    y=df['Base_Rent'],
    hue=kmeans.labels_,
    s=100,
    ax=ax,
)

ax.grid(False)
ax.set_xlabel('Total_Rent')
ax.set_ylabel('Base_Rent')
plt.show()

In [None]:
# Scatter "Total_Rent" against "N_Rooms", coloured by k-means cluster label.

fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    x=df['Total_Rent'],
    y=df['N_Rooms'],
    hue=kmeans.labels_,
    s=100,
    ax=ax,
)

ax.grid(False)
ax.set_xlabel('Total_Rent')
ax.set_ylabel('N_Rooms')
plt.show()

In [None]:
# Scatter "Living_Space_(m2)" against "Price_per_m2", coloured by k-means cluster label.

fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    x=df['Living_Space_(m2)'],
    y=df['Price_per_m2'],
    hue=kmeans.labels_,
    s=100,
    ax=ax,
)

ax.grid(False)
ax.set_xlabel('Living_Space_(m2)')
ax.set_ylabel('Price_per_m2')
plt.show()

In [None]:
# Give each numeric cluster label a colour name in a new 'cluster' column.
# Rows whose label is not listed here are left without a colour name.
cluster_colours = {1: 'Blue', 0: 'Orange'}
for cluster_label, colour_name in cluster_colours.items():
    df.loc[df['clusters'] == cluster_label, 'cluster'] = colour_name

In [None]:
# Preview the first rows to confirm the new 'cluster' colour-name column.
df.head()

In [None]:
# Mean and median of the key metrics for each colour-named cluster.
summary_stats = ['mean', 'median']
summary_columns = [
    'Total_Rent',
    'Price_Trend',
    'Living_Space_(m2)',
    'N_Rooms',
    'Service_Charge',
    'Price_per_m2',
]
df.groupby('cluster').agg({column: list(summary_stats) for column in summary_columns})

In [None]:
# The purple cluster stands out with superior performance across various categories. It exhibits high rental prices and demonstrates a positive price trend. Additionally, the maintenance fee associated with this cluster is notably higher than in the others. NOTE(review): the clusters above are labelled 'Blue' and 'Orange' — confirm which cluster "purple" refers to.