In [None]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt 
import numpy as np 
import os 
import pandas as pd
import seaborn as sns

In [None]:
# Read the raw housing data; `dx` stays untouched as the original frame,
# while `df` is an independent copy that later cells modify.
dx=pd.read_csv("housing.csv")
df= dx.copy()
# Preview the first five rows.
df.head(5)

In [None]:
# Summary statistics for the working frame.
df.describe()

In [None]:
# Correlation heatmap. `ocean_proximity` is still a text column at this point
# (it is only encoded further down), and recent pandas versions raise when
# corr() meets non-numeric data, so restrict to numeric columns explicitly.
# select_dtypes works across pandas versions, unlike the numeric_only keyword.
sns.heatmap(df.select_dtypes(include="number").corr())

In [None]:
# List the column labels of the working frame.
df.columns

In [None]:
# One 50-bin histogram per column on a single large figure.
df.hist(bins=50,figsize=(20,15))
plt.show()

In [None]:
# Bar plot of house value per ocean_proximity category.
x=df.ocean_proximity
y=df.median_house_value
# Pass x/y by keyword: newer seaborn releases no longer accept them
# positionally, and the keyword form works on older releases as well.
sns.barplot(x=x, y=y)

In [None]:
# 2x2 grid of longitude/latitude scatter plots, each coloured by one column.
x=df.longitude
y=df.latitude
fig,ax=plt.subplots(2,2,figsize=(25,20))
fig.suptitle("data - price Map")

# Columns used as hue, in row-major panel order (top-left to bottom-right).
hue_columns = ["median_house_value", "ocean_proximity", "median_income", "population"]
for panel, hue_column in zip(ax.ravel(), hue_columns):
    # Each panel is titled with the column that drives its colouring.
    panel.set_title(hue_column)
    sns.scatterplot(x=x, y=y, hue=df[hue_column], ax=panel)

In [None]:
# median_income vs. households, coloured by ocean_proximity
x=df.median_income
y=df.households
# x/y must be keywords: newer seaborn releases reject positional data vectors.
sns.scatterplot(x=x, y=y, hue=df["ocean_proximity"])

In [None]:
# Derived ratio features: new column -> (numerator column, denominator column).
# Dict order fixes the order in which the columns are appended to df.
ratio_columns = {
    "rooms_per_household": ("total_rooms", "households"),
    "bedrooms_per_room": ("total_bedrooms", "total_rooms"),
    "population_per_householed": ("population", "households"),
}

for new_column, (numerator, denominator) in ratio_columns.items():
    df[new_column] = df[numerator] / df[denominator]

df.head()

In [None]:
# Report how many missing (null/NaN) values each column holds, then keep only
# the rows that are complete in every column.
missing_per_column = df.isna().sum()
print(missing_per_column)
df = df.dropna(axis=0, how="any")

In [None]:
# Encoding the categorical column: pull `ocean_proximity` out as its own
# single-column DataFrame (double brackets keep it 2-D) and preview it.

df_ocean=df[["ocean_proximity"]]
df_ocean.head()

In [None]:
# encoding1 - label encoding: replace each ocean_proximity category with an integer code

from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
# LabelEncoder works on a 1-D array of labels; hand it the column itself
# rather than the single-column DataFrame to avoid the implicit reshape.
label_encoded = label_encoder.fit_transform(df_ocean["ocean_proximity"])
df['ocean_proximity'] = label_encoded

print(label_encoded)
# Show which original category each integer code stands for. A bare expression
# in the middle of a cell displays nothing, so print it explicitly.
print(label_encoder.classes_)
# Pass the column as `x`: a positional Series is not reliably interpreted as
# the x variable across seaborn versions.
sns.countplot(x=df['ocean_proximity'])

In [None]:
# encoding2 - astype('category').cat.codes
# Alternative integer encoding via the pandas categorical dtype. The result is
# only displayed here; it is not written back to df.

df['ocean_proximity'].astype('category').cat.codes

In [None]:
# encoding3 - frequency encoding

# Share of rows that fall into each ocean_proximity category.
fq_encoder = df.groupby('ocean_proximity').size()/len(df)
# Map every row's category to its share. The column name was previously
# garbled to 'nocean_proximityom_0_freq_encode'; name it after the column it encodes.
df.loc[:, "{}_freq_encode".format('ocean_proximity')] = df['ocean_proximity'].map(fq_encoder)

fq_encoder.plot.bar(stacked=True)

In [None]:
# Preview the frame with the derived and encoded columns added above.
df.head()

In [None]:
# Positional split: the first ten columns become `data`, the eleventh `target`.
# NOTE(review): this depends on the column order of housing.csv plus the
# columns appended above - confirm the eleventh column is the intended target.
data, target = df.iloc[:, :10], df.iloc[:, 10]

# Columns fed to the scalers and clustering models below.
features = ['longitude', 'latitude', 'median_house_value']
selected_df = data.loc[:, features]


In [None]:
# scaling1 - standard scaler
# Fit on the selected features and wrap the scaled array in a DataFrame
# (default integer index and column labels).

ss_scaler = StandardScaler()
ss_data = pd.DataFrame(ss_scaler.fit_transform(selected_df))

In [None]:
# scaling2 - minmax scaler
# Fit on the selected features and wrap the scaled array in a DataFrame
# (default integer index and column labels).

mm_scaler = MinMaxScaler()
mm_data = pd.DataFrame(mm_scaler.fit_transform(selected_df))

In [None]:
# scaling3 - robust scaler
# Fit on the selected features and wrap the scaled array in a DataFrame
# (default integer index and column labels).

r_scaler = RobustScaler()
r_data = pd.DataFrame(r_scaler.fit_transform(selected_df))

In [None]:
# K-means: cluster the standard-scaled features for several values of k,
# store the labels on df, and plot each clustering on the longitude/latitude map.
k_arr = [2, 4, 6, 8, 10]

for k in k_arr:
    model = KMeans(n_clusters = k, random_state=10)

    cluster_name = 'kmeans_cluster' + str(k)
    # fit_predict trains the model and returns the labels in one call; the
    # separate fit() that used to precede it trained every model twice.
    # The labels come back as a plain array, so they are assigned by position.
    df[cluster_name] = model.fit_predict(ss_data)

    plt.figure(figsize = (25, 20))

    for i in range(k):
        # Build the membership mask once and reuse it for both coordinates.
        in_cluster = df[cluster_name] == i
        plt.scatter(df.loc[in_cluster, 'longitude'], df.loc[in_cluster, 'latitude'], label = 'cluster' + str(i))

    plt.legend()
    plt.title('K = %d results'%k)
    plt.xlabel('longitude')
    plt.ylabel('latitude')
    plt.show()

In [None]:
# EM: fit a Gaussian mixture to the standard-scaled features for each k in
# k_arr, store the component labels on df, and plot them on the map.
for k in k_arr:
    gmm = GaussianMixture(n_components=k, random_state=10)
    # fit_predict estimates the mixture and returns the labels in one call; the
    # separate fit() that used to precede it ran the estimation twice.
    gmm_labels = gmm.fit_predict(ss_data)

    cluster_name_gmm = 'gmm_cluster' + str(k)

    df[cluster_name_gmm] = gmm_labels

    plt.figure(figsize = (25, 20))

    for i in range(k):
        # Build the membership mask once and reuse it for both coordinates.
        in_cluster = df[cluster_name_gmm] == i
        plt.scatter(df.loc[in_cluster, 'longitude'], df.loc[in_cluster, 'latitude'], label = 'cluster' + str(i))

    plt.legend()
    plt.title('K = %d results'%k)
    plt.xlabel('longitude')
    plt.ylabel('latitude')
    plt.show()

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
#DBSCAN

def dbscan(data, combination, eps, min_samples):
  """
  Cluster a subset of columns with DBSCAN, plot the result and score it.

  Args:
    data `dataframe`: train data
    combination `list`: list of feature (column labels of `data`) to cluster on
    eps `float`: DBSCAN neighbourhood radius
    min_samples `int`: DBSCAN minimum number of samples per neighbourhood
  Return:
    combination `list`: list of feature, returned unchanged
    score `float`: silhouette score of the clustering, or -1 when every
      sample received the same label (the score is undefined in that case)
  """
  dbsc = DBSCAN(eps= eps, min_samples= min_samples)
  features = data[combination]
  # Cluster only the requested columns. The model used to be fitted on the
  # whole of `data` first, which was wasted work and breaks on non-numeric columns.
  labels = dbsc.fit_predict(features)

  # Attach the labels to a copy so the caller's frame is never written to
  # and pandas does not warn about assigning into a slice.
  featureCom = features.copy()
  featureCom['cluster'] = labels
  sns.pairplot(featureCom, hue='cluster')
  plt.show()
  print(labels)
  if featureCom['cluster'].nunique() == 1:
    return combination, -1
  # Score in the same feature space that was clustered, not on the full frame.
  score = silhouette_score(features, labels, metric='euclidean')

  return combination, score


In [None]:
# Silhouette score measure

from sklearn.metrics import silhouette_score

# Calculate the silhouette score on the standard-scaled data.
# NOTE: `model` is whichever KMeans instance was fitted most recently; when the
# notebook runs top to bottom that is the last model of the K-means loop.
score = silhouette_score(ss_data, model.labels_, metric='euclidean')

# Print the score
print('Silhouette Score: %.3f' % score)

In [None]:
from yellowbrick.cluster import SilhouetteVisualizer

fig, ax = plt.subplots(2, 2, figsize=(15,8))
# Walk the 2x2 grid in row-major order, pairing each panel with a cluster
# count: 2 and 3 on the top row, 4 and 5 on the bottom row.
for n_clusters, panel in zip(range(2, 6), ax.flat):
    # KMeans instance for this number of clusters.
    model = KMeans(n_clusters=n_clusters, random_state=10)
    # Wrap the model in a SilhouetteVisualizer bound to this panel and fit it.
    visualizer = SilhouetteVisualizer(model, colors='yellowbrick', ax=panel)
    visualizer.fit(ss_data)