In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [2]:
df = pd.read_csv('minute_weather.csv')
df.head()

Unnamed: 0,rowID,hpwren_timestamp,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,min_wind_direction,min_wind_speed,rain_accumulation,rain_duration,relative_humidity
0,0,2011-09-10 00:00:49,912.3,64.76,97.0,1.2,106.0,1.6,85.0,1.0,,,60.5
1,1,2011-09-10 00:01:49,912.3,63.86,161.0,0.8,215.0,1.5,43.0,0.2,0.0,0.0,39.9
2,2,2011-09-10 00:02:49,912.3,64.22,77.0,0.7,143.0,1.2,324.0,0.3,0.0,0.0,43.0
3,3,2011-09-10 00:03:49,912.3,64.4,89.0,1.2,112.0,1.6,12.0,0.7,0.0,0.0,49.5
4,4,2011-09-10 00:04:49,912.3,64.4,185.0,0.4,260.0,1.0,100.0,0.1,0.0,0.0,58.8


In [3]:
df.shape
print(df.columns)

Index(['rowID', 'hpwren_timestamp', 'air_pressure', 'air_temp',
       'avg_wind_direction', 'avg_wind_speed', 'max_wind_direction',
       'max_wind_speed', 'min_wind_direction', 'min_wind_speed',
       'rain_accumulation', 'rain_duration', 'relative_humidity'],
      dtype='object')


In [4]:
df.shape

(1587257, 13)

In [5]:
df.isna().sum()

rowID                   0
hpwren_timestamp        0
air_pressure            0
air_temp                0
avg_wind_direction    433
avg_wind_speed        433
max_wind_direction    433
max_wind_speed        433
min_wind_direction    433
min_wind_speed        433
rain_accumulation       1
rain_duration           1
relative_humidity       0
dtype: int64

In [6]:
#since we are attempting a k-means clustering through an iterative process, it is important to find the optimal
#positioning of the initial centroids to allow the k-means algorithm to find convergence
#so to do this, instead of working with the entire dataset, we draw up a sample and run short centroid 
#initialisation runs of randomly initialised centroids and track improvements in the metric
#taking a sample from every 10th row we create a new sample df

sample_df = df[(df['rowID'] % 10) == 0]
sample_df.shape

(158726, 13)

In [7]:
sample_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
rowID,158726.0,793625.0,458203.937509,0.0,396812.5,793625.0,1190437.5,1587250.0
air_pressure,158726.0,916.830161,3.051717,905.0,914.8,916.7,918.7,929.5
air_temp,158726.0,61.851589,11.833569,31.64,52.7,62.24,70.88,99.5
avg_wind_direction,158680.0,162.1561,95.278201,0.0,62.0,182.0,217.0,359.0
avg_wind_speed,158680.0,2.775215,2.057624,0.0,1.3,2.2,3.8,31.9
max_wind_direction,158680.0,163.462144,92.452139,0.0,68.0,187.0,223.0,359.0
max_wind_speed,158680.0,3.400558,2.418802,0.1,1.6,2.7,4.6,36.0
min_wind_direction,158680.0,166.774017,97.441109,0.0,76.0,180.0,212.0,359.0
min_wind_speed,158680.0,2.134664,1.742113,0.0,0.8,1.6,3.0,31.6
rain_accumulation,158725.0,0.000318,0.011236,0.0,0.0,0.0,0.0,3.12


In [8]:
#dropping rain_accumulation and rain_duration columns

sample_df[sample_df['rain_accumulation'] == 0].shape

(157812, 13)

In [9]:
sample_df[sample_df['rain_duration'] == 0].shape

(157237, 13)

In [10]:
#dropping all the rows with empty rain_duration and rain_accumulation

del sample_df['rain_accumulation']
del sample_df['rain_duration']

In [11]:
df1 = sample_df.dropna()

In [12]:
print(df1.columns)

Index(['rowID', 'hpwren_timestamp', 'air_pressure', 'air_temp',
       'avg_wind_direction', 'avg_wind_speed', 'max_wind_direction',
       'max_wind_speed', 'min_wind_direction', 'min_wind_speed',
       'relative_humidity'],
      dtype='object')


In [13]:
#selecting columns we would like to cluster
cols_of_interest = ['air_pressure', 'air_temp', 'avg_wind_direction', 'avg_wind_speed','max_wind_direction',
                    'max_wind_speed', 'relative_humidity']

In [14]:
data = df1[cols_of_interest]
data.head()

Unnamed: 0,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,relative_humidity
0,912.3,64.76,97.0,1.2,106.0,1.6,60.5
10,912.3,62.24,144.0,1.2,167.0,1.8,38.5
20,912.2,63.32,100.0,2.0,122.0,2.5,58.3
30,912.2,62.6,91.0,2.0,103.0,2.4,57.9
40,912.2,64.04,81.0,2.6,88.0,2.9,57.4


In [15]:
data.shape

(158680, 7)

In [16]:
# since the values are across multiple scales, we first need to normalise them 
# scaling is also important from a clustering perspective as the distance between points is what affects
# our clusters

X = StandardScaler().fit_transform(data)
X

array([[-1.48456281,  0.24544455, -0.68385323, ..., -0.62153592,
        -0.74440309,  0.49233835],
       [-1.48456281,  0.03247142, -0.19055941, ...,  0.03826701,
        -0.66171726, -0.34710804],
       [-1.51733167,  0.12374562, -0.65236639, ..., -0.44847286,
        -0.37231683,  0.40839371],
       ...,
       [-0.30488381,  1.15818654,  1.90856325, ...,  2.0393087 ,
        -0.70306017,  0.01538018],
       [-0.30488381,  1.12776181,  2.06599745, ..., -1.67073075,
        -0.74440309, -0.04948614],
       [-0.30488381,  1.09733708, -1.63895404, ..., -1.55174989,
        -0.62037434, -0.05711747]])

K-means clustering

In [79]:
#using KMeans clustering for n = 10

k_means = KMeans(n_clusters=5)
model = k_means.fit(X)
model

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=5, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [80]:
centers = model.cluster_centers_
centers

array([[ 1.28996614, -0.20062798, -1.11475165,  1.66442517, -1.01172211,
         1.76193986, -1.10422665],
       [-0.18828935, -0.99481577,  0.43717222, -0.37173136,  0.42273073,
        -0.3643798 ,  1.17375491],
       [-0.93373555, -0.59659884,  0.40966414,  1.45918021,  0.50421556,
         1.41076221,  0.75220196],
       [ 0.03820476,  0.71608084,  0.71386961, -0.35222427,  0.55603731,
        -0.3495922 , -0.52243663],
       [ 0.25848765,  0.27477416, -1.25076308, -0.52499842, -1.10752747,
        -0.53621634, -0.39406384]])

In [56]:
#generate cluster predictions with k-means and store in y_hat
# y_hat = k_means.predict(X)
# y_hat

In [57]:
#calculating the silhouette coefficient

# from sklearn import metrics
# labels = k_means.labels_

# metrics.silhouette_score(X, labels, metric = 'euclidean')

In [58]:
#calculating k_means for n=12

# k_means_12 = KMeans(n_clusters=12)
# model = k_means_12.fit(X)
# y_hat_12 = k_means_12.predict(X)
# model

In [59]:
#calculating CH score for both the above models

# metrics.calinski_harabasz_score(X, labels)

In [60]:
# labels_12 = k_means_12.labels_
# metrics.silhouette_score(X, labels_12, metric='euclidean')

In [61]:
# metrics.calinski_harabasz_score(X, labels_12)

In [62]:
# k_means_6 = KMeans(n_clusters=6)
# model = k_means_6.fit(X)
# y_hat_6 = k_means_6.predict(X)
# model

In [63]:
# labels_6 = k_means_6.labels_
# metrics.silhouette_score(X, labels_6, metric = 'euclidean')

In [64]:
# metrics.calinski_harabasz_score(X, labels_6)

In [65]:
# k_means_8 = KMeans(n_clusters=8)
# model = k_means_8.fit(X)
# y_hat_8 = k_means_8.predict(X)
# model

In [66]:
# labels_8 = k_means_8.labels_
# metrics.silhouette_score(X, labels_8, metric = 'euclidean')

In [67]:
# metrics.calinski_harabasz_score(X, labels_8)

In [68]:
# k_means_5 = KMeans(n_clusters=5)
# model = k_means_5.fit(X)
# y_hat_5 = k_means_5.predict(X)
# model

In [69]:
# labels_5 = k_means_5.labels_
# metrics.silhouette_score(X, labels_5, metric = 'euclidean')

In [70]:
# metrics.calinski_harabasz_score(X, labels_5)

In [71]:
#for each value of k, we can initialise k_means and use inertia to identify the sum of 
#squared distances of samples to the nearest cluster centre

# sum_of_squared_distances = []
# K = range(1,15)
# for k in K:
#     k_means = KMeans(n_clusters=k)
#     model = k_means.fit(X)
#     sum_of_squared_distances.append(k_means.inertia_)

In [72]:
# plt.plot(K, sum_of_squared_distances, 'bx-')
# plt.xlabel('k')
# plt.ylabel('sum_of_squared_distances')
# plt.title('elbow method for optimal k')
# plt.show()

In [73]:
# #we can run a PCA on this
# from sklearn.decomposition import PCA
# pca = PCA(n_components=1).fit(X)
# pca_d = pca.transform()
# pca_c = pca.transform(X)

In [81]:
#function that creates a dataframe with a column for cluster number

def pd_centers(cols_of_interest, centers):
        colNames = list(cols_of_interest)
        colNames.append('prediction')

        # Zip with a column called 'prediction' (index)
        Z = [np.append(A, index) for index, A in enumerate(centers)]

        # Convert to pandas data frame for plotting
        P = pd.DataFrame(Z, columns=colNames)
        P['prediction'] = P['prediction'].astype(int)
        return P

In [82]:
P = pd_centers(cols_of_interest, centers)
P

Unnamed: 0,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,relative_humidity,prediction
0,1.289966,-0.200628,-1.114752,1.664425,-1.011722,1.76194,-1.104227,0
1,-0.188289,-0.994816,0.437172,-0.371731,0.422731,-0.36438,1.173755,1
2,-0.933736,-0.596599,0.409664,1.45918,0.504216,1.410762,0.752202,2
3,0.038205,0.716081,0.71387,-0.352224,0.556037,-0.349592,-0.522437,3
4,0.258488,0.274774,-1.250763,-0.524998,-1.107527,-0.536216,-0.394064,4
