In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
from itertools import cycle, islice

In [2]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the CSV read in the next cell comes from /content/, not from
# under this mount point — confirm whether the mount is still required.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
# Load the weather readings (one row per minute, judging by the timestamps) into a DataFrame.
# NOTE(review): absolute Colab path — the file must already exist at /content/.
data = pd.read_csv('/content/minute_weather.csv')

In [5]:
# (rows, columns) of the raw frame.
data.shape


(52800, 13)

In [6]:
# Preview the first rows to check column names and value formats.
data.head()


Unnamed: 0,rowID,hpwren_timestamp,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,min_wind_direction,min_wind_speed,rain_accumulation,rain_duration,relative_humidity
0,0,2011-09-10 00:00:49,912.3,64.76,97.0,1.2,106.0,1.6,85.0,1.0,,,60.5
1,1,2011-09-10 00:01:49,912.3,63.86,161.0,0.8,215.0,1.5,43.0,0.2,0.0,0.0,39.9
2,2,2011-09-10 00:02:49,912.3,64.22,77.0,0.7,143.0,1.2,324.0,0.3,0.0,0.0,43.0
3,3,2011-09-10 00:03:49,912.3,64.4,89.0,1.2,112.0,1.6,12.0,0.7,0.0,0.0,49.5
4,4,2011-09-10 00:04:49,912.3,64.4,185.0,0.4,260.0,1.0,100.0,0.1,0.0,0.0,58.8


In [7]:
# Down-sample: keep only the rows whose rowID is a multiple of 10.
sampled_df = data.loc[data['rowID'].mod(10).eq(0)]
sampled_df.shape

(5280, 13)

In [8]:
# Summary statistics, laid out with one row per numeric column.
sampled_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
rowID,5280.0,26395.0,15243.490414,0.0,13197.5,26395.0,39592.5,52790.0
air_pressure,5280.0,916.253636,1.617976,910.8,915.1,916.2,917.5,920.1
air_temp,5280.0,68.397739,9.813052,45.68,62.78,70.16,75.74,90.68
avg_wind_direction,5277.0,152.386015,98.888369,0.0,51.0,175.0,213.0,359.0
avg_wind_speed,5277.0,2.243017,1.560148,0.0,1.1,1.9,3.0,23.4
max_wind_direction,5277.0,152.857874,95.424243,0.0,56.0,180.0,219.0,359.0
max_wind_speed,5277.0,2.766648,1.820335,0.1,1.5,2.4,3.7,24.5
min_wind_direction,5277.0,161.459352,102.677449,0.0,59.0,175.0,212.0,359.0
min_wind_speed,5277.0,1.704586,1.339688,0.0,0.8,1.4,2.4,21.6
rain_accumulation,5279.0,3.6e-05,0.001039,0.0,0.0,0.0,0.0,0.04


In [9]:
# Shape of the subset of sampled rows whose rain_accumulation is exactly 0.
sampled_df.loc[sampled_df['rain_accumulation'].eq(0)].shape

(5271, 13)

In [None]:
# Shape of the subset of sampled rows whose rain_duration is exactly 0.
sampled_df.loc[sampled_df['rain_duration'].eq(0)].shape

In [10]:
# Remove the two rain columns: rain_accumulation is zero for nearly every
# sampled row, and rain_duration is dropped alongside it.
# drop() returns a new frame instead of mutating in place, and errors='ignore'
# lets this cell be re-run without raising KeyError once the columns are gone.
sampled_df = sampled_df.drop(
    columns=['rain_accumulation', 'rain_duration'],
    errors='ignore',
)

In [11]:
# Discard every row that still contains a missing value, recording the row
# count on either side so the loss can be reported.
rows_before = len(sampled_df.index)
sampled_df = sampled_df.dropna(axis=0, how='any')
rows_after = len(sampled_df.index)

In [12]:
# Number of rows removed by dropna().
rows_before - rows_after

3

In [13]:
# Columns that remain after the rain columns were removed.
sampled_df.columns

Index(['rowID', 'hpwren_timestamp', 'air_pressure', 'air_temp',
       'avg_wind_direction', 'avg_wind_speed', 'max_wind_direction',
       'max_wind_speed', 'min_wind_direction', 'min_wind_speed',
       'relative_humidity'],
      dtype='object')

In [14]:
# Columns used as clustering inputs.
features = [
    'air_pressure',
    'air_temp',
    'avg_wind_direction',
    'avg_wind_speed',
    'max_wind_direction',
    'max_wind_speed',
    'relative_humidity',
]

In [15]:
# Keep all rows, but only the columns named in `features`.
select_df = sampled_df.loc[:, features]

In [17]:
# Confirm the selection holds exactly the chosen feature columns.
select_df.columns

Index(['air_pressure', 'air_temp', 'avg_wind_direction', 'avg_wind_speed',
       'max_wind_direction', 'max_wind_speed', 'relative_humidity'],
      dtype='object')

In [18]:
# Display the feature frame (the rendered output elides the middle rows).
select_df

Unnamed: 0,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,relative_humidity
0,912.3,64.76,97.0,1.2,106.0,1.6,60.5
10,912.3,62.24,144.0,1.2,167.0,1.8,38.5
20,912.2,63.32,100.0,2.0,122.0,2.5,58.3
30,912.2,62.60,91.0,2.0,103.0,2.4,57.9
40,912.2,64.04,81.0,2.6,88.0,2.9,57.4
...,...,...,...,...,...,...,...
52750,919.1,69.98,354.0,2.0,4.0,2.9,22.5
52760,919.1,70.16,0.0,2.8,15.0,3.3,24.9
52770,919.2,69.98,0.0,2.6,13.0,3.3,22.2
52780,919.2,69.98,10.0,3.2,18.0,3.7,23.1


In [19]:
# Standardise every feature column with StandardScaler: learn the scaling
# parameters from select_df, then apply them to the same frame.
X = StandardScaler().fit(select_df).transform(select_df)
X

array([[-2.4441149 , -0.37203061, -0.56013932, ..., -0.49109443,
        -0.64095812,  0.61298933],
       [-2.4441149 , -0.62902833, -0.08481088, ...,  0.1482167 ,
        -0.5310778 , -0.31987974],
       [-2.50592258, -0.51888645, -0.5297992 , ..., -0.32340627,
        -0.14649668,  0.51970243],
       ...,
       [ 1.82061512,  0.1603218 , -1.5411363 , ..., -1.4657819 ,
         0.29302459, -1.01105092],
       [ 1.82061512,  0.1603218 , -1.44000259, ..., -1.41337934,
         0.51278523, -0.97288809],
       [ 1.8824228 ,  0.1603218 , -1.50068282, ..., -1.48674292,
         0.56772539, -0.95168652]])

In [20]:
# K-means clustering of the standardised features into 12 groups.
# random_state pins the centroid initialisation so that cluster numbering and
# centres are reproducible across kernel restarts; n_init is set explicitly
# because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=12, n_init=10, random_state=42)
model = kmeans.fit(X)
print("model\n", model)

model
 KMeans(n_clusters=12)


In [21]:
# Cluster centres from the fitted model: one row per cluster, one column per
# feature. Values are in the standardised units of X, not the original units.
centers = model.cluster_centers_
centers


array([[-0.68012367,  0.34250462,  0.32963507, -0.66162029,  0.51834175,
        -0.68722994, -0.30370059],
       [-0.7290024 ,  0.16110423, -1.16743896, -0.43667524, -1.06389728,
        -0.48226211, -0.43963038],
       [-1.18170668, -1.58521578,  0.66456306,  1.19547895,  0.8072213 ,
         1.23316579,  1.6454495 ],
       [-0.52658383, -1.53053044,  0.22114709, -0.58354319,  0.35330392,
        -0.59866923,  1.66776231],
       [-0.35537299,  0.91933273, -0.90457927,  1.37299354, -0.83277255,
         1.36691922, -0.99231718],
       [ 0.30232087,  0.19244651,  0.50000104,  0.94524056,  0.62160259,
         0.90525128,  0.13209978],
       [ 0.84909783, -0.44572228, -1.1830416 ,  0.03597503, -1.08008104,
         0.0104301 ,  0.02495039],
       [-2.03415002, -1.69582445,  0.54145002,  3.73986956,  0.69479522,
         3.8015449 ,  1.76957522],
       [ 1.00301714,  0.73659786,  0.30964114, -0.37868642,  0.5352515 ,
        -0.3655104 , -0.59645454],
       [ 0.04575136,  0.3368