In [56]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
from itertools import cycle, islice
import matplotlib.pyplot as plt
from pandas.plotting import parallel_coordinates

%matplotlib inline

In [57]:
data = pd.read_csv('../input/minute_weather.csv')

In [58]:
data.shape

In [59]:
# Down-sample: keep only the rows whose rowID is a multiple of 10
sampled_df = data[data['rowID'].mod(10).eq(0)]
sampled_df.shape

In [60]:
sampled_df.describe().transpose()

In [61]:
sampled_df[sampled_df['rain_accumulation'] == 0].shape

In [62]:
sampled_df[sampled_df['rain_duration'] == 0].shape

In [63]:
# Remove the two rain columns from the sampled frame.
# drop(..., errors='ignore') is used instead of `del` so this cell can be
# re-run safely: `del` raises KeyError once the columns are already gone.
sampled_df = sampled_df.drop(
    columns=['rain_accumulation', 'rain_duration'],
    errors='ignore',
)

In [64]:
# Record the row count on either side of dropna() so the number of
# discarded rows can be computed afterwards
rows_before = len(sampled_df)
sampled_df = sampled_df.dropna()
rows_after = len(sampled_df)

In [65]:
rows_before - rows_after

In [66]:
sampled_df.columns

In [67]:
features = ['air_pressure', 'air_temp','relative_humidity']

In [68]:
select_df = sampled_df[features]

In [69]:
select_df.columns

In [70]:
select_df

In [71]:
# Fit a StandardScaler on the selected feature columns and keep the
# transformed values in X (this is what the clustering below is fit on)
X = StandardScaler().fit_transform(select_df)
X

In [72]:
# Fit a 12-cluster k-means model on the scaled features.
# random_state pins the centroid initialisation; without it the cluster
# centers (and therefore every threshold-filtered plot below) change from
# one run of the notebook to the next.
kmeans = KMeans(n_clusters=12, random_state=42)
model = kmeans.fit(X)
print("model\n", model)

In [73]:
# Cluster centers of the fitted model. The model was fit on X, so these
# coordinates are in X's scaled units, not the raw measurement units.
centers = model.cluster_centers_
centers

<p style="font-family: Arial; font-size:2.75em;color:purple; font-weight:bold"><br>

Plots
<br><br></p>


Let us first create some utility functions which will help us in plotting graphs:

In [74]:
# Function that creates a DataFrame with a column for Cluster Number

def pd_centers(featuresUsed, centers):
	"""Build a DataFrame of cluster centers with a trailing 'prediction' column.

	Parameters:
		featuresUsed: iterable of column names, one per coordinate of a center.
		centers: sequence of center coordinates, one entry per cluster.

	Returns:
		A DataFrame with one row per center. The columns are the names in
		featuresUsed followed by 'prediction', which holds the position of
		the center within `centers` as an integer.
	"""
	# Copy the names so the caller's sequence is not modified
	column_labels = [*featuresUsed, 'prediction']

	# Append each center's position to its coordinates to form one table row
	rows = []
	for cluster_id, center in enumerate(centers):
		rows.append(np.append(center, cluster_id))

	frame = pd.DataFrame(rows, columns=column_labels)
	# np.append promotes the id to the coordinates' dtype; cast it back to int
	frame['prediction'] = frame['prediction'].astype(int)
	return frame

In [75]:
# Function that creates Parallel Plots

def parallel_plot(data):
	"""Draw `data` with pandas' parallel_coordinates, using 'prediction' as the class column.

	A new 15x8 figure is created and its y-axis limits are set to [-3, +3]
	before plotting. One color per row of `data` is supplied, repeating the
	sequence b, r, g, y, k as often as needed.
	"""
	palette = ['b', 'r', 'g', 'y', 'k']
	# One entry per row, wrapping around the palette when rows outnumber colors
	my_colors = [palette[i % len(palette)] for i in range(len(data))]

	axis = plt.figure(figsize=(15,8)).gca()
	axis.set_ylim([-3,+3])
	parallel_coordinates(data, 'prediction', color = my_colors, marker='o')

In [76]:
# Tabulate the cluster centers under their feature names, plus a
# 'prediction' column identifying each cluster
P = pd_centers(features, centers)
P

# Dry Days

In [77]:
# Plot only the clusters whose center has relative_humidity below -0.5
parallel_plot(P[P['relative_humidity'] < -0.5])

# Warm Days

In [78]:
# Plot only the clusters whose center has air_temp above 0.5
parallel_plot(P[P['air_temp'] > 0.5])

# Cool Days

In [79]:
# Plot only the clusters whose center has relative_humidity above 0.5 and
# air_temp below 0.5.
# NOTE(review): the temperature cut-off is +0.5, not -0.5, so it is the
# complement of the "warm" filter rather than its mirror image — confirm
# this is the intended definition of "cool".
parallel_plot(P[(P['relative_humidity'] > 0.5) & (P['air_temp'] < 0.5)])