#### Prepare the data

In [None]:
# import needed libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

In [None]:
# Load the bike data from CSV, parsing the 'datetime' column into timestamps
csv_path = '../../data/bike_dataset_new.csv'
df = pd.read_csv(csv_path, parse_dates=['datetime'])
df

#### Check the number of weather types

In [None]:
# Show every distinct value that occurs in the 'weather' column
weather_column = df['weather']
weather_column.unique()

#### Create a dataframe with weather-related data only ('temp', 'atemp', 'humidity', 'windspeed')

In [None]:
# Select the weather measurements together with the 'weather' column
weather_columns = ['temp', 'atemp', 'humidity', 'windspeed', 'weather']
df_weather = df[weather_columns]
df_weather

#### Check the data distribution and standardize it if needed

In [None]:
# Summary statistics per column, to compare the magnitudes of the features
weather_summary = df_weather.describe()
weather_summary

In [None]:
# The columns differ in magnitude, so rescale them with StandardScaler.
# fit_transform learns the scaling parameters and applies them in one step.
scaler = StandardScaler()
df_weather_scaled = scaler.fit_transform(df_weather)
df_weather_scaled

In [None]:
# Wrap the scaled array in a DataFrame again, reusing the existing column names
scaled_columns = df_weather.columns
df_weather = pd.DataFrame(data=df_weather_scaled, columns=scaled_columns)
df_weather

In [None]:
# Summary statistics of df_weather after the scaled values were written back
scaled_summary = df_weather.describe()
scaled_summary

#### Use the elbow method to determine the number of clusters

In [None]:
# Elbow method: fit K-Means for a range of k and plot the inertia of each fit.
# Cluster only on the four weather measurements. The 'weather' column is the
# label the clusters are compared against later, so it must not be a feature.
feature_cols = ['temp', 'atemp', 'humidity', 'windspeed']
K = range(2, 10)  # candidate numbers of clusters
inertia = []

for k in K:
    kmeans = KMeans(n_clusters=k,
                    random_state=1234,  # fixed seed so every run gives the same fits
                    n_init=10)
    kmeans.fit(df_weather[feature_cols])
    # Inertia measures how well a dataset was clustered by K-Means (lower is tighter)
    inertia.append(kmeans.inertia_)

plt.figure(figsize=(16,8))
plt.plot(K, inertia, 'bx-')  # 'bx-' draws blue x markers joined by a line
plt.xlabel('k')
plt.ylabel('inertia')
# One tick per tested k; np.arange(min(K), max(K)) would drop the largest k
plt.xticks(list(K))
plt.title('Elbow Method showing the optimal k');

You should see a slight elbow at k = 4, which fits our knowledge of the dataset.

### Repeat k-means clustering with k = 4. Visualize the results and compare them with the weather from the `weather` column. Are the results aligned or rather different?

In [None]:
# Fit K-Means with k = 4 and visualize the clusters.
# Only two columns can be shown in a 2-dimensional plot; 'temp' and 'atemp'
# are used here (another pair such as 'atemp' and 'humidity' works the same way).
# Cluster only on the four weather measurements. The 'weather' column is the
# label the clusters are compared against, so it must not be a feature.
feature_cols = ['temp', 'atemp', 'humidity', 'windspeed']
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
kmeans.fit(df_weather[feature_cols])

# Get the cluster labels assigned to each data point
labels = kmeans.labels_

# Plot the data points, coloured by the cluster they were assigned to
plt.scatter(df_weather['temp'], df_weather['atemp'], c=labels)

plt.xlabel('temp')
plt.ylabel('atemp')
plt.title('K-means Clustering Results (k = 4)')

plt.show()


In [None]:
# Pull the columns needed for the weather plot out of df_weather
weather, temp, atemp = (df_weather[name] for name in ('weather', 'temp', 'atemp'))



In [None]:
# Compare the k = 4 clusters (colour) with the recorded weather type (marker).
# Cluster only on the four weather measurements. The 'weather' column is the
# label the clusters are compared against, so it must not be a feature.
feature_cols = ['temp', 'atemp', 'humidity', 'windspeed']
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
kmeans.fit(df_weather[feature_cols])

# Get the cluster labels assigned to each data point
labels = kmeans.labels_

# Take the weather categories from the original dataframe: df_weather was
# standardized, so its 'weather' column no longer holds the category values.
# df_weather was derived from df without dropping or reordering rows, so the
# two are matched by position.
weather = df['weather'].to_numpy()

# Assign one marker symbol to each weather category that occurs in the data
marker_symbols = ['o', 's', '^', 'v', 'D', 'P', 'X', '*']
weather_markers = {
    value: marker_symbols[i % len(marker_symbols)]
    for i, value in enumerate(np.unique(weather))
}

temp = df_weather['temp'].to_numpy()
atemp = df_weather['atemp'].to_numpy()

fig, ax = plt.subplots(figsize=(16, 8))

# One scatter call per weather category instead of one per row. vmin/vmax pin
# the colour scale so a given cluster has the same colour in every call.
for weather_label, marker in weather_markers.items():
    mask = weather == weather_label
    ax.scatter(temp[mask], atemp[mask], c=labels[mask], marker=marker,
               vmin=0, vmax=kmeans.n_clusters - 1)
    # Empty proxy artist: gives the legend exactly one entry per weather type
    ax.scatter([], [], color='gray', marker=marker,
               label=f'weather = {weather_label}')

ax.set_xlabel('temp')
ax.set_ylabel('atemp')
ax.set_title('K-means Clustering Results (k = 4)')
ax.legend()

plt.show()



#### Conclusions:

Write down your thoughts here.