# Step 2. World weather data model

## 2) Weather Clustering Model 

### Load data and data preparation 

In [51]:
# Import dependencies
import pandas as pd
import plotly.express as px
import hvplot.pandas
from sklearn.cluster import KMeans
import plotly.express as px
import hvplot.pandas

In [52]:
import pandas as pd
# Load the raw per-city weather table from the CSV export
city_data_df = pd.read_csv(filepath_or_buffer="../WeatherAnalysis/data/cities.csv")

In [87]:
# Preview the first five rows of the raw table
city_data_df.head(n=5)

Unnamed: 0,City_ID,City,Country,Date,Lat,Lng,Max Temp,Humidity,Cloudiness,Wind Speed,Current Description
0,0,Bambous Virieux,MU,2021-08-27 22:08:43,-20.3428,57.7575,71.22,90,95,22.97,light rain
1,1,Khatanga,RU,2021-08-27 22:08:44,71.9667,102.5,35.96,96,100,22.3,moderate rain
2,2,Hobart,AU,2021-08-27 22:08:44,-42.8794,147.3294,44.11,97,75,3.0,broken clouds
3,3,Ati,TD,2021-08-27 22:08:45,13.2154,18.3353,88.47,48,37,4.25,scattered clouds
4,4,Vaini,TO,2021-08-27 22:08:45,-21.2,-175.2,69.96,78,40,8.05,scattered clouds


In [108]:
# Select the four numeric weather features used for clustering.
# (No train/test split happens here: the whole table is kept.)
# .copy() makes new_city_df an independent frame, so columns added to it in
# later cells cannot touch city_data_df or raise SettingWithCopyWarning.
new_city_df = city_data_df.loc[:,['Max Temp','Humidity','Cloudiness','Wind Speed']].copy()

# Save the selected features to CSV without the index column
# NOTE(review): the input is read from "../WeatherAnalysis/data/" but this
# writes to "./data/" - confirm the output directory is the intended one.
output_file_path = "data/new_cities.csv"
new_city_df.to_csv(output_file_path, index=False)
new_city_df.head()

Unnamed: 0,Max Temp,Humidity,Cloudiness,Wind Speed
0,71.22,90,95,22.97
1,35.96,96,100,22.3
2,44.11,97,75,3.0
3,88.47,48,37,4.25
4,69.96,78,40,8.05


In [78]:
# Inspect the dtype of each selected feature column
new_city_df.dtypes

Max Temp      float64
Humidity        int64
Cloudiness      int64
Wind Speed    float64
dtype: object

In [79]:
# Report how many missing values each column contains
null_counts = new_city_df.isnull().sum()
for column, null_total in null_counts.items():
    print(f"{column} has {null_total} null values")

Max Temp has 0 null values
Humidity has 0 null values
Cloudiness has 0 null values
Wind Speed has 0 null values


In [80]:
# Count rows of new_city_df that exactly repeat an earlier row
duplicate_count = new_city_df.duplicated().sum()
print(f" Duplicate entries: {duplicate_count}")

 Duplicate entries: 0


### Clustering Model using K-Means

In [81]:
# Function to cluster and plot dataset
def test_cluster_amount(df, clusters):
   model = KMeans(n_clusters=clusters, random_state=5)
   # Fitting model
   model.fit(df)
   # Get the prediction 
   predictions = model.predict(df)

In [82]:
# Initialize model with k=2 (since there's distinctive differences for north and south hemisphere) 
test_cluster_amount(new_city_df, 2)
# Add a new coord column to df
new_city_df["Coord"] = model.labels_
new_city_df.hvplot.scatter(x="Max Temp", y="Humidity", by="Coord")

In [84]:
# 3-D scatter of the current "Coord" cluster labels over three features.
# Colour and marker symbol both encode the cluster; marker size follows Max Temp.
fig = px.scatter_3d(
    new_city_df,
    x="Max Temp",
    y="Humidity",
    z="Cloudiness",
    color="Coord",
    symbol="Coord",
    size="Max Temp",
    width=800,
)
# Position the legend at x=0, y=1
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()

In [91]:
# Initialize model with k=3
# Fit on the weather features only: "Coord" (if present) holds labels written
# by an earlier clustering cell and must not be fed back in as a feature.
features_df = new_city_df.drop(columns="Coord", errors="ignore")
model = KMeans(n_clusters=3, random_state=5)
# Fitting model
model.fit(features_df)
# Get the prediction
predictions = model.predict(features_df)
# Replace the cluster label column with the k=3 assignment
new_city_df["Coord"] = model.labels_

In [94]:
# 3-D scatter of the current "Coord" cluster labels over three features.
# Colour and marker symbol both encode the cluster; marker size follows Max Temp.
fig = px.scatter_3d(
    new_city_df,
    x="Max Temp",
    y="Humidity",
    z="Cloudiness",
    color="Coord",
    symbol="Coord",
    size="Max Temp",
    width=800,
)
# Position the legend at x=0, y=1
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()

In [102]:
# Initialize model with k=4
# Fit on the weather features only: "Coord" (if present) holds labels written
# by an earlier clustering cell and must not be fed back in as a feature.
features_df = new_city_df.drop(columns="Coord", errors="ignore")
model = KMeans(n_clusters=4, random_state=5)
# Fitting model
model.fit(features_df)
# Get the prediction
predictions = model.predict(features_df)
# Replace the cluster label column with the k=4 assignment
new_city_df["Coord"] = model.labels_

In [105]:
# 3-D scatter of the current "Coord" cluster labels over three features.
# Colour and marker symbol both encode the cluster; marker size follows Max Temp.
fig = px.scatter_3d(
    new_city_df,
    x="Max Temp",
    y="Humidity",
    z="Cloudiness",
    color="Coord",
    symbol="Coord",
    size="Max Temp",
    width=800,
)
# Position the legend at x=0, y=1
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()

In [99]:
# Initialize model with k=5
# Fit on the weather features only: "Coord" (if present) holds labels written
# by an earlier clustering cell and must not be fed back in as a feature.
features_df = new_city_df.drop(columns="Coord", errors="ignore")
model = KMeans(n_clusters=5, random_state=5)
# Fitting model
model.fit(features_df)
# Get the prediction
predictions = model.predict(features_df)
# Replace the cluster label column with the k=5 assignment
new_city_df["Coord"] = model.labels_

In [100]:
# 3-D scatter of the current "Coord" cluster labels over three features.
# Colour and marker symbol both encode the cluster; marker size follows Max Temp.
fig = px.scatter_3d(
    new_city_df,
    x="Max Temp",
    y="Humidity",
    z="Cloudiness",
    color="Coord",
    symbol="Coord",
    size="Max Temp",
    width=800,
)
# Position the legend at x=0, y=1
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()

### Plot the elbow curve to decide the best number of clusters

In [111]:
# Store values in inertia
inertia = []
k = list(range(1, 11))

# Evaluate on the weather features only: a "Coord" label column left behind by
# an earlier clustering cell would otherwise be treated as a feature and
# distort the inertia values.
elbow_features = new_city_df.drop(columns="Coord", errors="ignore")

# Looking for the best K: record the fitted model's inertia_ for each k
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(elbow_features)
    inertia.append(km.inertia_)

In [112]:
# Pair each tested k with its inertia and draw the elbow curve with hvPlot
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(data=elbow_data)
df_elbow.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=k,
)

### Next step: Principal component analysis to reduce the number of features