In [None]:
# Initial imports.

import folium
import geopandas as gpd
import numpy as np
import pandas as pd
import pydotplus
import seaborn as sns
from IPython.display import Image
from matplotlib import pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn import metrics
from sklearn import tree
from sklearn.cluster import KMeans, AgglomerativeClustering, Birch
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose

In [None]:
# Train a 4-cluster K-means model (fixed seed) on X_scaled_df, then assign
# every row of that same matrix to its nearest centroid.

model = KMeans(random_state=42, n_init='auto', n_clusters=4).fit(X_scaled_df)
kmeans_predictions = model.predict(X_scaled_df)

print(kmeans_predictions)

In [None]:
# Build a new frame from the encoded data with the K-means labels attached as
# a 'Cluster' column; birdflu_data_encoded itself is left untouched.
clusters_df = birdflu_data_encoded.assign(Cluster=kmeans_predictions)

# Preview the first rows of the labelled frame.
clusters_df.head()

In [None]:
# Scatter 'Flock Size' against 'Sampling Method', colouring each point by its
# K-means cluster label.

clusters_df.plot(
    kind='scatter',
    x='Flock Size',
    y='Sampling Method',
    c='Cluster',
    colormap='winter',
)

In [None]:
# Candidate cluster counts for the elbow analysis: 1 through 10 inclusive.
k = [*range(1, 11)]

# Accumulator for the inertia measured at each candidate k (filled later).
inertia = list()

In [None]:
# Compute the K-means inertia for every candidate k and collect it in `inertia`.
#
# The models are fit on X_scaled_df -- the same matrix the 4-cluster model
# above was trained on. clusters_df is NOT used here because it also carries
# the K-means 'Cluster' label column, which would feed the previous clustering
# result back in as a feature and distort the elbow curve.
#
# `inertia` is rebuilt from scratch so that re-running this cell cannot leave
# it longer than `k`, and a dedicated variable name keeps the fitted 4-cluster
# `model` from being overwritten.
inertia = []
for n_clusters in k:
    elbow_model = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    elbow_model.fit(X_scaled_df)
    inertia.append(elbow_model.inertia_)

In [None]:
# Pair each candidate k with its measured inertia.
elbow_data = dict(k=k, inertia=inertia)

# Tabulate the pairs so they can be plotted as an elbow curve.
df_elbow = pd.DataFrame(data=elbow_data)

# Show the table.
df_elbow

In [None]:
# Draw inertia against k as a line chart, with one x tick per candidate k.
df_elbow.plot(
    kind="line",
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=k,
)


In [None]:
# Report, in percent, how much the inertia shrinks at each step up in k.
k, inertia = elbow_data["k"], elbow_data["inertia"]
for step in range(len(k) - 1):
    before, after = inertia[step], inertia[step + 1]
    percentage_decrease = (before - after) / before * 100
    print(f"Percentage decrease from k={k[step]} to k={k[step + 1]}: {percentage_decrease:.2f}%")

In [None]:
# Fit an AgglomerativeClustering model with five clusters.
agglo_model = AgglomerativeClustering(n_clusters=5)

# Cluster the same matrix the K-means model was trained on (X_scaled_df).
# clusters_df is deliberately not used: it already contains the K-means
# 'Cluster' label column, so fitting on it would leak the K-means result into
# this model and make the two algorithms' segments artificially similar.
agglo_predictions = agglo_model.fit_predict(X_scaled_df)

# Preview the labels assigned to the last ten rows.
agglo_predictions[-10:]

In [None]:
# Fit a Birch model. With n_clusters=None the final global clustering step is
# skipped and the leaf subclusters are returned as-is, so the number of
# clusters is decided by the data (it is NOT fixed at five).
#
# The model is fit on X_scaled_df, the same matrix the K-means model was
# trained on; clusters_df also holds the K-means 'Cluster' label column, which
# would leak the K-means result into this model.
birch_model = Birch(n_clusters=None)
birch_model.fit(X_scaled_df)

# Labels assigned to the training rows.
birch_predictions = birch_model.labels_

# Preview the labels of the last ten rows. display() is required because only
# the final expression of a cell is rendered automatically.
display(birch_predictions[-10:])

# Show how many rows landed in each cluster.
print("\nNumber of points in each cluster:")
print(pd.Series(birch_predictions).value_counts())

In [None]:
# Labels produced by each algorithm, keyed by the column name they are stored under.
segment_labels = {
    "kmeans-segments": kmeans_predictions,
    "agglomerative-segments": agglo_predictions,
    "birch-segments": birch_predictions,
}

# New frame: a copy of clusters_df with one label column per algorithm added.
cluster_predictions_df = clusters_df.assign(**segment_labels)

# Side-by-side preview of the three label columns. display() is required here:
# a bare expression in the middle of a cell is evaluated and then discarded,
# so this preview was never shown before.
display(cluster_predictions_df[list(segment_labels)].head(3))

# Preview the full frame.
cluster_predictions_df.head()

In [None]:
# Scatter 'Days Since First Outbreak' against 'HPAI Strain', coloured by the
# K-means segment of each row.
cluster_predictions_df.plot(
    kind='scatter',
    x='Days Since First Outbreak',
    y='HPAI Strain',
    c='kmeans-segments',
    colormap='viridis',
)

In [None]:
# Scatter 'Days Since First Outbreak' against 'HPAI Strain', coloured by the
# agglomerative segment of each row.
cluster_predictions_df.plot(
    kind='scatter',
    x='Days Since First Outbreak',
    y='HPAI Strain',
    c='agglomerative-segments',
    colormap='viridis',
)

In [None]:
# Scatter 'Days Since First Outbreak' against 'HPAI Strain', coloured by the
# Birch segment of each row.

cluster_predictions_df.plot(
    kind='scatter',
    x='Days Since First Outbreak',
    y='HPAI Strain',
    c='birch-segments',
    colormap='viridis',
)

In [None]:
# One independent, initially empty score list per clustering algorithm.
score_kmeans, score_agglomerative, score_birch = [], [], []

# Cluster counts to evaluate: 2 through 10 inclusive.
k = [*range(2, 11)]

In [None]:
# For every candidate cluster count in `k`, fit each algorithm and record its
# Calinski-Harabasz score (variance ratio criterion).
#
# Fitting and scoring both use X_scaled_df, the matrix the original K-means
# model was trained on. clusters_df is avoided because it also contains the
# K-means 'Cluster' label column, which would leak that result into every
# model and into the score itself.
#
# The score lists are rebuilt from scratch so re-running this cell does not
# append a second set of scores, and the per-iteration models use their own
# names so the earlier `agglo_model` / `birch_model` are not overwritten.
# `metrics` comes from the notebook's import cell (`from sklearn import metrics`).

score_kmeans = []
score_agglomerative = []
score_birch = []

for n_clusters in k:
    # K-means labels and score
    kmeans_labels = KMeans(n_clusters=n_clusters, n_init='auto', random_state=0).fit(X_scaled_df).labels_
    score_kmeans.append(metrics.calinski_harabasz_score(X_scaled_df, kmeans_labels))

    # AgglomerativeClustering labels and score
    agglo_labels = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(X_scaled_df)
    score_agglomerative.append(metrics.calinski_harabasz_score(X_scaled_df, agglo_labels))

    # Birch labels and score
    birch_labels = Birch(n_clusters=n_clusters).fit(X_scaled_df).labels_
    score_birch.append(metrics.calinski_harabasz_score(X_scaled_df, birch_labels))

In [None]:
# Render the three score lists in order: K-means, agglomerative, Birch.
display(score_kmeans, score_agglomerative, score_birch)