In [None]:
# Initial imports
import pandas as pd
import hvplot.pandas
from pathlib import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

### Deliverable 1: Preprocessing the Data for PCA

In [None]:
# Load the costco_data.csv and demographic datasets.
# NOTE(review): the path below is still a template placeholder and must be
# pointed at the real CSV before this cell can run.
file_path = "../Resources/.CSV OR DATABASE CONNECTION/PATH"

# Keep the frame exactly as read from disk under `costco_df`.
costco_df = pd.read_csv(file_path, index_col=0)

# Every later cell operates on the name `coscto_df`, which was never defined.
# Bind it to an independent copy so those cells no longer raise NameError and
# the raw frame above stays untouched by the filtering steps.
coscto_df = costco_df.copy()
coscto_df.head()

In [None]:
# Show the dtype of every column in the working frame.
coscto_df.dtypes

In [None]:
# Keep only the locations that have a hearing center, i.e. discard every row
# whose "? COSTCO LOCATION YES" flag equals False.
# A boolean mask replaces drop(<index labels>, inplace=True): the frame is
# rebound instead of mutated in place, and rows are selected row-by-row rather
# than by index label (a repeated label would otherwise drop extra rows).
has_location = coscto_df["? COSTCO LOCATION YES"] != False  # noqa: E712 (element-wise comparison)
coscto_df = coscto_df.loc[has_location]
coscto_df.shape

In [None]:
# Drop the "? COSTCO LOCATION YES" column from the working frame.
coscto_df = coscto_df.drop(columns=["? COSTCO LOCATION YES"])
coscto_df.shape

In [None]:
# Discard every row that contains one or more missing values.
coscto_df = coscto_df.dropna(axis=0, how="any")
coscto_df.shape

In [None]:
# Keep the rows whose '? AGE COLUMNS' value is different from zero.
# NOTE(review): '? AGE COLUMNS' is a template placeholder for the targeted age column(s).
age_is_nonzero = coscto_df['? AGE COLUMNS'] != 0
coscto_df = coscto_df.loc[age_is_nonzero]
coscto_df.shape

In [None]:
# Keep only the rows whose '? AGE COLUMNS' value is strictly positive.
coscto_df = coscto_df.loc[coscto_df['? AGE COLUMNS'] > 0]
# Report the shape of the frame that was just filtered (this line used to read
# `crypto_df.shape`, a name that is never defined in this notebook).
coscto_df.shape

In [None]:
# Preview the first ten rows of the filtered frame.
coscto_df.head(10)

In [None]:
# Create a new DataFrame that holds only the '?? ZIPCODE' column.
# The column is selected by name rather than by position 0: later cells drop
# and read back a column called '?? ZIPCODE', so a positional pick would
# silently keep the wrong data whenever the zip code is not the first column.
# NOTE(review): '?? ZIPCODE' is a template placeholder for the real column name.
coscto_name_df = coscto_df[['?? ZIPCODE']].copy()
coscto_name_df.head()

In [None]:
# The '?? ZIPCODE' column is not a clustering feature, so remove it from the working frame.
coscto_df = coscto_df.drop(columns=['?? ZIPCODE'])
coscto_df.head()

In [None]:
# One-hot encode the text features with get_dummies().
# NOTE(review): both entries are blank template placeholders; replace them with
# the names of the actual text columns.
text_columns = ["   ", "   "]
X = pd.get_dummies(coscto_df, columns=text_columns)
X.head()

In [None]:
# Standardize the features with StandardScaler().
# 1. Build the scaler.
data_scaler = StandardScaler()

# 2. Learn the scaling parameters from X, then apply them to X.
data_scaler.fit(X)
X = data_scaler.transform(X)

# 3. Show the first five scaled rows.
X[:5]

### Deliverable 2: Reducing Data Dimensions Using PCA

In [None]:
# Use PCA to reduce the scaled data to three principal components.
# 1. Initialize the PCA model with three components.
pca=PCA(n_components=3)

# 2. Fit the model on X and project X onto its three principal components.
X_pca = pca.fit_transform(X)

In [None]:
# Wrap the PCA output in a DataFrame labelled "PC 1".."PC 3", reusing the index
# of coscto_name_df for the rows.
pc_labels = [f"PC {n}" for n in range(1, 4)]
pcs_df = pd.DataFrame(X_pca, index=coscto_name_df.index, columns=pc_labels)
pcs_df.head()

### Deliverable 3: Clustering Costco Locations Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [None]:
# Build an elbow curve to help choose K.
# 1. Candidate values of K
k = list(range(1, 11))

# 2. Inertia of a K-Means model fitted on pcs_df for each candidate K
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(pcs_df).inertia_
    for n_clusters in k
]

# 3. Plot inertia against K
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

Running K-Means with `k=3`

In [None]:
# Initialize the K-Means model.
model = KMeans(n_clusters=3, random_state=0)

# Cluster on the principal-component columns only. This cell adds a "class"
# column to pcs_df below, so fitting on the whole frame would feed the previous
# labels back in as a feature whenever the cell is run a second time.
pc_features = pcs_df[["PC 1", "PC 2", "PC 3"]]

# Fit the model
model.fit(pc_features)

# Predict clusters
predictions = model.predict(pc_features)

# Add the predicted class column
pcs_df["class"] = model.labels_
print(predictions)

In [None]:
# Create a new DataFrame including predicted clusters and coscto features.
# Start from an explicit copy of coscto_df so that adding columns here cannot
# affect coscto_df itself.
clustered_df = coscto_df.copy()

# Bring over the three principal components from pcs_df (aligned on the index).
for pc_column in ("PC 1", "PC 2", "PC 3"):
    clustered_df[pc_column] = pcs_df[pc_column]

# Add a new column, "?? ZIPCODE", holding the zip codes set aside in coscto_name_df.
clustered_df["?? ZIPCODE"] = coscto_name_df["?? ZIPCODE"]

# Add a new column, "Class", holding the K-Means cluster labels.
# (The labels used to be written to "?? ZIPCODE", which erased the zip codes and
# left the "Class" column read by the later plotting cells undefined.)
clustered_df["Class"] = model.labels_

# Print the shape of the clustered_df
print(clustered_df.shape)
clustered_df.head(10)

### Deliverable 4: Visualizing Costco Results

#### 3D-Scatter with Clusters

In [None]:
# Create a 3D scatter of the three principal components, colored and
# symbol-coded by the "Class" column of clustered_df.
# NOTE(review): hover_data holds a blank placeholder column name; replace it
# with the column(s) to show on hover.
fig=px.scatter_3d(clustered_df, x="PC 1", 
                  y="PC 2", z="PC 3", 
                  color="Class", symbol="Class", 
                  hover_name="?? ZIPCODE", hover_data=["    "], 
                  width=800)
# Place the legend at the top-left corner of the figure.
fig.update_layout(legend=dict(x=0, y=1))
fig.show()
                 

In [None]:
# Create a sortable, selectable table of the possible Costco locations.
# NOTE(review): the three blank entries are placeholder column names to fill in.
clustered_df.hvplot.table(columns=['?? ZIPCODE', '  ', '  ', '  ', 'Class'],
                         sortable=True, selectable=True)


In [None]:
# Print the total number of rows in clustered_df (the possible Costco locations).
row_count = clustered_df.shape[0]
print(f"?? There are {row_count} with POSSIBLE COSTCO LOCATION")

In [None]:
# Scale two columns to build the scatter plot of possible Costco locations.
# 1. Build a frame from clustered_df restricted to the two columns to scale,
#    keeping the clustered_df index.
# NOTE(review): both column names are blank placeholders; replace them with the
# real column names.
X = pd.DataFrame(clustered_df, columns=['    ', '    '], index = clustered_df.index)

# 2. Fit a MinMaxScaler on X and transform X with it.
X_scaled = MinMaxScaler().fit_transform(X)

# 3. Display the scaled data.
X_scaled

In [None]:
# Create a new DataFrame that has the scaled data with the clustered_df DataFrame index.
# - The index comes from clustered_df (the frame X_scaled was derived from);
#   the unfiltered costco_df index used before does not match X_scaled row-for-row.
# - The columns keep the names of the columns that were scaled (X.columns)
#   instead of the leftover 'TotalCoinSupply' / 'TotalCoinsMined' labels.
plot_df = pd.DataFrame(X_scaled, columns=X.columns, index=clustered_df.index)

# Add the "?? ZIPCODE" column from the clustered_df DataFrame to the new DataFrame.
plot_df['?? ZIPCODE'] = clustered_df['?? ZIPCODE']

# Add the "Class" column from the clustered_df DataFrame to the new DataFrame.
plot_df['Class'] = clustered_df['Class']

plot_df.head(10)

In [None]:
# Create an hvplot scatter plot of plot_df, one series per "Class" value, with
# the "?? ZIPCODE" column shown on hover.
# NOTE(review): x and y are blank placeholders; set them to the two scaled
# column names of plot_df.
plot_df.hvplot.scatter(x="   ", y="  ", hover_cols=["?? ZIPCODE"], by="Class")
