from sklearn.cluster import KMeans
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans

from sklearn.cluster import AgglomerativeClustering
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html#sklearn.cluster.AgglomerativeClustering


## K-means Clustering
- Select k initial seeds
- Assign each observation to the cluster to which it is "closest"
- Recompute the cluster centroids
- Reassign the observations to one of the clusters according to some rule
- Stop if there is no reallocation.

###### Must set number of clusters at initialization time!
k_means = KMeans(n_clusters=3)

###### Run the clustering algorithm
k_means.fit(some_df)

###### Generate cluster index values for each row in some_df
cluster_assignments = k_means.predict(some_df)

###### Cluster predictions for each fitted point are also stored in the k_means.labels_ attribute

#### Computing Variance Ratios
###### scikit-learn renamed the misspelled calinski_harabaz_score to calinski_harabasz_score; the old name no longer exists in current releases
from sklearn.metrics import calinski_harabasz_score

print(calinski_harabasz_score(some_df, cluster_assignments))
 
###### The higher the score, the better the fit.
###### Note: could also pass in k_means.labels_ instead of cluster_assignments, as they are the same thing
#### Finding the Optimal Value of K by visualizing the scores using an Elbow Plot:
###### Calinski-Harabasz Score
###### scikit-learn renamed the misspelled calinski_harabaz_score to calinski_harabasz_score
from sklearn.metrics import calinski_harabasz_score

###### One score per fitted model in k_list, in the same order as k_list
CH_score = [calinski_harabasz_score(X_2, model.labels_) for model in k_list]

###### plotting CH Score
###### Read the x-axis values from the fitted models instead of hard-coding [3, 4, 5, 6, 7],
###### so the plot cannot fall out of step with k_list (assumes every model exposes n_clusters, as KMeans does)
k_values = [model.n_clusters for model in k_list]

plt.plot(k_values, CH_score)

plt.xticks(k_values)

plt.title("Calinski Harabaz Scores for Different Values of K")

plt.ylabel("Variance Ratio")

plt.xlabel("K=")

plt.show()


## Hierarchical Agglomerative Clustering - HAC
###### As with KMeans above, the number of clusters is fixed when the estimator is created
agg_clust = AgglomerativeClustering(n_clusters=3)

###### Bare expression: presumably here so a notebook cell echoes the estimator's settings
agg_clust

###### Fit on X and get back one cluster label per sample in a single call
assigned_clust = agg_clust.fit_predict(X)

### Dendrogram Plot
from scipy.cluster.hierarchy import dendrogram, ward

###### Ward linkage computed directly on the samples in X
linkage_array = ward(X)

###### Now we plot the dendrogram for the linkage_array containing the distances between clusters
dendrogram(linkage_array)

ax = plt.gca()

###### NOTE(review): bounds is not used again in the visible code; presumably kept for drawing cut lines across the plot
bounds = ax.get_xbound()

plt.xlabel("Sample index")

###### The trailing semicolon presumably suppresses the echoed return value in a notebook cell
plt.ylabel("Cluster distance");


## Clustering Evaluations
from sklearn import metrics
#### Adjusted Rand Index
metrics.adjusted_rand_score(labels_kmeans, y)
###### at most 1 (identical clusterings); close to 0 for random labelling, negative when worse than random
#### Fowlkes-Mallows Score
metrics.fowlkes_mallows_score(labels_kmeans, y)
###### generates number between 0 and 1. Better score as it gets closer to 1
#### Calinski-Harabasz Index
###### scikit-learn spells this calinski_harabasz_score; the older calinski_harabaz_score name no longer exists in current releases
metrics.calinski_harabasz_score(X, labels_kmeans)
###### higher the score, the better
#### Silhouette Coefficient
metrics.silhouette_score(X, labels_kmeans)
###### generates number between -1 and 1. Closer to -1 means incorrect clustering, closer to 1 means each cluster is dense

## Market Segmentation with Clustering

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
np.random.seed(0)
%matplotlib inline


###### load data and explore
###### The path is relative, so the CSV is looked up in the current working directory
raw_df = pd.read_csv('wholesale_customers_data.csv')

raw_df.head()

Now, let's go ahead and store the 'Channel' column in a separate variable, and then drop both the 'Channel' and 'Region' columns. Then, display the head of the new DataFrame to ensure everything worked correctly.

###### set target and features
###### 'Channel' is kept aside; it is the reference labelling the cluster assignments are scored against below
channels = raw_df['Channel']

###### inplace=False makes drop() return a new DataFrame, so raw_df keeps all of its columns
df=raw_df.drop(['Channel', 'Region'], axis = 1, inplace=False) 

df.head()

#### Scale the Dataset
from sklearn.preprocessing import StandardScaler

scale=StandardScaler()

###### fit_transform learns the scaling from df and applies it to df in one step
df_scale=scale.fit_transform(df)

#### Run K-Means with 2 clusters
k_means2=KMeans(n_clusters=2)

k_means2.fit(df_scale)

scaled_preds=k_means2.predict(df_scale)

#### get adjusted rand index
###### The bare name adjusted_rand_score is never imported in this file, so call it through the
###### metrics module imported in the "Clustering Evaluations" section
ajs2=metrics.adjusted_rand_score(channels, scaled_preds)

print(ajs2)

#### Incorporating PCA
from sklearn.decomposition import PCA

###### Keep 4 principal components of the scaled data
pca = PCA(n_components=4)

pca_df = pca.fit_transform(df_scale)

###### Running total of the per-component explained-variance ratios (variance retained by the first 1, 2, 3, 4 components)
np.cumsum(pca.explained_variance_ratio_)

###### rerun KMeans model and adjust PCA n_components until highest evaluation score achieved.

#### Hierarchical Agglomerative Clustering
from sklearn.cluster import AgglomerativeClustering

###### The bare name adjusted_rand_score is never imported in this file, so every score below is
###### computed through the metrics module imported in the "Clustering Evaluations" section
hac = AgglomerativeClustering(n_clusters=2)
###### fitted to PCA data
hac.fit(pca_df)

hac_pca_preds = hac.labels_

metrics.adjusted_rand_score(channels, hac_pca_preds)

hac2 = AgglomerativeClustering(n_clusters=2)
###### fitted to scaled only data
hac2.fit(df_scale)

hac_scaled_preds = hac2.labels_

metrics.adjusted_rand_score(channels, hac_scaled_preds)

hac3 = AgglomerativeClustering(n_clusters=2)
###### fitted to original data
hac3.fit(df)

hac__preds = hac3.labels_

metrics.adjusted_rand_score(channels, hac__preds)

## Distance Function for Manhattan, Euclidean and Minkowski
def distance(a, b, c=2, verbose=True):
    """Return the Minkowski distance of order ``c`` between two vectors.

    ``c=1`` gives the Manhattan distance and ``c=2`` (the default) the
    Euclidean distance.

    Parameters
    ----------
    a, b : sequences of numbers
        The two points; they must have the same length.
    c : positive number, default 2
        Order of the distance.
    verbose : bool, default True
        When true, print which distance is being calculated.

    Returns
    -------
    The distance as a NumPy scalar (``0.0`` for two empty vectors).

    Raises
    ------
    ValueError
        If the vectors differ in length, or if ``c`` is not positive.
    """
    if len(a) != len(b):
        raise ValueError("Both vectors must be of equal length!")
    # The result is raised to the power 1 / c, so c == 0 would divide by zero
    # and a negative order does not describe a distance at all.
    if c <= 0:
        raise ValueError("c must be a positive number!")

    if verbose:
        if c == 1:
            print("Calculating Manhattan Distance:")
        elif c == 2:
            print('Calculating Euclidean Distance:')
        else:
            print("Calcuating Minkowski Distance (c={}):".format(c))

    # Pair the coordinates positionally and sum |a_i - b_i| ** c.
    running_total = sum(
        np.power(np.abs(val_a - val_b), c) for val_a, val_b in zip(a, b)
    )

    return np.power(running_total, 1 / c)

###### Placeholder points: both tuples are empty, so fill in two equal-length coordinate tuples before running.
###### With empty tuples the loop inside distance() never executes and each call below reports a distance of 0.0.
test_point_1 = ()

test_point_2 = ()

###### Default c=2
print(distance(test_point_1, test_point_2))

print(distance(test_point_1, test_point_2, c=1))

print(distance(test_point_1, test_point_2, c=3))

## KNN
from sklearn.neighbors import KNeighborsClassifier
https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier


## KNN raw class creation 
Getting Started
We'll begin this lab by creating our classifier. To keep things simple, we'll be using a helper function from the scipy library to calculate euclidean distance for us--specifically, the euclidean() function from the scipy.spatial.distance module. Import this function in the cell below.

#### Create Helper Function
from scipy.spatial.distance import euclidean as euc

import numpy as np

np.random.seed(0)

#### Create a class called KNN.
class KNN(object):
    """Skeleton of a k-nearest-neighbours classifier.

    Both methods are placeholders: the working implementations are written
    as module-level functions further down this file and attached with
    ``KNN.fit = fit`` and ``KNN.predict = predict``. The placeholders take
    ``self`` and the same parameters as those functions so that they are
    valid instance methods in the meantime.
    """

    def fit(self, X_train, y_train):
        """Placeholder; does nothing and returns None."""

    def predict(self, X_test, k=3):
        """Placeholder; does nothing and returns None."""
#### Completing the fit Method
def fit(self, X_train, y_train):
    """Store the training features and labels on the instance.

    No computation happens here; the data is only kept as ``self.X_train``
    and ``self.y_train``.
    """
    self.X_train, self.y_train = X_train, y_train
    
###### This line updates the knn.fit method to point to the function we've just written
KNN.fit = fit

#### Helper Functions
def _get_distances(self, x):
    """Measure the Euclidean distance from ``x`` to every training row.

    Returns a list of ``(row_index, distance)`` tuples in the order the rows
    appear in ``self.X_train``.
    """
    return [
        (row_index, euc(x, train_row))
        for row_index, train_row in enumerate(self.X_train)
    ]
###### This line attaches the function we just created as a method to our KNN class.
KNN._get_distances = _get_distances

def _get_k_nearest(self, dists, k):
    """Return the ``k`` entries of ``dists`` with the smallest distance.

    ``dists`` holds ``(index, distance)`` pairs. The input is not modified,
    and pairs with equal distances keep their original relative order.
    """
    ranked = list(dists)
    ranked.sort(key=lambda pair: pair[1])
    return ranked[:k]
###### This line attaches the function we just created as a method to our KNN class.
KNN._get_k_nearest = _get_k_nearest

def _get_label_prediction(self, k_nearest):
    """Return the most common training label among the nearest neighbours.

    ``k_nearest`` holds ``(index, distance)`` pairs; each index is looked up
    in ``self.y_train``.

    The vote is counted with ``np.unique`` rather than ``np.bincount``, so
    labels are not limited to non-negative integers: strings and negative
    numbers work too. ``np.unique`` returns the labels sorted, so a tie is
    resolved in favour of the smallest label, as ``bincount``/``argmax`` did
    for integer labels.
    """
    neighbour_labels = [self.y_train[i] for i, _ in k_nearest]
    values, counts = np.unique(neighbour_labels, return_counts=True)
    return values[np.argmax(counts)]
###### This line attaches the function we just created as a method to our KNN class.
KNN._get_label_prediction = _get_label_prediction
Great! Now, we need to complete the predict method. This will be much simpler now that we have some helper functions.

#### Completing the predict Method
def predict(self, X_test, k=3):
    """Predict a label for every item in ``X_test``.

    Each item goes through the three helper methods in turn: distances to
    the training data, the ``k`` closest of those, then the label chosen
    from them. Returns the predictions as a list, in input order.
    """
    def classify(sample):
        neighbours = self._get_k_nearest(self._get_distances(sample), k)
        return self._get_label_prediction(neighbours)

    return [classify(sample) for sample in X_test]
        
KNN.predict = predict

#### Testing Our KNN Classifier with Iris dataset
from sklearn.datasets import load_iris

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

iris = load_iris()

data = iris.data

target = iris.target

###### Hold out 25% of the rows for testing; no random_state is passed to this call
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.25)

knn = KNN()

knn.fit(X_train, y_train)

###### k is not passed, so predict() uses its default of k=3
preds = knn.predict(X_test)

print("Testing Accuracy: {}".format(accuracy_score(y_test, preds)))

## Predicting Titanic Survivors with KNN
import pandas as pd
###### get data
raw_df = pd.read_csv('titanic.csv')

raw_df.head()
#### Preprocessing Our Data
###### drop unnecessary columns
df = raw_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=False)
###### change binary selection for male/female to integer
df.Sex = df.Sex.map({'female': 0, 'male': 1})
###### check for nulls
df.isna().sum()
###### fill null age with median
df.Age = df.Age.fillna(df.Age.median())
###### drop rest of nulls in dataset
df = df.dropna()
##### one hot encode categorical data to convert to numerical
one_hot_df = pd.get_dummies(df)

one_hot_df.head()
###### set target and features 
###### labels must be taken before 'Survived' is dropped from one_hot_df on the next statement
labels = one_hot_df.Survived

one_hot_df.drop('Survived', axis=1, inplace=True)
#### Normalizing Our Data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaled_data = scaler.fit_transform(one_hot_df)
###### recover column names and save as dataframe
###### NOTE(review): no index is passed, so scaled_df gets a fresh default index rather than the row labels left in one_hot_df after dropna()
scaled_df = pd.DataFrame(scaled_data, columns=one_hot_df.columns)

scaled_df.head()

#### Creating Training and Testing Sets
from sklearn.model_selection import train_test_split

###### Split the standardized features (scaled_df), not one_hot_df: the unscaled frame was being passed here,
###### so the normalization done in the previous section was never used by the KNN model
X_train, X_test, y_train, y_test = train_test_split(scaled_df, labels, test_size=0.25)

#### Creating and Fitting our KNN Model
from sklearn.neighbors import KNeighborsClassifier

###### No arguments are passed, so the classifier runs with scikit-learn's default settings
clf1 = KNeighborsClassifier()

clf1.fit(X_train, y_train)

###### NOTE(review): test_preds is not used again in the visible file; presumably meant for print_metrics(y_test, test_preds)
test_preds = clf1.predict(X_test)

#### Evaluate initial model
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

def print_metrics(labels, preds):
    """Print precision, recall, accuracy and F1 for a set of predictions.

    ``labels`` are the true values and ``preds`` the predicted ones. Each
    score is computed and printed in turn, one per line; nothing is returned.
    """
    report = (
        ("Precision Score: {}", precision_score),
        ("Recall Score: {}", recall_score),
        ("Accuracy Score: {}", accuracy_score),
        ("F1 Score: {}", f1_score),
    )
    for template, scorer in report:
        print(template.format(scorer(labels, preds)))
    
###### Note: Overall, f1-score is the most informative about the performance of the model, followed by accuracy. For multicategorical models, accuracy is best.

#### Improving Model Performance (k hyperparameter only raw code)
def find_best_k(X_train, y_train, X_test, y_test, min_k=1, max_k=25):
    """Search for the ``n_neighbors`` value with the highest F1 score.

    A ``KNeighborsClassifier`` is fitted on the training data for every
    second ``k`` from ``min_k`` up to and including ``max_k``, and scored
    with ``f1_score`` on the test data. The winner is printed and returned.

    Returns
    -------
    (best_k, best_score) : tuple
        The best ``k`` and its F1 score. When several ``k`` share the top
        score the smallest one is kept. If no ``k`` scores above zero the
        result is ``(0, 0.0)``.
    """
    best_k = 0
    best_score = 0.0
    # Step of 2: with the default min_k=1 only odd k are tried
    # (presumably to avoid tied votes between two classes).
    # NOTE(review): k is selected on the same test data the score is reported
    # on; a separate validation split would give an unbiased estimate.
    for k in range(min_k, max_k + 1, 2):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        f1 = f1_score(y_test, knn.predict(X_test))
        # Strict comparison keeps the first (smallest) k on ties.
        if f1 > best_score:
            best_k, best_score = k, f1

    print("Best Value for k: {}".format(best_k))
    print("F1-Score: {}".format(best_score))
    return best_k, best_score
find_best_k(X_train, y_train, X_test, y_test)