<a href="https://www.kaggle.com/code/ibrahimawad02/clustering-methods-in-different-data?scriptVersionId=142917575" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Using Clustering methods on FIFA 18 sample data
<div class="alert alert-block alert-info" style="font-size:24px; font-family:arial;">
    Presented by Ibrahim Hossam
</div>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
from numpy import unique
from numpy import where
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

In [None]:
df = pd.read_csv("/kaggle/input/fifa-18-sample-data/fifa_18_sample_data.csv")
df

In [None]:
df.drop('club_logo', axis = 1, inplace = True)

In [None]:
df.columns.to_list()


In [None]:
df.isnull().sum().to_list()

In [None]:
df[df['gk'].isnull()==True]

In [None]:
df[df['club'].isnull()==True]


In [None]:
df[df['eur_release_clause'].isnull()==True]

#### From the previous cells we can see that the null values are in the GK column for the outfield (pitch) players and in the other position columns for the goalkeepers, so we can fill these nulls with 0, since those players are not rated in that position. The club and league nulls belong to players whose clubs are not verified in the game, so we can fill these values with 'no club'. The release clause could be imputed with KNN, but that would take a long time because of the many features, so we fill it with the player's value instead, because the release clause is usually equal to or higher than the player's value.

In [None]:
# Impute a missing release clause with the player's market value.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the chained in-place form operates on a temporary Series and does
# not update df under pandas Copy-on-Write (it already warns in pandas 2.x).
df['eur_release_clause'] = df['eur_release_clause'].fillna(df['eur_value'])
# Sanity check: this should now display an empty frame.
df[df['eur_release_clause'].isnull()]

In [None]:
# Players without a club get the placeholder 'no club'.
# Assign back rather than using the chained fillna(inplace=True), which does not
# modify df under pandas Copy-on-Write (and warns in pandas 2.x).
df['club'] = df['club'].fillna('no club')
# Sanity check: this should now display an empty frame.
df[df['club'].isnull()]

In [None]:
# Players without a league get the same 'no club' placeholder used for the club column.
# Assign back rather than using the chained fillna(inplace=True), which does not
# modify df under pandas Copy-on-Write (and warns in pandas 2.x).
df['league'] = df['league'].fillna('no club')
# Sanity check: this should now display an empty frame.
df[df['league'].isnull()]

In [None]:
df.fillna(0, inplace = True)

In [None]:
df.isnull().sum().to_list()

In [None]:
df[['ID', 'flag', 'special', 'real_face', 'photo', 'birth_date', 'full_name', 'name']]

In [None]:
# We can drop these columns because they don't add information for clustering; birth date and name are redundant since we already have age and full name
# We also don't need these columns when we do PCA for clustering
df.drop(['ID', 'flag', 'real_face', 'photo', 'birth_date', 'name'], axis = 1, inplace = True)

In [None]:
preferred_position_columns = ['prefers_rs',
 'prefers_rw',
 'prefers_rf',
 'prefers_ram',
 'prefers_rcm',
 'prefers_rm',
 'prefers_rdm',
 'prefers_rcb',
 'prefers_rb',
 'prefers_rwb',
 'prefers_st',
 'prefers_lw',
 'prefers_cf',
 'prefers_cam',
 'prefers_cm',
 'prefers_lm',
 'prefers_cdm',
 'prefers_cb',
 'prefers_lb',
 'prefers_lwb',
 'prefers_ls',
 'prefers_lf',
 'prefers_lam',
 'prefers_lcm',
 'prefers_ldm',
 'prefers_lcb',
 'prefers_gk']   #all relevant position columns

In [None]:
def extract_true_preferred_position(row, columns=None):
    """Return the name of the first preferred-position column that is True for a row.

    Parameters:
        row     : mapping-like object indexable by column name (e.g. a DataFrame
                  row passed by ``df.apply(..., axis=1)``).
        columns : optional iterable of column names to scan, in priority order.
                  Defaults to the notebook-level ``preferred_position_columns``
                  list, so existing ``df.apply`` calls keep working unchanged.

    Returns:
        The first column name whose value equals True, or None when no column
        in ``columns`` is flagged.
    """
    if columns is None:
        # Resolved at call time so the notebook-level list can be redefined.
        columns = preferred_position_columns

    for column in columns:
        # Equality (not identity) on purpose: pandas hands over numpy booleans,
        # for which ``is True`` would be False.
        if row[column] == True:  # noqa: E712
            return column
    return None  # No preferred position flagged for this row

df['True_Preferred_Position'] = df.apply(extract_true_preferred_position, axis=1)
df

In [None]:
df.drop(preferred_position_columns, axis = 1, inplace = True)
df

In [None]:
label_encoder = LabelEncoder()

In [None]:
df['True_Preferred_Position'] = label_encoder.fit_transform(df['True_Preferred_Position'])

In [None]:
x = df.loc[:,['special', 'eur_wage']].values
x

In [None]:
# Using the elbow method to find the optimal number of clusters.
# KMeans is already imported in the first cell, so it is not re-imported here.
wcss = []            ## within-cluster sum of squares (inertia) for each k
cluster_range = range(1, 11)
for i in cluster_range:
    # Fixed seed so the elbow curve is reproducible and consistent with the
    # final model, which is also fitted with random_state = 49.
    kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 49)
    kmeans.fit(x)
    print('Cost_Function=',kmeans.inertia_,'with', i, 'Clusters')
    wcss.append(kmeans.inertia_)

plt.plot(cluster_range, wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
kmeans = KMeans(n_clusters = 5, init = 'k-means++', random_state = 49)
y_kmeans = kmeans.fit_predict(x)

In [None]:
# Visualising the clusters: one scatter call per cluster label, then the centroids on top
cluster_styles = [
    ('red', 'Cluster 1'),
    ('blue', 'Cluster 2'),
    ('green', 'Cluster 3'),
    ('cyan', 'Cluster 4'),
    ('magenta', 'Cluster 5'),
]
for cluster_id, (colour, legend_name) in enumerate(cluster_styles):
    in_cluster = (y_kmeans == cluster_id)
    plt.scatter(x[in_cluster, 0], x[in_cluster, 1], s = 100, c = colour, label = legend_name)

centres = kmeans.cluster_centers_
plt.scatter(centres[:, 0], centres[:, 1], s = 300, c = 'yellow', label = 'Centroids')
plt.title('Clusters of footballers')
plt.xlabel('Special')
plt.ylabel('Wage in Euros')
plt.legend()
plt.show()

***
# Hierarchical Clustering

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering

### Different linkage methods
- <span style = 'font-size:18px;'> Single Linkage.    
- <span style = 'font-size:18px;'> Average Linkage.    
- <span style = 'font-size:18px;'> Complete Linkage.    
- <span style = 'font-size:18px;'> Ward's Linkage.    
- <span style = 'font-size:18px;'> Centroid Linkage (UPGMC).

In [None]:
SC = StandardScaler()

In [None]:
xc = SC.fit_transform(x)

In [None]:
linked = linkage(xc, method='single')

In [None]:
plt.figure(figsize=(20, 8))
dendrogram(linked, orientation='top', distance_sort='descending', show_leaf_counts=True)
plt.title('Hierarchical Clustering Dendrogram - Single Linkage')
plt.show()

In [None]:
# Fit on the standardized features (xc): the single-linkage dendrogram above was
# built from xc, so the flat clusters must come from the same scaled data.
HCS = AgglomerativeClustering(n_clusters=5, linkage='single')
Clt = HCS.fit_predict(xc)

In [None]:
Clt # points clusters

In [None]:
# Column 0 of x is 'special' and column 1 is 'eur_wage', so the axes are labelled accordingly.
plt.scatter(x[:, 0], x[:, 1], c=Clt, cmap='cividis')
plt.title("HC - Single - Clustering")
plt.xlabel('Special')
plt.ylabel('Wage in Euros')
plt.show()

In [None]:
linked = linkage(xc, method='average')

In [None]:
plt.figure(figsize=(20, 8))
dendrogram(linked, orientation='top', distance_sort='descending', show_leaf_counts=True)
plt.title('Hierarchical Clustering Dendrogram - Average Linkage')
plt.show()

In [None]:
# Fit on the standardized features (xc): the average-linkage dendrogram above was
# built from xc, so the flat clusters must come from the same scaled data.
HCA = AgglomerativeClustering(n_clusters=5, linkage='average')
Clt = HCA.fit_predict(xc)

In [None]:
Clt # points clusters

In [None]:
# Column 0 of x is 'special' and column 1 is 'eur_wage', so the axes are labelled accordingly.
plt.scatter(x[:, 0], x[:, 1], c=Clt, cmap='inferno')
plt.title("HC - Average - Clustering")
plt.xlabel('Special')
plt.ylabel('Wage in Euros')
plt.show()

In [None]:
linked = linkage(xc, method='complete')

In [None]:
plt.figure(figsize=(20, 8))
dendrogram(linked, orientation='top', distance_sort='descending', show_leaf_counts=True)
plt.title('Hierarchical Clustering Dendrogram - Complete Linkage')
plt.show()

In [None]:
# Fit on the standardized features (xc): the complete-linkage dendrogram above was
# built from xc, so the flat clusters must come from the same scaled data.
HCC = AgglomerativeClustering(n_clusters=5, linkage='complete')
Clt = HCC.fit_predict(xc)

In [None]:
Clt # points clusters

In [None]:
# Column 0 of x is 'special' and column 1 is 'eur_wage', so the axes are labelled accordingly.
plt.scatter(x[:, 0], x[:, 1], c=Clt, cmap='RdBu')
plt.title("HC - Complete - Clustering")
plt.xlabel('Special')
plt.ylabel('Wage in Euros')
plt.show()

In [None]:
linked = linkage(xc, method='ward')

In [None]:
plt.figure(figsize=(20, 8))
dendrogram(linked, orientation='top', distance_sort='descending', show_leaf_counts=True)
plt.title('Hierarchical Clustering Dendrogram - Ward Linkage')
plt.show()

In [None]:
# Fit on the standardized features (xc): the Ward dendrogram above was
# built from xc, so the flat clusters must come from the same scaled data.
HCW = AgglomerativeClustering(n_clusters=5, linkage='ward')
Clt = HCW.fit_predict(xc)

In [None]:
Clt # points clusters

In [None]:
# Column 0 of x is 'special' and column 1 is 'eur_wage', so the axes are labelled accordingly.
plt.scatter(x[:, 0], x[:, 1], c=Clt, cmap='coolwarm')
plt.title("HC - Ward - Clustering")
plt.xlabel('Special')
plt.ylabel('Wage in Euros')
plt.show()

In [None]:
linked = linkage(xc, method='centroid')

In [None]:
plt.figure(figsize=(20, 8))
dendrogram(linked, orientation='top', distance_sort='descending', show_leaf_counts=True)
plt.title('Hierarchical Clustering Dendrogram - Centroid Linkage')
plt.show()

In [None]:
# AgglomerativeClustering rejects linkage='centroid' when fitting. Catch only the
# resulting ValueError (scikit-learn's parameter-validation error is a ValueError
# subclass) and report the real reason, instead of swallowing every exception
# behind a misleading message.
HCCe = AgglomerativeClustering(n_clusters=5, linkage='centroid')
try:
    # Standardized features, matching the centroid dendrogram built from xc.
    Clt = HCCe.fit_predict(xc)
except ValueError as err:
    print(f'AgglomerativeClustering does not support centroid linkage: {err}')

### scikit-learn's `AgglomerativeClustering` does not implement centroid linkage (only ward, complete, average and single), so there is no cluster visualization for it.

***
# DBSCAN

In [None]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs

In [None]:
def dbscan_grid_search(X_data, model_list, clst_count, eps_space = 0.5,
                       min_samples_space = 5, min_clust = 2, max_clust = 10):

    """
Performs a hyperparameter grid search for DBSCAN and plots every fit.

For each (eps, min_samples) combination a DBSCAN model is fitted on X_data,
the resulting labelling is drawn as a scatter plot of the first two columns,
and combinations whose cluster count lies in [min_clust, max_clust] are
recorded in the caller-supplied lists.

Parameters:
    * X_data            = 2-D array used to fit the DBSCAN instance (indexed as X_data[:, 0] / X_data[:, 1] for plotting)
    * model_list        = list that receives [eps, min_samples, n_clusters, noise_percent] for every accepted combination (appended in place)
    * clst_count        = list that receives a Counter of label -> number of points for every accepted combination (appended in place)
    * eps_space         = a single eps value or an iterable of eps values to try
    * min_samples_space = an int N, meaning min_samples in range(1, N), or an iterable of explicit min_samples values
    * min_clust         = the minimum number of clusters a result needs in order to be appended to the lists
    * max_clust         = the maximum number of clusters a result may have in order to be appended to the lists

Returns:
    None. Results are delivered through model_list and clst_count.


Example:

# Inputting function parameters
dbscan_grid_search(X_data = dbscan_X_scaled,
                   model_list = dbscan_models,
                   clst_count = cluster_count,
                   eps_space = np.arange(0.1, 5, 0.1),
                   min_samples_space = np.arange(1, 50, 1),
                   min_clust = 3,
                   max_clust = 6)

"""

    # Importing counter to count the amount of data in each cluster
    from collections import Counter


    # Normalising the search spaces: a scalar eps becomes a one-element grid,
    # and an integer min_samples_space keeps its historical meaning of
    # "try every value in range(1, N)".
    eps_values = np.atleast_1d(eps_space)
    if np.ndim(min_samples_space) == 0:
        samples_values = range(1, int(min_samples_space))
    else:
        samples_values = [int(value) for value in min_samples_space]


    # Starting a tally of total iterations.
    # model_list is NOT re-created here: rebinding it would leave the caller's
    # list empty.
    n_iterations = 0


    # Looping over each combination of hyperparameters
    for eps_val in eps_values:
        for samples_val in samples_values:

            dbscan_grid = DBSCAN(eps = eps_val,
                                 min_samples = samples_val)


            # Fitting the model and reading the label of every point (-1 = noise)
            clusters = dbscan_grid.fit_predict(X = X_data)


            # Counting the amount of data in each cluster
            cluster_count = Counter(clusters)


            # Number of real clusters = distinct labels, not counting the
            # noise label -1 when it is present
            n_noise = int(np.sum(clusters == -1))
            n_clusters = len(np.unique(clusters)) - (1 if n_noise else 0)


            # Increasing the iteration tally with each run of the loop
            n_iterations += 1


            # Calculating the percentage of noise points
            noise_percent = n_noise/len(clusters)*100


            # Visualizing the clusters (no legend: the scatter has no labelled
            # artists, so plt.legend() would only emit a warning)
            plt.scatter(X_data[:, 0], X_data[:, 1], c=clusters, cmap='coolwarm')
            plt.title(f"DBSCAN - Clustering - eps = {eps_val}, min points = {samples_val}, clusters = {n_clusters}")
            plt.xlabel(f'Noise Percentage = {noise_percent}%')
            plt.show()
            print(175*'-')

            # Appending the model list each time n_clusters criteria is reached
            if min_clust <= n_clusters <= max_clust:

                clst_count.append(cluster_count)
                model_list.append([eps_val,samples_val, n_clusters, noise_percent])

    # Printing grid search summary information
    print(f"""Hyperparameter combinations checked: {n_iterations}. \n""")
    print(model_list)

In [None]:
clst_count = []
model_list = []
# eps is a distance in feature units, so the search runs on the standardized
# features (xc) rather than on the raw 'special' / 'eur_wage' values.
dbscan_grid_search(xc, model_list, clst_count, np.arange(0.5, 1.5, 0.1), 7)

- <span style = 'font-size:20px;'> Another example on generated data

In [None]:
centers = [[1, 1], [-1, -2], [2, -2]]
X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,
                            random_state=0)

X = StandardScaler().fit_transform(X)

In [None]:
plt.rcParams.update({'figure.figsize':(10,7.5), 'figure.dpi':100})
plt.scatter(X[:, 0], X[:, 1])

In [None]:
clst_count = []
model_list = []
dbscan_grid_search(X, model_list, clst_count, np.arange(0.1, 0.7, 0.1), 7)

### From the previous cell we can conclude that the best model, i.e. the one that finds 3 clusters with the lowest noise ratio, is DBSCAN with epsilon = 0.3 and a minimum of 2 to 5 points, which leaves 0.26% noise

In [None]:
db = DBSCAN(eps=0.3, min_samples=3).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

In [None]:
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

In [None]:
# Scores that compare the DBSCAN labels against the known blob labels
ground_truth_scores = (
    ("Homogeneity: %0.3f", metrics.homogeneity_score),
    ("Completeness: %0.3f", metrics.completeness_score),
    ("V-measure: %0.3f", metrics.v_measure_score),
    ("Adjusted Rand Index: %0.3f", metrics.adjusted_rand_score),
    ("Adjusted Mutual Information: %0.3f", metrics.adjusted_mutual_info_score),
)
for template, scorer in ground_truth_scores:
    print(template % scorer(labels_true, labels))

# The silhouette needs the feature matrix rather than the true labels
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels))

In [None]:
%matplotlib inline

# Every label gets a colour from the Spectral map; the noise label (-1) is drawn in black instead.
unique_labels = set(labels)
colors = [plt.cm.Spectral(position)
          for position in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    face_colour = (0, 0, 0, 1) if k == -1 else tuple(col)
    in_cluster = (labels == k)

    # Core samples are drawn first with large markers, the remaining
    # (non-core) samples of the same label afterwards with small markers.
    for selector, marker_size in ((core_samples_mask, 14), (~core_samples_mask, 6)):
        xy = X[in_cluster & selector]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=face_colour,
                 markeredgecolor='k', markersize=marker_size)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

***
# PCA

In [None]:
# Label Encoding Categorical data 
#Before Encoding
df.describe(include = ['object', 'category'])

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Full names are left out of the PCA input, so only the remaining
# categorical columns listed here are label encoded.
categorical_columns = ['club', 'league','body_type', 'nationality','work_rate_att','work_rate_def','preferred_foot']
label_encoder = LabelEncoder()

# Encode every column first (the encoder is re-fitted per column), then write
# the integer codes back into the frame.
encoded_columns = {col: label_encoder.fit_transform(df[col]) for col in categorical_columns}
for col, codes in encoded_columns.items():
    df[col] = codes

# After Encoding
df.info()

In [None]:
#Importing the PCA module
from sklearn.decomposition import PCA
pca = PCA(svd_solver='randomized', random_state=42)

In [None]:
# Putting feature variable to X
x = df.drop(['True_Preferred_Position', 'full_name'],axis=1)

# Putting response variable to y
y = df['True_Preferred_Position']

In [None]:
#Fitting the PCA on the full feature matrix (there is no train/test split in this notebook)
pca.fit(x)

In [None]:
pca.components_


In [None]:
pca.explained_variance_ratio_[0]

In [None]:
# Table of the loadings of the first four principal components, one row per feature
colnames = list(x.columns)
component_names = ['PC1', 'PC2', 'PC3', 'PC4']
loadings = {name: pca.components_[position] for position, name in enumerate(component_names)}
loadings['Feature'] = colnames
pcs_df = pd.DataFrame(loadings)
pcs_df.head()

In [None]:
pca.explained_variance_ratio_

In [None]:
#Making the screeplot - plotting the cumulative variance against the number of components
%matplotlib inline
fig = plt.figure(figsize = (15,8))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.savefig('pca_no')
plt.show()

In [None]:
# We can see that two principal components are enough to cover more than 95% of the variance
from sklearn.decomposition import IncrementalPCA
pca_final = IncrementalPCA(n_components=2)

In [None]:
df_pca = pca_final.fit_transform(x)
df_pca.shape

In [None]:
df_pca = pd.DataFrame(df_pca)
df_pca.head()

In [None]:
#creating correlation matrix for the principal components
corrmat = np.corrcoef(df_pca.transpose())

In [None]:
#plotting the correlation matrix
%matplotlib inline
plt.figure(figsize = (10,5))
sns.heatmap(corrmat,annot = True)

In [None]:
from sklearn.neighbors import NearestNeighbors
from random import sample
from numpy.random import uniform
import numpy as np
from math import isnan
 
def hopkins(X, random_state=None):
    """Compute the Hopkins statistic, a measure of the cluster tendency of X.

    m = max(1, 10% of the rows) probe points are drawn uniformly from the
    bounding box of the data, and m real rows are sampled without replacement.
    With u = sum of distances from each probe to its nearest data point, and
    w = sum of distances from each sampled row to its nearest *other* data
    point, the statistic is H = u / (u + w).  Values near 1 suggest clustered
    data; values near 0.5 suggest uniformly random data.

    Parameters:
        X            : 2-D numeric data, either a pandas DataFrame or anything
                       convertible with np.asarray (rows = observations).
        random_state : optional seed (or anything accepted by
                       np.random.default_rng) for reproducible results.
                       The default None keeps the result random per call.

    Returns:
        float in [0, 1]; 0.0 when the statistic is undefined (all distances
        are zero, e.g. every row is identical).

    Raises:
        ValueError if X has fewer than two rows (no "other" neighbour exists).
    """
    # Local import: scipy is already a dependency of this notebook, and the
    # KD-tree lets all nearest-neighbour queries run as two batched calls.
    from scipy.spatial import KDTree

    data = np.asarray(X, dtype=float)
    n, d = data.shape
    if n < 2:
        raise ValueError("hopkins() needs at least two observations")

    # At least one probe, so small data sets do not end in a 0/0 division.
    m = max(1, int(0.1 * n))
    rng = np.random.default_rng(random_state)
    tree = KDTree(data)

    # Uniform probes are not members of the data set, so their nearest
    # neighbour (k=1) is the distance of interest.
    probes = rng.uniform(data.min(axis=0), data.max(axis=0), size=(m, d))
    u_dist, _ = tree.query(probes, k=1)

    # Sampled rows ARE members of the data set: their first neighbour is
    # themselves at distance 0, so the second neighbour (k=2) is used.
    sampled_rows = rng.choice(n, size=m, replace=False)
    w_dist, _ = tree.query(data[sampled_rows], k=2)
    w_dist = w_dist[:, 1]

    u_total = float(np.sum(u_dist))
    w_total = float(np.sum(w_dist))
    denominator = u_total + w_total
    # Written as "not > 0" so that both 0 and NaN fall back to H = 0.
    if not denominator > 0:
        return 0.0

    return u_total / denominator

In [None]:
hopkins(df_pca)

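## We can conclude that the data has a high tendency to cluster (a Hopkins statistic close to 1 indicates clusterable data, while a value around 0.5 indicates uniformly random data)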

***
<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:#231855;
           font-size:20px;
           font-family:Nexa;
           letter-spacing:0.5px">
        <p style="padding: 10px;
              color:white;">
            <b>Thanks For Reading</b>
        </p>
</div>