   # Cluster classification. University of Barcelona 

### Dependencies:
numpy: https://numpy.org/

pandas: https://pandas.pydata.org/

matplotlib: https://matplotlib.org/

sklearn: https://scikit-learn.org/

xgboost: https://xgboost.readthedocs.io/

smote_variants: https://smote-variants.readthedocs.io/

imblearn.over_sampling: https://imbalanced-learn.org/


In [None]:
import sys; sys.path.append('../')
import warnings
warnings.filterwarnings('ignore')

from src.data_loader import load_data

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold

import time

In [None]:
# Classifier Models
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import xgboost as xgb
from sklearn.neural_network import MLPClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier,
                              GradientBoostingClassifier, RandomForestClassifier, VotingClassifier)
from sklearn.svm import SVC

# Estimator classes compared in the experiment. The tuning functions defined
# further down are invoked in this same order.
models = [
    KNeighborsClassifier,
    GaussianNB,
    DecisionTreeClassifier,
    AdaBoostClassifier,
    RandomForestClassifier,
    SVC,
    ExtraTreesClassifier,
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
    MLPClassifier,
    xgb.XGBClassifier,
]

In [None]:
#Smote variants
import smote_variants as sv
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
import smote_variants as sv
# SPIDER is exposed by smote_variants under the name of its author.
SPIDER = sv.Stefanowski
from src.oversampling import SWIM

# Oversampler classes evaluated in the oversampling experiment.
oversamplers = [
    sv.polynom_fit_SMOTE,
    sv.ProWSyn,
    sv.SMOTE_IPF,
    sv.Lee,
    sv.SMOBD,
    sv.G_SMOTE,
    sv.LVQ_SMOTE,
    sv.Assembled_SMOTE,
    sv.SMOTE_TomekLinks,
    SMOTE,
    ADASYN,
    SWIM,
    SPIDER,
]

##  Input X,y Training Option 1     

In [None]:
# Location of the training data file
clusters, _ = load_data('DEGOTALLS_E_Leica_Clustering_dec_2020_jun_2018_Training_6.txt')

# Columns that are never used as features
unused_columns = [
    'x', 'y', 'z', 'n_points', 'n_order', 'file_origin', 'file_destination', 'confidence',
    'texture_code_origin', 'texture_code_destination'
]
# 'None' strings become NaN so that every column containing one is discarded
clusters = clusters.replace('None', np.nan)
clusters = clusters.dropna(axis=1)
clusters = clusters.drop(unused_columns, axis=1)

# Remove every column that holds a single distinct value
constant_columns = [name for name in clusters.columns if len(clusters[name].unique()) == 1]
clusters.drop(constant_columns, inplace=True, axis=1)

# Binarize the label: True for 'Candidate' rows, False for everything else
clust = clusters
clusters['classification'] = clusters['classification'] == 'Candidate'

# Standardise every feature to zero mean and unit standard deviation
raw_features = clusters.drop('classification', axis=1)
X = (raw_features - raw_features.mean()) / raw_features.std()
y = clusters['classification']

 ## Input X,y Test

In [None]:
# Path of the test data file
clusters, _ = load_data('P170_6_DEGOTALLS_E_Cluster_1.txt')
clusters =clusters.replace('None', np.nan).dropna(axis=1)\
    .drop([
        'x', 'y', 'z', 'n_points', 'n_order','file_origin', 'file_destination', 'confidence',
        'texture_code_origin', 'texture_code_destination'
    ], axis=1)

# Delete columns with one value. The label column is exempt: when every test
# row carries the same label it would otherwise be removed here and the
# binarisation below would fail with a KeyError.
for col in clusters.columns:
    if col != 'classification' and len(clusters[col].unique()) == 1:
        clusters.drop(col,inplace=True,axis=1)
# Binarize
clusters['classification'] = (
    ((clusters['classification'] == 'Unknow'))
)

X_test = clusters.drop('classification', axis=1)
# NOTE(review): the test set is standardised with its own mean/std, not with
# the statistics of the training set - confirm that this is intended.
X_test = (X_test - X_test.mean()) / X_test.std()

# The constant-column filter above runs independently of the one applied to
# the training data, so the surviving columns can differ. Estimators fitted on
# X need exactly X's columns, in X's order.
missing_features = [name for name in X.columns if name not in X_test.columns]
if missing_features:
    raise ValueError(
        "test data lacks features used for training: {}".format(missing_features)
    )
X_test = X_test[X.columns]
y_test = clusters['classification']

# Classifier Models

In [None]:
from sklearn.model_selection import GridSearchCV

# Cross Validation properties
# Passed as `cv` to every GridSearchCV built by the tuning functions below.
cv_value=5

# Model properties
def Knn_function(X_train, y_train):
    """Grid-search a k-nearest-neighbours classifier and queue its test predictions.

    Every ``n_neighbors`` value from 1 to 16 is cross-validated with
    ``cv_value`` folds. The fitted search then predicts the module-level
    ``X_test`` and the predictions are pushed onto the module-level ``queue``.
    The best parameters and best score are printed.

    Args:
        X_train: training features.
        y_train: training labels.
    """
    print("0", 'Knn')
    search = GridSearchCV(
        KNeighborsClassifier(),
        {'n_neighbors': np.arange(1, 17)},
        cv=cv_value,
        return_train_score=False,
    )
    search.fit(X_train, y_train)
    queue.put(search.predict(X_test))
    print("Knn: ", search.best_params_)
    print("Knn: ", search.best_score_)

def gs_function(X_train, y_train):
    """Grid-search a Gaussian naive Bayes classifier and queue its test predictions.

    ``var_smoothing`` is searched over 100 log-spaced values from 1 down to
    1e-9, cross-validated with ``cv_value`` folds. Predictions for the
    module-level ``X_test`` are pushed onto the module-level ``queue``; the
    best parameters and best score are printed.

    Args:
        X_train: training features.
        y_train: training labels.
    """
    print("1", 'Gaussian')
    smoothing_grid = {'var_smoothing': np.logspace(0, -9, num=100)}
    search = GridSearchCV(
        GaussianNB(),
        smoothing_grid,
        cv=cv_value,
        return_train_score=False,
    )
    search.fit(X_train, y_train)
    queue.put(search.predict(X_test))
    print("Gaussian: ", search.best_params_)
    print("Gaussian: ", search.best_score_)

def dtc_function(X_train, y_train):
    """Grid-search a decision tree classifier and queue its test predictions.

    The search covers both split criteria and ``max_depth`` from 3 to 14,
    cross-validated with ``cv_value`` folds. Predictions for the module-level
    ``X_test`` are pushed onto the module-level ``queue``; the best parameters
    and best score are printed.

    Args:
        X_train: training features.
        y_train: training labels.
    """
    print("2", 'DecisionTreeClassifier')
    tree_grid = {
        'criterion': ['gini', 'entropy'],
        'max_depth': np.arange(3, 15),
    }
    search = GridSearchCV(
        DecisionTreeClassifier(),
        tree_grid,
        cv=cv_value,
        return_train_score=False,
    )
    search.fit(X_train, y_train)
    queue.put(search.predict(X_test))
    print("Decision Tree: ", search.best_params_)
    print("Decision Tree: ", search.best_score_)

def abc_function(X_train, y_train):
    """Grid-search an AdaBoost classifier and queue its test predictions.

    ``n_estimators`` runs from 1 to 49 with a fixed learning rate of 0.2,
    cross-validated with ``cv_value`` folds. Predictions for the module-level
    ``X_test`` are pushed onto the module-level ``queue``; the best parameters
    and best score are printed.

    Args:
        X_train: training features.
        y_train: training labels.
    """
    print("3", 'AdaBoostClassifier')
    boost_grid = {
        'n_estimators': np.arange(1, 50),
        'learning_rate': [0.2],
    }
    search = GridSearchCV(
        AdaBoostClassifier(),
        boost_grid,
        cv=cv_value,
        return_train_score=False,
    )
    search.fit(X_train, y_train)
    queue.put(search.predict(X_test))
    print("Ada Boost: ", search.best_params_)
    print("Ada Boost: ", search.best_score_)

def rfc_function(X_train, y_train):
    """Grid-search a random forest classifier and queue its test predictions.

    The search covers ``n_estimators`` 1-19, ``max_depth`` 3-14 and both split
    criteria, cross-validated with ``cv_value`` folds. Predictions for the
    module-level ``X_test`` are pushed onto the module-level ``queue``; the
    best parameters and best score are printed.

    Args:
        X_train: training features.
        y_train: training labels.
    """
    print("4", 'RandomForestClassifier')
    forest_grid = {
        'n_estimators': np.arange(1, 20),
        'max_depth': np.arange(3, 15),
        'criterion': ['gini', 'entropy'],
    }
    search = GridSearchCV(
        RandomForestClassifier(),
        forest_grid,
        cv=cv_value,
        return_train_score=False,
    )
    search.fit(X_train, y_train)
    queue.put(search.predict(X_test))
    print("Random Forest: ", search.best_params_)
    print("Random Forest: ", search.best_score_)

def svc_function(X_train, y_train):
    """Grid-search an RBF support vector classifier and queue its test predictions.

    ``C`` and ``gamma`` are searched with the RBF kernel, cross-validated with
    ``cv_value`` folds. Predictions for the module-level ``X_test`` are pushed
    onto the module-level ``queue``; the best parameters and best score are
    printed.

    Args:
        X_train: training features.
        y_train: training labels.
    """
    print("5", 'SVC')
    svm_grid = {
        'C': [0.1, 1, 10],
        'gamma': [1, 0.01],
        'kernel': ['rbf'],
    }
    search = GridSearchCV(
        SVC(),
        svm_grid,
        cv=cv_value,
        return_train_score=False,
    )
    search.fit(X_train, y_train)
    queue.put(search.predict(X_test))
    print("SVC: ", search.best_params_)
    print("SVC: ", search.best_score_)
    
def etc_function(X_train, y_train):
    """Grid-search an extra-trees ensemble and queue its test predictions.

    The search covers ``n_estimators`` 1-19, ``max_depth`` 3-14 and both split
    criteria, cross-validated with ``cv_value`` folds. Predictions for the
    module-level ``X_test`` are pushed onto the module-level ``queue``; the
    best parameters and best score are printed.

    Args:
        X_train: training features.
        y_train: training labels.
    """
    print("6", 'ExtraTreeClassifier')
    ensemble_grid = {
        'n_estimators': np.arange(1, 20),
        'max_depth': np.arange(3, 15),
        'criterion': ['gini', 'entropy'],
    }
    search = GridSearchCV(
        ExtraTreesClassifier(),
        ensemble_grid,
        cv=cv_value,
        return_train_score=False,
    )
    search.fit(X_train, y_train)
    queue.put(search.predict(X_test))
    print("Etc: ", search.best_params_)
    print("Etc: ", search.best_score_)
    
def lda_function(X_train, y_train):
    """Grid-search linear discriminant analysis and queue its test predictions.

    The three solvers are cross-validated with ``cv_value`` folds. Predictions
    for the module-level ``X_test`` are pushed onto the module-level
    ``queue``; the best parameters and best score are printed.

    Args:
        X_train: training features.
        y_train: training labels.
    """
    print("7", 'LinearDiscriminantAnalysis')
    solver_grid = {'solver': ['svd', 'lsqr', 'eigen']}
    search = GridSearchCV(
        LinearDiscriminantAnalysis(),
        solver_grid,
        cv=cv_value,
        return_train_score=False,
    )
    search.fit(X_train, y_train)
    queue.put(search.predict(X_test))
    print("lda: ", search.best_params_)
    print("lda: ", search.best_score_)

def qda_function(X_train, y_train):
    """Grid-search quadratic discriminant analysis and queue its test predictions.

    Three ``reg_param`` values are cross-validated with ``cv_value`` folds.
    Predictions for the module-level ``X_test`` are pushed onto the
    module-level ``queue``; the best parameters and best score are printed.

    Args:
        X_train: training features.
        y_train: training labels.
    """
    print("8", 'QuadraticDiscriminantAnalysis')
    regularisation_grid = {'reg_param': [0.1, 0.3, 0.5]}
    search = GridSearchCV(
        QuadraticDiscriminantAnalysis(),
        regularisation_grid,
        cv=cv_value,
        return_train_score=False,
    )
    search.fit(X_train, y_train)
    queue.put(search.predict(X_test))
    print("qda: ", search.best_params_)
    print("qda: ", search.best_score_)

def mlp_function(X_train, y_train):
    """Grid-search a multi-layer perceptron and queue its test predictions.

    The search covers the three scikit-learn solvers and three hidden-layer
    widths with ReLU activation, cross-validated with ``cv_value`` folds.
    Predictions for the module-level ``X_test`` are pushed onto the
    module-level ``queue``; the best parameters and best score are printed.

    Args:
        X_train: training features.
        y_train: training labels.
    """
    print("9",'MLPClassifier')
    # Solver names are case-sensitive in scikit-learn. The former 'SGD' and
    # 'ADAM' spellings are not valid values, so every fit using them failed
    # and only 'lbfgs' was really evaluated.
    param_grid =  {'solver':['lbfgs', 'sgd', 'adam'],
                  'activation': ['relu'],
                  'hidden_layer_sizes': [50, 100, 150]}
    mlp = GridSearchCV(MLPClassifier(), param_grid, cv=cv_value, return_train_score=False)
    mlp.fit(X_train, y_train)
    y_predmlp = mlp.predict(X_test)
    queue.put(y_predmlp)
    print("mlp: ", mlp.best_params_)
    print("mlp: ", mlp.best_score_)
    
def xg_function(X_train, y_train):
    """Grid-search an XGBoost classifier and queue its test predictions.

    The search covers two boosters, three learning rates and three ensemble
    sizes, cross-validated with ``cv_value`` folds. Predictions for the
    module-level ``X_test`` are pushed onto the module-level ``queue``; the
    best parameters and best score are printed.

    Args:
        X_train: training features.
        y_train: training labels.
    """
    print("10", 'XGBoost')
    booster_grid = {
        'nthread': [4],  # when use hyperthread, xgboost may become slower
        'booster': ['gblinear', 'gbtree'],
        'learning_rate': [0.1, 0.2, 0.3],  # so called `eta` value
        'n_estimators': [50, 500, 1000],  # number of trees, change it to 1000 for better results
        'missing': [-999],
        'seed': [1337],
        'disable_default_eval_metric': [True],
    }
    search = GridSearchCV(
        xgb.XGBClassifier(),
        booster_grid,
        cv=cv_value,
        return_train_score=False,
    )
    search.fit(X_train, y_train)
    queue.put(search.predict(X_test))
    print("xgb: ", search.best_params_)
    print("xgb: ", search.best_score_)
        

# Undersampler Models

In [None]:
import threading
from sklearn.model_selection import GridSearchCV
import time
import queue


from sklearn.cluster import KMeans
# Accumulators filled by the sampler loops and read by the final report
NameCluster, Contatge, performances, Dades, dfprint = [], [], [], [], []
undersamplers = ["ClusterCentroid", "ClusterRepresentative"]

# NOTE: from here on the name `queue` refers to this Queue instance, not to
# the standard-library module; the tuning functions push their predictions
# onto it.
queue = queue.Queue()

# CSV-like log with one line per (model, sampler) run; the handle stays open
# because later statements keep writing to it.
f = open("test.csv", "w")
f.write("Model, Sampler, Items, Indexes" + '\n')


def resample(X, y):
    """Undersample the majority class down to representative real samples.

    The majority class is clustered with K-Means into as many clusters as the
    smallest class has samples. For every centroid the closest real majority
    sample is kept, and those representatives replace the majority class.

    Earlier revisions looked the representatives up by position in the full
    frame (the positions actually refer to the majority subset) and appended
    them to the untouched data, so nothing was undersampled.

    Args:
        X: feature DataFrame.
        y: label Series sharing X's index.

    Returns:
        Tuple ``(X_resampled, y_resampled)`` holding every non-majority row
        plus one representative row per cluster. Two centroids that share the
        same nearest sample yield that row twice.
    """
    class_counts = y.value_counts().to_dict()
    mayority_class = max(class_counts, key=class_counts.get)
    mayority_class_index = y[y == mayority_class].index
    majority_samples = X.loc[mayority_class_index]

    num_clusters = min(class_counts.values())
    centroids = KMeans(n_clusters=num_clusters)\
                    .fit(majority_samples)\
                    .cluster_centers_
    print(num_clusters)

    # Positions are relative to majority_samples, so they must be resolved
    # against that subset and not against the full frame.
    majority_values = majority_samples.values
    closest_positions = [
        int(np.argmin(np.linalg.norm(majority_values - centroid, axis=1)))
        for centroid in centroids
    ]
    representatives = majority_samples.iloc[closest_positions]

    X_minority = X.drop(mayority_class_index)
    y_minority = y.drop(mayority_class_index)

    X_resampled = pd.concat([X_minority, representatives])
    # Reuse the representatives' index so features and labels stay aligned.
    y_resampled = pd.concat([
        y_minority,
        pd.Series([mayority_class] * len(representatives), index=representatives.index),
    ])

    return X_resampled, y_resampled


def resample1(X, y):
    """Undersample the majority class by replacing it with K-Means centroids.

    The majority class is clustered into as many clusters as the smallest
    class has samples, and the cluster centres stand in for the majority
    samples.

    Args:
        X: feature DataFrame.
        y: label Series sharing X's index.

    Returns:
        Tuple ``(X_resampled, y_resampled)`` with every non-majority row
        followed by one centroid row per cluster, labelled as majority class.
    """
    counts_by_class = y.value_counts().to_dict()
    mayority_class = max(counts_by_class, key=counts_by_class.get)
    majority_rows = y[y == mayority_class].index

    n_centroids = min(counts_by_class.values())
    kmeans = KMeans(n_clusters=n_centroids)
    kmeans.fit(X.loc[majority_rows])
    centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)

    minority_y = y.drop(majority_rows)
    minority_X = X.drop(majority_rows)
    centroid_labels = pd.Series([mayority_class] * len(centroids))

    resampled_X = pd.concat([minority_X, centroids])
    resampled_y = pd.concat([minority_y, centroid_labels])
    return resampled_X, resampled_y

time_start = time.time()

# Tuning functions in the same order as `models`, so that zip(models, tuners)
# pairs every prediction with the estimator that produced it.
tuners = [
    Knn_function, gs_function, dtc_function, abc_function, rfc_function,
    svc_function, etc_function, lda_function, qda_function, mlp_function,
    xg_function,
]

for Undersampler in undersamplers:
    # resample1() substitutes the K-Means centroids for the majority class,
    # while resample() works with the real samples closest to them; the two
    # labels used to be attached the other way round.
    if Undersampler == "ClusterCentroid":
        X_train, y_train = resample1(X, y)
    else:
        X_train, y_train = resample(X, y)

    cont = 0
    print(cont)

    for Model, tuner in zip(models, tuners):
        model = Model()
        # The tuners are run one at a time. They used to run in eleven
        # concurrent threads whose results were read back from the shared
        # queue in completion order, so a prediction could be recorded under
        # the wrong model; and if any tuner raised, the last blocking
        # queue.get() never returned.
        try:
            tuner(X_train, y_train)
            y_pred = queue.get_nowait()
        except Exception as e:
            # Best effort: report the failure and move on to the next model.
            print(e)
            cont = cont + 1
            continue

        print(model)
        NameCluster.append(model)
        NameCluster.append(Undersampler)
        NameCluster.append(sum(y_pred))
        dfprint.extend([model, Undersampler, sum(y_pred)])
        # Record the index label of every test row predicted as positive.
        for cluster_id, flagged in zip(X_test.index, y_pred):
            if flagged:
                Contatge.append(cluster_id)
                NameCluster.append([cluster_id])
                dfprint.append([cluster_id])
        f.write(str(dfprint) + '\n')
        dfprint.clear()
        Dades.append([sum(y_pred == True), Undersampler, model])
        cont = cont + 1
f.write('\n')
end_time = time.time()
print("total time: ", (end_time - time_start)/60)

## Oversampler Models

In [None]:
bar_plot = []
performances = []
# Dades is intentionally not reset here: it already holds the undersampler
# results, and the final report divides by the number of undersamplers plus
# oversamplers, so both sets of runs belong in it.

time_start = time.time()

# Tuning functions in the same order as `models`, so that zip(models, tuners)
# pairs every prediction with the estimator that produced it.
tuners = [
    Knn_function, gs_function, dtc_function, abc_function, rfc_function,
    svc_function, etc_function, lda_function, qda_function, mlp_function,
    xg_function,
]

for Oversampler in oversamplers:
    # imbalanced-learn samplers expose fit_resample(); the other samplers are
    # driven through sample() on plain arrays.
    if Oversampler in (SMOTE, ADASYN):
        X_train, y_train = Oversampler(n_jobs=-1).fit_resample(X, y)
    elif Oversampler == SPIDER:
        X_train, y_train = Oversampler(n_jobs=-1).sample(X.values, y.values)
    else:
        X_train, y_train = Oversampler().sample(X.values, y.values)

    cont = 0
    print(cont)

    for Model, tuner in zip(models, tuners):
        model = Model()
        # The tuners are run one at a time. They used to run in eleven
        # concurrent threads whose results were read back from the shared
        # queue in completion order, so a prediction could be recorded under
        # the wrong model; and if any tuner raised, the last blocking
        # queue.get() never returned.
        try:
            tuner(X_train, y_train)
            y_pred = queue.get_nowait()
        except Exception as e:
            # Best effort: report the failure and move on to the next model.
            print(e)
            cont = cont + 1
            continue

        NameCluster.append(model)
        NameCluster.append(Oversampler)
        NameCluster.append(sum(y_pred))
        dfprint.extend([model, Oversampler, sum(y_pred)])
        # Record the index label of every test row predicted as positive.
        for cluster_id, flagged in zip(X_test.index, y_pred):
            if flagged:
                Contatge.append(cluster_id)
                NameCluster.append([cluster_id])
                dfprint.append([cluster_id])
        f.write(str(dfprint) + '\n')
        dfprint.clear()
        Dades.append([sum(y_pred == True), Oversampler, model])
        cont = cont + 1
        print(cont, model)
    f.write('\n')
f.close()
end_time = time.time()
print("total time: ", (end_time - time_start)/60)

## PRINT RESULTS
The results are written to the Excel file `Resultats.xlsx`.

In [None]:
from collections import Counter

di = pd.DataFrame(NameCluster).T
de = pd.DataFrame(Dades).T

# Number of (sampler, model) runs that flagged each test cluster. Counting
# once with Counter replaces a list.count() call per cluster followed by a
# repeated max()/pop() selection.
hits = Counter(Contatge)
Contatge2 = set(Contatge)
total_runs = (len(undersamplers) + len(oversamplers)) * len(models)

# sorted() is stable, also with reverse=True, so clusters with equal counts
# keep the set's iteration order, exactly like picking the first maximum
# over and over did.
ranked = sorted(Contatge2, key=lambda name: hits[name], reverse=True)
Percentage = [(name, (hits[name] * 100) / total_runs) for name in ranked]

df = pd.DataFrame(Percentage, columns=['Name_Cluster','% Models'])
with pd.ExcelWriter("Resultats.xlsx") as writer:
    df.to_excel(writer, sheet_name='Percentage')
    de.to_excel(writer, sheet_name='Models')

    di.to_excel(writer, sheet_name='Indexes')