# Clustering evaluation
Script to evaluate clustering method

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime
import time
import matplotlib.pyplot as plt
%matplotlib inline
import itertools
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
#from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
#from sklearn.preprocessing import LabelEncoder
#from sklearn.preprocessing import LabelBinarizer
#from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
#from sklearn.svm import SVC, LinearSVC
#from sklearn.feature_selection import RFECV, RFE, SelectKBest, chi2, SelectFromModel
from sklearn.utils import resample

In [2]:
path_project = Path.home() / Path('Google Drive/Felix')
path_data = path_project / Path("data")
path_dump = path_project / Path("dump")

In [4]:
# loading data
file = path_data / Path("dataset.csv")
with Path.open(file, 'rb') as fp:
    dataset = pd.read_csv(fp,  encoding='utf-8',low_memory=False, index_col = 0)

In [5]:
# load feature sets
filename = path_dump / Path("dict_features_sets.sav")
with open(filename, 'rb') as fp:
     dict_features_sets = pickle.load(fp)

usual_common_features = dict_features_sets['usual_common_features']
indiv_act_features = dict_features_sets['indiv_act_features']
indiv_semi_act_features = dict_features_sets['indiv_semi_act_features']
RFE_LogisticRegression_20_features = dict_features_sets['RFE_LogisticRegression_20_features']

In [6]:
# loading clustering
file = path_data / Path("clustTest3.csv")
with Path.open(file, 'rb') as fp:
    clustTest1 = pd.read_csv(fp,  encoding='utf-8',low_memory=False, sep=";", index_col = 0)

In [7]:
clustTest1.head()

Unnamed: 0_level_0,clust1,clust2,clust3,clust4,clust5,clust6,clust7,clust8,clust9,clust10,clust11
INTER6,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
373001,1,1,2,3,3,5,3,4,1,4,5
373002,1,1,2,1,4,2,1,4,5,3,1
373003,1,1,4,1,4,4,2,4,3,2,1
373004,1,1,2,1,4,4,2,4,1,3,3
373005,1,1,2,2,4,3,1,4,3,3,4


In [8]:
df = dataset.loc[:,:]
# reducing problem to a 2 class classification problem
df["HEUREUX_CLF"] = 0
df.loc[df["HEUREUX"]==4, "HEUREUX_CLF"] = 1
df.loc[df["HEUREUX"]==3, "HEUREUX_CLF"] = 1
df.loc[df["HEUREUX"]==5, "HEUREUX_CLF"] = None

scope = ( RFE_LogisticRegression_20_features | indiv_act_features )  & set(dataset.columns)
n_max = 2000

df = df.loc[:,scope | {"HEUREUX_CLF"} ].dropna()
features = df.loc[:,scope ].columns

X = df.loc[:,scope]
y = df["HEUREUX_CLF"]


Xs, ys = resample(X, y, random_state=42)

Xs = Xs.iloc[0:n_max,:]
ys = ys.iloc[0:n_max]

X_train, X_test, y_train, y_test = train_test_split(Xs, ys, 
                                                    test_size=0.2, 
                                                    random_state=42
                                                   )

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print(f"Number exemple: {y.shape[0]}\n- training set: \
{y_train.shape[0]}\n- test set: {y_test.shape[0]}")
print(f"Number of features: p={X_train.shape[1]}")
print(f"Number of class: {len(np.unique(y))}")
for c in np.unique(y):
    print(f"class {c:0.0f} : {100*np.sum(y==c)/len(y):0.1f}%")

Number exemple: 10674
- training set: 1600
- test set: 400
Number of features: p=66
Number of class: 2
class 0 : 35.0%
class 1 : 65.0%


In [20]:
startTime = time.time()
n_estimators_range = [32,64,128,256,512]
max_depth_range = [4,8,16,32,64] 
param_grid = dict(n_estimators=n_estimators_range, max_depth = max_depth_range)

params = {'max_features' :'sqrt', 'random_state' : 32,
          'min_samples_split' : 2, 'class_weight' : 'balanced'}
clf = RandomForestClassifier(**params)

grid = GridSearchCV(clf, scoring='accuracy', param_grid=param_grid)
grid.fit(X_train, y_train)
print(f"Determination of optimal hyperparameters in {time.time() - startTime:0.1f} s")
print(f"Optimal values are {grid.best_params_} \n\
Accuracy Score of cross valdation {100*grid.best_score_:0.2f}%")

# Learning on full training set with optimals hyperparameters and score on test set
params = {'max_features' :'sqrt', 'random_state' : 32, 
          'min_samples_split' : 2, 'class_weight' : 'balanced',
          'n_estimators' : grid.best_params_['n_estimators'],
          'max_depth' : grid.best_params_['max_depth']}
clf = RandomForestClassifier(**params).fit(X_train, y_train)
clf.fit(X_train, y_train)
y_test_pred = clf.predict(X_test)

print(f"Random Forest, p={X_train.shape[1]}")
accuracy = clf.score(X_test, y_test)
f1 = f1_score(y_test, y_test_pred)
p = precision_score(y_test, y_test_pred)
r = recall_score(y_test, y_test_pred)
print(f"Model score\n- Accuracy : {accuracy*100:0.1f} %")
print(f"- Precision : {p*100:0.1f} % (Happy # positive class)")
print(f"- Recall : {r*100:0.1f} %")
print(f"- F1 score : {f1*100:0.1f} %")
res_full  = {
    'f1_score' : f1,
    'accuracy' : accuracy,
    'precision' : p,
    'recall' : r
}

Determination of optimal hyperparameters in 44.5 s
Optimal values are {'max_depth': 16, 'n_estimators': 64} 
Accuracy Score of cross valdation 75.44%
Random Forest, p=66
Model score
- Accuracy : 78.5 %
- Precision : 80.2 % (Happy # positive class)
- Recall : 88.8 %
- F1 score : 84.3 %


#### Estimation on each clusters...

In [21]:
n_estimators_range = [16,32,64,128]
max_depth_range = [2,4,8,16,32,64] 
param_grid = dict(n_estimators=n_estimators_range, max_depth = max_depth_range)
params = {'max_features' :'sqrt', 
          'random_state' : 32, 
          'min_samples_split' : 2, 
          'class_weight' : 'balanced'
         }
scope = ( RFE_LogisticRegression_20_features | indiv_act_features )  & set(dataset.columns)
features = df.loc[:,scope].columns

In [25]:
score_clustering_methods = []
clustering_methods = clustTest1.columns[0:]

for method in clustering_methods:
    print("--------------------------------------------")
    print(f"\nAnalysis cluster method {method}")
    cluster_list = clustTest1[method].unique()
    print(f"liste of clusters : {cluster_list}")
    score_cluster = []
    for cluster in cluster_list:
        index_scope = clustTest1.loc[clustTest1[method]==cluster,:].index
        print("++++++++++++")
        print(f"cluster {cluster} : {len(index_scope)} elements")
        
        Xc = X.loc[index_scope.intersection(X.index),:]
        yc = y[index_scope.intersection(X.index)]
        
        Xs, ys = resample(Xc, yc, random_state=42)
        
        Xs = Xs.iloc[0:n_max,:]
        ys = ys.iloc[0:n_max]
        
        X_train, X_test, y_train, y_test = train_test_split(Xs, ys,
                                                            test_size=0.2, 
                                                            random_state=42)

        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
                
        print(f"Number exemple: {ys.shape[0]}\n\
        - training set: {y_train.shape[0]}\n\
        - test set: {y_test.shape[0]}")
        print(f"Number of features: p={X_train.shape[1]}")
        print(f"Number of class: {len(np.unique(y))}")
        for c in np.unique(y):
            print(f"class {c:0.0f} : {100*np.sum(y==c)/len(y):0.1f}%")
            
            
        startTime = time.time()
        clf = RandomForestClassifier(**params)
        grid = GridSearchCV(clf, 
                            scoring='accuracy', 
                            param_grid=param_grid)

        grid.fit(X_train, y_train)
        print(f"Optimal values are {grid.best_params_} \n\
        Score of cross valdation {100*grid.best_score_:0.2f}%")
        print()

        # Learning on full training set with optimals hyperparameters and score on test set
        params_opt = {'max_features' :'sqrt', 'random_state' : 32, 
                      'min_samples_split' : 2, 'class_weight' : 'balanced',
                      'n_estimators' : grid.best_params_['n_estimators'],
                      'max_depth' : grid.best_params_['max_depth']}
        clf = RandomForestClassifier(**params_opt).fit(X_train, y_train)

            
        y_test_pred = clf.predict(X_test)
        accuracy = clf.score(X_test, y_test)
        f1 = f1_score(y_test, y_test_pred)
        p = precision_score(y_test, y_test_pred)
        r = recall_score(y_test, y_test_pred)            

        res  = {'f1_score' : f1,
                'accuracy' : accuracy,
                'precision' : p,
                'recall' : r}
            
        cl = {'cluster' : cluster,
              'size' : len(index_scope),
              'model' : 'RandomForestClassifier',
              'params' : params_opt,
              'metrics' : res
             }
         
        score_cluster.append(cl)
        
    d = {'clustering_method' : method,
         'cluster_scores' : score_cluster
        }
    score_clustering_methods.append(d) 

--------------------------------------------

Analysis cluster method clust1
liste of clusters : [1 2 3 4]
++++++++++++
cluster 1 : 8416 elements
Number exemple: 2000
        - training set: 1600
        - test set: 400
Number of features: p=66
Number of class: 2
class 0 : 35.0%
class 1 : 65.0%
Optimal values are {'max_depth': 16, 'n_estimators': 128} 
        Score of cross valdation 76.69%

++++++++++++
cluster 2 : 470 elements
Number exemple: 422
        - training set: 337
        - test set: 85
Number of features: p=66
Number of class: 2
class 0 : 35.0%
class 1 : 65.0%
Optimal values are {'max_depth': 8, 'n_estimators': 128} 
        Score of cross valdation 83.09%

++++++++++++
cluster 3 : 1982 elements
Number exemple: 1935
        - training set: 1548
        - test set: 387
Number of features: p=66
Number of class: 2
class 0 : 35.0%
class 1 : 65.0%
Optimal values are {'max_depth': 32, 'n_estimators': 128} 
        Score of cross valdation 86.24%

++++++++++++
cluster 4 : 263 el

Optimal values are {'max_depth': 32, 'n_estimators': 128} 
        Score of cross valdation 78.25%

++++++++++++
cluster 2 : 725 elements
Number exemple: 692
        - training set: 553
        - test set: 139
Number of features: p=66
Number of class: 2
class 0 : 35.0%
class 1 : 65.0%
Optimal values are {'max_depth': 8, 'n_estimators': 64} 
        Score of cross valdation 81.56%

++++++++++++
cluster 5 : 121 elements
Number exemple: 46
        - training set: 36
        - test set: 10
Number of features: p=66
Number of class: 2
class 0 : 35.0%
class 1 : 65.0%
Optimal values are {'max_depth': 4, 'n_estimators': 16} 
        Score of cross valdation 86.11%

--------------------------------------------

Analysis cluster method clust6
liste of clusters : [5 2 4 3 1 6]
++++++++++++
cluster 5 : 731 elements
Number exemple: 687
        - training set: 549
        - test set: 138
Number of features: p=66
Number of class: 2
class 0 : 35.0%
class 1 : 65.0%
Optimal values are {'max_depth': 16, '

Optimal values are {'max_depth': 4, 'n_estimators': 128} 
        Score of cross valdation 81.63%

--------------------------------------------

Analysis cluster method clust11
liste of clusters : [5 1 3 4 2 6 7]
++++++++++++
cluster 5 : 1231 elements
Number exemple: 1188
        - training set: 950
        - test set: 238
Number of features: p=66
Number of class: 2
class 0 : 35.0%
class 1 : 65.0%
Optimal values are {'max_depth': 32, 'n_estimators': 32} 
        Score of cross valdation 83.68%

++++++++++++
cluster 1 : 2753 elements
Number exemple: 2000
        - training set: 1600
        - test set: 400
Number of features: p=66
Number of class: 2
class 0 : 35.0%
class 1 : 65.0%
Optimal values are {'max_depth': 32, 'n_estimators': 16} 
        Score of cross valdation 83.44%

++++++++++++
cluster 3 : 2344 elements
Number exemple: 2000
        - training set: 1600
        - test set: 400
Number of features: p=66
Number of class: 2
class 0 : 35.0%
class 1 : 65.0%
Optimal values are {'ma



Optimal values are {'max_depth': 2, 'n_estimators': 16} 
        Score of cross valdation 75.00%



In [26]:
# F1 score
for score_method in score_clustering_methods:
    print(f"method {score_method['clustering_method']}:")
    average_score = 0
    total_size = 0
    for i, score_cluster in enumerate(score_method['cluster_scores']):
        print(f"cluster {score_cluster['cluster']} ({score_cluster['size']}), f1 macro {100*score_cluster['metrics']['f1_score']:0.1f}%")  
        average_score += score_cluster['metrics']['f1_score']*score_cluster['size']
        total_size += score_cluster['size']
        
    average_score = average_score / total_size
    print(f"average f1 on clusters {100*average_score:0.1f}% gain {100*(average_score-res_full['f1_score']):0.1f}\n")  

method clust1:
cluster 1 (8416), f1 macro 84.9%
cluster 2 (470), f1 macro 83.2%
cluster 3 (1982), f1 macro 92.3%
cluster 4 (263), f1 macro 89.3%
average f1 on clusters 86.3% gain 2.0

method clust2:
cluster 1 (7186), f1 macro 85.2%
cluster 2 (337), f1 macro 77.8%
cluster 3 (1982), f1 macro 92.3%
cluster 4 (1230), f1 macro 87.7%
cluster 5 (100), f1 macro 78.3%
cluster 6 (133), f1 macro 100.0%
cluster 7 (163), f1 macro 94.1%
average f1 on clusters 86.8% gain 2.5

method clust3:
cluster 2 (3053), f1 macro 90.5%
cluster 4 (2359), f1 macro 89.1%
cluster 6 (2313), f1 macro 90.5%
cluster 1 (528), f1 macro 81.3%
cluster 3 (1384), f1 macro 91.0%
cluster 5 (1494), f1 macro 92.9%
average f1 on clusters 90.2% gain 5.9

method clust4:
cluster 3 (818), f1 macro 83.4%
cluster 1 (4494), f1 macro 86.9%
cluster 2 (1735), f1 macro 93.6%
cluster 6 (1012), f1 macro 92.9%
cluster 5 (1059), f1 macro 88.3%
cluster 4 (1477), f1 macro 92.6%
cluster 7 (536), f1 macro 88.3%
average f1 on clusters 89.2% gain 4.9



In [27]:
# accuracy
for score_method in score_clustering_methods:
    print(f"method {score_method['clustering_method']}:")
    average_score = 0
    total_size = 0
    for i, score_cluster in enumerate(score_method['cluster_scores']):
        print(f"cluster {score_cluster['cluster']} ({score_cluster['size']}) , accuracy {100*score_cluster['metrics']['accuracy']:0.1f}%")  
        average_score = average_score + score_cluster['metrics']['accuracy']*score_cluster['size']
        total_size += score_cluster['size']
    average_score = average_score / total_size
    print(f"average accuracy on clusters {100*average_score:0.1f}% gain {100*(average_score-res_full['accuracy']):0.1f}\n")  

method clust1:
cluster 1 (8416) , accuracy 78.2%
cluster 2 (470) , accuracy 80.0%
cluster 3 (1982) , accuracy 89.4%
cluster 4 (263) , accuracy 87.8%
average accuracy on clusters 80.5% gain 2.0

method clust2:
cluster 1 (7186) , accuracy 78.5%
cluster 2 (337) , accuracy 80.0%
cluster 3 (1982) , accuracy 89.4%
cluster 4 (1230) , accuracy 86.3%
cluster 5 (100) , accuracy 73.7%
cluster 6 (133) , accuracy 100.0%
cluster 7 (163) , accuracy 93.5%
average accuracy on clusters 81.8% gain 3.3

method clust3:
cluster 2 (3053) , accuracy 87.0%
cluster 4 (2359) , accuracy 85.0%
cluster 6 (2313) , accuracy 86.5%
cluster 1 (528) , accuracy 82.1%
cluster 3 (1384) , accuracy 88.2%
cluster 5 (1494) , accuracy 91.5%
average accuracy on clusters 87.0% gain 8.5

method clust4:
cluster 3 (818) , accuracy 83.3%
cluster 1 (4494) , accuracy 81.0%
cluster 2 (1735) , accuracy 91.4%
cluster 6 (1012) , accuracy 90.4%
cluster 5 (1059) , accuracy 85.0%
cluster 4 (1477) , accuracy 89.3%
cluster 7 (536) , accuracy 84.