# Clustering evaluation
Script to evaluate clustering method

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime
import time
import matplotlib.pyplot as plt
%matplotlib inline
import itertools
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
#from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
#from sklearn.preprocessing import LabelEncoder
#from sklearn.preprocessing import LabelBinarizer
#from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
#from sklearn.svm import SVC, LinearSVC
#from sklearn.feature_selection import RFECV, RFE, SelectKBest, chi2, SelectFromModel
from sklearn.utils import resample

In [2]:
path_project = Path.home() / Path('Google Drive/Felix')
path_data = path_project / Path("data")
path_dump = path_project / Path("dump")

In [3]:
# loading data
file = path_data / Path("dataset.csv")
with Path.open(file, 'rb') as fp:
    dataset = pd.read_csv(fp,  encoding='utf-8',low_memory=False, index_col = 0)

In [4]:
# load feature sets
filename = path_dump / Path("dict_features_sets.sav")
with open(filename, 'rb') as fp:
     dict_features_sets = pickle.load(fp)

usual_common_features = dict_features_sets['usual_common_features']
indiv_act_features = dict_features_sets['indiv_act_features']
indiv_semi_act_features = dict_features_sets['indiv_semi_act_features']
RFE_LogisticRegression_20_features = dict_features_sets['RFE_LogisticRegression_20_features']

In [5]:
# loading clustering
file = path_data / Path("clustTest1.csv")
with Path.open(file, 'rb') as fp:
    clustTest1 = pd.read_csv(fp,  encoding='utf-8',low_memory=False, sep=";", index_col = 0)

In [6]:
df = dataset.loc[:,:]
# reducing problem to a 2 class classification problem
df["HEUREUX_CLF"] = 0
df.loc[df["HEUREUX"]==4, "HEUREUX_CLF"] = 1
df.loc[df["HEUREUX"]==3, "HEUREUX_CLF"] = 1
df.loc[df["HEUREUX"]==5, "HEUREUX_CLF"] = None

scope = ( RFE_LogisticRegression_20_features | indiv_act_features )  & set(dataset.columns)
n_max = 2000

df = df.loc[:,scope | {"HEUREUX_CLF"} ].dropna()
features = df.loc[:,scope ].columns

X = df.loc[:,scope]
y = df["HEUREUX_CLF"]


Xs, ys = resample(X, y, random_state=42)

Xs = Xs.iloc[0:n_max,:]
ys = ys.iloc[0:n_max]

X_train, X_test, y_train, y_test = train_test_split(Xs, ys, 
                                                    test_size=0.2, 
                                                    random_state=42
                                                   )

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print(f"Number exemple: {y.shape[0]}\n- training set: \
{y_train.shape[0]}\n- test set: {y_test.shape[0]}")
print(f"Number of features: p={X_train.shape[1]}")
print(f"Number of class: {len(np.unique(y))}")
for c in np.unique(y):
    print(f"class {c:0.0f} : {100*np.sum(y==c)/len(y):0.1f}%")

Number exemple: 10674
- training set: 160
- test set: 40
Number of features: p=66
Number of class: 2
class 0 : 35.0%
class 1 : 65.0%


In [7]:
startTime = time.time()
n_estimators_range = [32,64,128,256,512]
max_depth_range = [4,8,16,32,64] 
param_grid = dict(n_estimators=n_estimators_range, max_depth = max_depth_range)

params = {'max_features' :'sqrt', 'random_state' : 32,
          'min_samples_split' : 2, 'class_weight' : 'balanced'}
clf = RandomForestClassifier(**params)

grid = GridSearchCV(clf, scoring='accuracy', param_grid=param_grid)
grid.fit(X_train, y_train)
print(f"Determination of optimal hyperparameters in {time.time() - startTime:0.1f} s")
print(f"Optimal values are {grid.best_params_} \n\
Accuracy Score of cross valdation {100*grid.best_score_:0.2f}%")

# Learning on full training set with optimals hyperparameters and score on test set
params = {'max_features' :'sqrt', 'random_state' : 32, 
          'min_samples_split' : 2, 'class_weight' : 'balanced',
          'n_estimators' : grid.best_params_['n_estimators'],
          'max_depth' : grid.best_params_['max_depth']}
clf = RandomForestClassifier(**params).fit(X_train, y_train)
clf.fit(X_train, y_train)
y_test_pred = clf.predict(X_test)

print(f"Random Forest, p={X_train.shape[1]}")
accuracy = clf.score(X_test, y_test)
f1 = f1_score(y_test, y_test_pred)
p = precision_score(y_test, y_test_pred)
r = recall_score(y_test, y_test_pred)
print(f"Model score\n- Accuracy : {accuracy*100:0.1f} %")
print(f"- Precision : {p*100:0.1f} % (Happy # positive class)")
print(f"- Recall : {r*100:0.1f} %")
print(f"- F1 score : {f1*100:0.1f} %")
res_full  = {
    'f1_score' : f1,
    'accuracy' : accuracy,
    'precision' : p,
    'recall' : r
}

Determination of optimal hyperparameters in 36.6 s
Optimal values are {'max_depth': 4, 'n_estimators': 32} 
Accuracy Score of cross valdation 71.88%
Random Forest, p=66
Model score
- Accuracy : 62.5 %
- Precision : 76.9 % (Happy # positive class)
- Recall : 69.0 %
- F1 score : 72.7 %


#### Estimation on each clusters...

In [8]:
n_estimators_range = [16,32,64,128]
max_depth_range = [2,4,8,16,32,64] 
param_grid = dict(n_estimators=n_estimators_range, max_depth = max_depth_range)
params = {'max_features' :'sqrt', 
          'random_state' : 32, 
          'min_samples_split' : 2, 
          'class_weight' : 'balanced'
         }
scope = ( RFE_LogisticRegression_20_features | indiv_act_features )  & set(dataset.columns)
features = df.loc[:,scope].columns

In [9]:
score_clustering_methods = []
clustering_methods = clustTest1.columns[0:3]

for method in clustering_methods:
    print("--------------------------------------------")
    print(f"\nAnalysis cluster method {method}")
    cluster_list = clustTest1[method].unique()
    print(f"liste of clusters : {cluster_list}")
    score_cluster = []
    for cluster in cluster_list:
        index_scope = clustTest1.loc[clustTest1[method]==cluster,:].index
        print("++++++++++++")
        print(f"cluster {cluster} : {len(index_scope)} elements")
        
        Xc = X.loc[index_scope.intersection(X.index),:]
        yc = y[index_scope.intersection(X.index)]
        
        Xs, ys = resample(Xc, yc, random_state=42)
        
        Xs = Xs.iloc[0:n_max,:]
        ys = ys.iloc[0:n_max]
        
        X_train, X_test, y_train, y_test = train_test_split(Xs, ys,
                                                            test_size=0.2, 
                                                            random_state=42)

        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
                
        print(f"Number exemple: {ys.shape[0]}\n\
        - training set: {y_train.shape[0]}\n\
        - test set: {y_test.shape[0]}")
        print(f"Number of features: p={X_train.shape[1]}")
        print(f"Number of class: {len(np.unique(y))}")
        for c in np.unique(y):
            print(f"class {c:0.0f} : {100*np.sum(y==c)/len(y):0.1f}%")
            
            
        startTime = time.time()
        clf = RandomForestClassifier(**params)
        grid = GridSearchCV(clf, 
                            scoring='accuracy', 
                            param_grid=param_grid)

        grid.fit(X_train, y_train)
        print(f"Optimal values are {grid.best_params_} \n\
        Score of cross valdation {100*grid.best_score_:0.2f}%")
        print()

        # Learning on full training set with optimals hyperparameters and score on test set
        params_opt = {'max_features' :'sqrt', 'random_state' : 32, 
                      'min_samples_split' : 2, 'class_weight' : 'balanced',
                      'n_estimators' : grid.best_params_['n_estimators'],
                      'max_depth' : grid.best_params_['max_depth']}
        clf = RandomForestClassifier(**params_opt).fit(X_train, y_train)

            
        y_test_pred = clf.predict(X_test)
        accuracy = clf.score(X_test, y_test)
        f1 = f1_score(y_test, y_test_pred)
        p = precision_score(y_test, y_test_pred)
        r = recall_score(y_test, y_test_pred)            

        res  = {'f1_score' : f1,
                'accuracy' : accuracy,
                'precision' : p,
                'recall' : r}
            
        cl = {'cluster' : cluster,
              'size' : len(index_scope),
              'model' : 'RandomForestClassifier',
              'params' : params_opt,
              'metrics' : res
             }
         
        score_cluster.append(cl)
        
    d = {'clustering_method' : method,
         'cluster_scores' : score_cluster
        }
    score_clustering_methods.append(d) 

--------------------------------------------

Analysis cluster method clust1
liste of clusters : [1 2 3 4 5 6]
++++++++++++
cluster 1 : 295 elements
Number exemple: 200
        - training set: 160
        - test set: 40
Number of features: p=66
Number of class: 2
class 0 : 35.0%
class 1 : 65.0%
Optimal values are {'max_depth': 4, 'n_estimators': 32} 
        Score of cross valdation 76.88%

++++++++++++
cluster 2 : 1729 elements
Number exemple: 200
        - training set: 160
        - test set: 40
Number of features: p=66
Number of class: 2
class 0 : 35.0%
class 1 : 65.0%
Optimal values are {'max_depth': 4, 'n_estimators': 128} 
        Score of cross valdation 71.25%

++++++++++++
cluster 3 : 3633 elements
Number exemple: 200
        - training set: 160
        - test set: 40
Number of features: p=66
Number of class: 2
class 0 : 35.0%
class 1 : 65.0%
Optimal values are {'max_depth': 8, 'n_estimators': 16} 
        Score of cross valdation 81.25%

++++++++++++
cluster 4 : 218 elements

In [10]:
# F1 score
for score_method in score_clustering_methods:
    print(f"method {score_method['clustering_method']}:")
    average_score = 0
    total_size = 0
    for i, score_cluster in enumerate(score_method['cluster_scores']):
        print(f"cluster {score_cluster['cluster']} ({score_cluster['size']}), f1 macro {100*score_cluster['metrics']['f1_score']:0.1f}%")  
        average_score += score_cluster['metrics']['f1_score']*score_cluster['size']
        total_size += score_cluster['size']
        
    average_score = average_score / total_size
    print(f"average f1 on clusters {100*average_score:0.1f}% gain {100*(average_score-res_full['f1_score']):0.1f}\n")  

method clust1:
cluster 1 (295), f1 macro 81.0%
cluster 2 (1729), f1 macro 81.5%
cluster 3 (3633), f1 macro 76.4%
cluster 4 (218), f1 macro 84.6%
cluster 5 (137), f1 macro 54.5%
cluster 6 (24), f1 macro 80.0%
average f1 on clusters 77.9% gain 5.1

method clust2:
cluster 4 (212), f1 macro 87.8%
cluster 6 (1137), f1 macro 89.2%
cluster 5 (750), f1 macro 87.9%
cluster 1 (1257), f1 macro 57.8%
cluster 3 (1254), f1 macro 73.9%
cluster 2 (857), f1 macro 84.2%
cluster 7 (569), f1 macro 68.3%
average f1 on clusters 76.6% gain 3.9

method clust3:
cluster 5 (373), f1 macro 73.7%
cluster 4 (2682), f1 macro 70.6%
cluster 1 (1593), f1 macro 89.6%
cluster 2 (1246), f1 macro 75.5%
cluster 3 (142), f1 macro 91.7%
average f1 on clusters 77.3% gain 4.6



In [11]:
# accuracy
for score_method in score_clustering_methods:
    print(f"method {score_method['clustering_method']}:")
    average_score = 0
    total_size = 0
    for i, score_cluster in enumerate(score_method['cluster_scores']):
        print(f"cluster {score_cluster['cluster']} ({score_cluster['size']}) , accuracy {100*score_cluster['metrics']['accuracy']:0.1f}%")  
        average_score = average_score + score_cluster['metrics']['accuracy']*score_cluster['size']
        total_size += score_cluster['size']
    average_score = average_score / total_size
    print(f"average accuracy on clusters {100*average_score:0.1f}% gain {100*(average_score-res_full['accuracy']):0.1f}\n")  

method clust1:
cluster 1 (295) , accuracy 80.0%
cluster 2 (1729) , accuracy 75.0%
cluster 3 (3633) , accuracy 67.5%
cluster 4 (218) , accuracy 80.0%
cluster 5 (137) , accuracy 52.4%
cluster 6 (24) , accuracy 75.0%
average accuracy on clusters 70.4% gain 7.9

method clust2:
cluster 4 (212) , accuracy 87.2%
cluster 6 (1137) , accuracy 82.5%
cluster 5 (750) , accuracy 80.0%
cluster 1 (1257) , accuracy 52.5%
cluster 3 (1254) , accuracy 70.0%
cluster 2 (857) , accuracy 77.5%
cluster 7 (569) , accuracy 67.5%
average accuracy on clusters 71.4% gain 8.9

method clust3:
cluster 5 (373) , accuracy 75.0%
cluster 4 (2682) , accuracy 62.5%
cluster 1 (1593) , accuracy 82.5%
cluster 2 (1246) , accuracy 67.5%
cluster 3 (142) , accuracy 90.5%
average accuracy on clusters 70.2% gain 7.7

