### Imports

In [1]:
import pandas as pd
import numpy as np
from time import time
import h5py
import matplotlib.pyplot as plt

import  sklearn as sk
from sklearn.cluster import KMeans, OPTICS, DBSCAN
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import SparsePCA as PCA

### Loading Data

In [2]:
def load_data(name):
    with h5py.File(f'{name}.h5', 'r') as f:
        return pd.DataFrame(f[name][:])

train = load_data('train')
test  = load_data('test')

In [3]:
print (f'Shape of training data set: {train.shape}')
print (f'Shape of test data set: {test.shape}')

Shape of training data set: (162500, 166)
Shape of test data set: (160651, 164)


### Finding unique features and momovalue columns

In [4]:
def find_unique(X):
  means = np.mean(np.array(X), axis=0)
  unique_values, unique_indices = np.unique(means, return_index=True)
  sorted_indices = np.sort(unique_indices)
  unique_variables = []
  for i in unique_indices:
    unique_variables.append(all_variables[i])
  return unique_variables

In [5]:
def find_multivalues(X, unique_variables):
    multivalue_variables = []
    uniques = []
    j = 0
    for i in unique_variables:
        uniques.append(np.unique(X[i]))
        if len(uniques[j]) >= 2:
            multivalue_variables.append(i)
        j+=1
    return multivalue_variables

In [6]:
all_variables = ['actualInteractionsPerCrossing', 'averageInteractionsPerCrossing', 'correctedActualMu', 'correctedAverageMu', 'correctedScaledActualMu', 'correctedScaledAverageMu', 'NvtxReco', 'p_nTracks', 'p_pt_track', 'p_eta', 'p_phi', 'p_charge', 'p_qOverP', 'p_z0', 'p_d0', 'p_sigmad0', 'p_d0Sig', 'p_EptRatio', 'p_dPOverP', 'p_z0theta', 'p_etaCluster', 'p_phiCluster', 'p_eCluster', 'p_rawEtaCluster', 'p_rawPhiCluster', 'p_rawECluster', 'p_eClusterLr0', 'p_eClusterLr1', 'p_eClusterLr2', 'p_eClusterLr3', 'p_etaClusterLr1', 'p_etaClusterLr2', 'p_phiClusterLr2', 'p_eAccCluster', 'p_f0Cluster', 'p_etaCalo', 'p_phiCalo', 'p_eTileGap3Cluster', 'p_cellIndexCluster', 'p_phiModCalo', 'p_etaModCalo', 'p_dPhiTH3', 'p_R12', 'p_fTG3', 'p_weta2', 'p_Reta', 'p_Rphi', 'p_Eratio', 'p_f1', 'p_f3', 'p_Rhad', 'p_Rhad1', 'p_deltaEta1', 'p_deltaPhiRescaled2', 'p_TRTPID', 'p_TRTTrackOccupancy', 'p_numberOfInnermostPixelHits', 'p_numberOfPixelHits', 'p_numberOfSCTHits', 'p_numberOfTRTHits', 'p_numberOfTRTXenonHits', 'p_chi2', 'p_ndof', 'p_SharedMuonTrack', 'p_E7x7_Lr2', 'p_E7x7_Lr3', 'p_E_Lr0_HiG', 'p_E_Lr0_LowG', 'p_E_Lr0_MedG', 'p_E_Lr1_HiG', 'p_E_Lr1_LowG', 'p_E_Lr1_MedG', 'p_E_Lr2_HiG', 'p_E_Lr2_LowG', 'p_E_Lr2_MedG', 'p_E_Lr3_HiG', 'p_E_Lr3_LowG', 'p_E_Lr3_MedG', 'p_ambiguityType', 'p_asy1', 'p_author', 'p_barys1', 'p_core57cellsEnergyCorrection', 'p_deltaEta0', 'p_deltaEta2', 'p_deltaEta3', 'p_deltaPhi0', 'p_deltaPhi1', 'p_deltaPhi2', 'p_deltaPhi3', 'p_deltaPhiFromLastMeasurement', 'p_deltaPhiRescaled0', 'p_deltaPhiRescaled1', 'p_deltaPhiRescaled3', 'p_e1152', 'p_e132', 'p_e235', 'p_e255', 'p_e2ts1', 'p_ecore', 'p_emins1', 'p_etconeCorrBitset', 'p_ethad', 'p_ethad1', 'p_f1core', 'p_f3core', 'p_maxEcell_energy', 'p_maxEcell_gain', 'p_maxEcell_time', 'p_maxEcell_x', 'p_maxEcell_y', 'p_maxEcell_z', 'p_nCells_Lr0_HiG', 'p_nCells_Lr0_LowG', 'p_nCells_Lr0_MedG', 'p_nCells_Lr1_HiG', 'p_nCells_Lr1_LowG', 'p_nCells_Lr1_MedG', 'p_nCells_Lr2_HiG', 'p_nCells_Lr2_LowG', 'p_nCells_Lr2_MedG', 'p_nCells_Lr3_HiG', 'p_nCells_Lr3_LowG', 'p_nCells_Lr3_MedG', 'p_pos', 'p_pos7', 'p_poscs1', 'p_poscs2', 'p_ptconeCorrBitset', 'p_ptconecoreTrackPtrCorrection', 'p_r33over37allcalo', 'p_topoetconeCorrBitset', 'p_topoetconecoreConeEnergyCorrection', 'p_topoetconecoreConeSCEnergyCorrection', 'p_weta1', 'p_widths1', 'p_widths2', 'p_wtots1', 'p_e233', 'p_e237', 'p_e277', 'p_e2tsts1', 'p_ehad1', 'p_emaxs1', 'p_fracs1', 'p_DeltaE', 'p_E3x5_Lr0', 'p_E3x5_Lr1', 'p_E3x5_Lr2', 'p_E3x5_Lr3', 'p_E5x7_Lr0', 'p_E5x7_Lr1', 'p_E5x7_Lr2', 'p_E5x7_Lr3', 'p_E7x11_Lr0', 'p_E7x11_Lr1', 'p_E7x11_Lr2', 'p_E7x11_Lr3', 'p_E7x7_Lr0', 'p_E7x7_Lr1' ]
X = train[all_variables]
unique_variables = find_unique(X)
X = train[unique_variables]
multivariables = find_multivalues(X, unique_variables)
X = train[multivariables]
X = pd.DataFrame(MinMaxScaler().fit_transform(X))
X.columns = multivariables
print('Found {0:d} unusuable features!'.format(len(all_variables)-len(multivariables)))

Found 17 unusuable features!


# Random Feature Selection Clustering Function

In [7]:
def auswertung(X, alg, scores, var_sets, clusters, extra):
    column_names = [str(i)+' clusters' for i in clusters]
    top_indices = np.empty((5,len(clusters)))
    top_scores  = np.empty_like(top_indices)
    sil_scores  = np.empty_like(top_indices)
    score_frame = pd.DataFrame(scores)
    score_frame.columns = column_names
    j = 0
    for col in column_names:
        top_indices[:,j] = score_frame.sort_values(col).index[:5]
        top_scores[:,j]  = score_frame.sort_values(col)[col][:5]
        alg.set_params(n_clusters = clusters[j], **extra)
        for i in range(5):
            
            X_clust         = X[var_sets[int(top_indices[i,j])]]
            cluster_labels  = alg.fit_predict(X_clust)
            sil_scores[i,j] = sk.metrics.silhouette_score(X_clust, cluster_labels, sample_size=5000)
        j+=1
    final_score = 1 - top_scores + sil_scores
    best_num_clusters = np.argmax(final_score, axis=1)[0]+np.min(clusters)
    best_features = var_sets[np.argmax(final_score, axis=1)[0]]
    return best_num_clusters, best_features, np.max(final_score)

In [8]:
import json
def good_clustering(X, alg, variable_set=multivariables, clusters = [3], n_trials=5, seed=16, extra = {}):
    start = time()
    np.random.seed(seed)
    scores   = np.zeros((n_trials, len(clusters)))
    var_sets = []
    for trial in range(n_trials):
        var_sets.append([variable_set[i] for i in np.random.randint(0,X.shape[1],25)])
        X_clust = X[var_sets[trial]]
        for n_clusters in clusters:
            alg.set_params(n_clusters = n_clusters, **extra)
            cluster_labels = alg.fit_predict(X_clust)
            scores[trial, n_clusters-np.min(clusters)] = sk.metrics.davies_bouldin_score(X_clust, cluster_labels)
        if trial%50==0 and trial != 0:
            print('{0:2d}/{1:d} trials finished. Best score: {2:3.2f}. This took so far: {3:2.1f} minutes.'.format(trial, n_trials, np.min(scores[np.nonzero(scores)]),
                                                                                                                   (time()-start)/60))
    print('Trials finished. Now analysing results. Get back to you shortly. This took: {0:4.1f} minutes.'.format((time()-start)/60))
    cl, var, sc = auswertung(X, alg, scores, var_sets, clusters, extra)
    return cl, var, sc

## Running K-Means Clustering

In [23]:
seed         = 8
clusters     = [i for i in range(3,8)]
algorithm    = KMeans()
extra_params = {'random_state':seed}
num_clusters, features, high_score = good_clustering(X.sample(5000, random_state=seed), algorithm, clusters = clusters, n_trials=5000,
                                                             seed=seed, extra=extra_params)
kmeans_results = {'num_clusters':num_clusters, 'features':features, 'high_score':high_score}

50/5000 trials finished. Best score: 0.51. This took so far: 0.8 minutes.
100/5000 trials finished. Best score: 0.51. This took so far: 1.5 minutes.
150/5000 trials finished. Best score: 0.51. This took so far: 2.1 minutes.
200/5000 trials finished. Best score: 0.51. This took so far: 2.8 minutes.
250/5000 trials finished. Best score: 0.41. This took so far: 3.5 minutes.
300/5000 trials finished. Best score: 0.41. This took so far: 4.2 minutes.
350/5000 trials finished. Best score: 0.41. This took so far: 4.8 minutes.
400/5000 trials finished. Best score: 0.41. This took so far: 5.4 minutes.
450/5000 trials finished. Best score: 0.41. This took so far: 6.1 minutes.
500/5000 trials finished. Best score: 0.41. This took so far: 6.8 minutes.
550/5000 trials finished. Best score: 0.41. This took so far: 7.5 minutes.
600/5000 trials finished. Best score: 0.41. This took so far: 8.2 minutes.
650/5000 trials finished. Best score: 0.41. This took so far: 8.8 minutes.
700/5000 trials finished. 

In [25]:
print(kmeans_results['high_score'])
X_testing = pd.DataFrame(MinMaxScaler().fit_transform(test[kmeans_results['features']]))
test_labels = KMeans(n_clusters=kmeans_results['num_clusters'], random_state=seed).fit_predict(X_testing)
print(np.unique(test_labels, return_counts=True))

1.6453270078779338
(array([0, 1, 2, 3], dtype=int32), array([27658, 52814, 52529, 27650]))


In [26]:
pd.Series(test_labels).to_csv('Clustering_LaurentLindpointner_KMeans.txt', header=False)
pd.Series(kmeans_results['features']).to_csv('Clustering_LaurentLindpointner_KMeans_VariableList.txt', index=False, header=False)


## Running BIRCH Clustering

In [11]:
seed         = 18
clusters     = [i for i in range(3,8)]
algorithm    = Birch(threshold=0.3)
num_clusters, features, high_score = good_clustering(X.sample(5000, random_state=seed), algorithm, clusters = clusters, n_trials=5000,
                                                             seed=seed)
Birch_results = {'num_clusters':num_clusters, 'features':features, 'high_score':high_score}

50/5000 trials finished. Best score: 0.62. This took so far: 1.0 minutes.
100/5000 trials finished. Best score: 0.53. This took so far: 1.9 minutes.




150/5000 trials finished. Best score: 0.44. This took so far: 2.8 minutes.
200/5000 trials finished. Best score: 0.44. This took so far: 3.8 minutes.
250/5000 trials finished. Best score: 0.44. This took so far: 4.7 minutes.
300/5000 trials finished. Best score: 0.44. This took so far: 5.6 minutes.
350/5000 trials finished. Best score: 0.44. This took so far: 6.5 minutes.
400/5000 trials finished. Best score: 0.44. This took so far: 7.5 minutes.
450/5000 trials finished. Best score: 0.44. This took so far: 8.4 minutes.




500/5000 trials finished. Best score: 0.44. This took so far: 9.3 minutes.
550/5000 trials finished. Best score: 0.44. This took so far: 10.3 minutes.
600/5000 trials finished. Best score: 0.44. This took so far: 11.3 minutes.
650/5000 trials finished. Best score: 0.43. This took so far: 12.2 minutes.
700/5000 trials finished. Best score: 0.43. This took so far: 13.0 minutes.
750/5000 trials finished. Best score: 0.38. This took so far: 14.0 minutes.




800/5000 trials finished. Best score: 0.38. This took so far: 14.9 minutes.




850/5000 trials finished. Best score: 0.38. This took so far: 15.7 minutes.
900/5000 trials finished. Best score: 0.38. This took so far: 16.7 minutes.
950/5000 trials finished. Best score: 0.38. This took so far: 17.6 minutes.




1000/5000 trials finished. Best score: 0.38. This took so far: 18.6 minutes.
1050/5000 trials finished. Best score: 0.38. This took so far: 19.5 minutes.
1100/5000 trials finished. Best score: 0.38. This took so far: 20.4 minutes.
1150/5000 trials finished. Best score: 0.38. This took so far: 21.4 minutes.
1200/5000 trials finished. Best score: 0.38. This took so far: 22.3 minutes.
1250/5000 trials finished. Best score: 0.38. This took so far: 23.2 minutes.
1300/5000 trials finished. Best score: 0.38. This took so far: 24.1 minutes.




1350/5000 trials finished. Best score: 0.38. This took so far: 25.1 minutes.
1400/5000 trials finished. Best score: 0.38. This took so far: 26.0 minutes.
1450/5000 trials finished. Best score: 0.38. This took so far: 26.9 minutes.
1500/5000 trials finished. Best score: 0.38. This took so far: 27.9 minutes.
1550/5000 trials finished. Best score: 0.38. This took so far: 28.8 minutes.
1600/5000 trials finished. Best score: 0.38. This took so far: 29.7 minutes.
1650/5000 trials finished. Best score: 0.38. This took so far: 30.6 minutes.




1700/5000 trials finished. Best score: 0.38. This took so far: 31.6 minutes.
1750/5000 trials finished. Best score: 0.38. This took so far: 32.6 minutes.




1800/5000 trials finished. Best score: 0.38. This took so far: 33.5 minutes.
1850/5000 trials finished. Best score: 0.38. This took so far: 34.5 minutes.




1900/5000 trials finished. Best score: 0.26. This took so far: 35.4 minutes.
1950/5000 trials finished. Best score: 0.26. This took so far: 36.4 minutes.
2000/5000 trials finished. Best score: 0.26. This took so far: 37.3 minutes.
2050/5000 trials finished. Best score: 0.26. This took so far: 38.2 minutes.
2100/5000 trials finished. Best score: 0.26. This took so far: 39.2 minutes.
2150/5000 trials finished. Best score: 0.26. This took so far: 40.1 minutes.
2200/5000 trials finished. Best score: 0.26. This took so far: 41.1 minutes.
2250/5000 trials finished. Best score: 0.26. This took so far: 41.9 minutes.
2300/5000 trials finished. Best score: 0.26. This took so far: 42.9 minutes.
2350/5000 trials finished. Best score: 0.26. This took so far: 43.8 minutes.
2400/5000 trials finished. Best score: 0.26. This took so far: 44.7 minutes.




2450/5000 trials finished. Best score: 0.26. This took so far: 45.6 minutes.
2500/5000 trials finished. Best score: 0.26. This took so far: 46.6 minutes.




2550/5000 trials finished. Best score: 0.26. This took so far: 47.5 minutes.
2600/5000 trials finished. Best score: 0.26. This took so far: 48.5 minutes.
2650/5000 trials finished. Best score: 0.26. This took so far: 49.5 minutes.
2700/5000 trials finished. Best score: 0.26. This took so far: 50.4 minutes.
2750/5000 trials finished. Best score: 0.26. This took so far: 51.4 minutes.
2800/5000 trials finished. Best score: 0.26. This took so far: 52.3 minutes.
2850/5000 trials finished. Best score: 0.26. This took so far: 53.2 minutes.
2900/5000 trials finished. Best score: 0.26. This took so far: 54.1 minutes.
2950/5000 trials finished. Best score: 0.26. This took so far: 55.1 minutes.
3000/5000 trials finished. Best score: 0.26. This took so far: 56.0 minutes.
3050/5000 trials finished. Best score: 0.26. This took so far: 56.9 minutes.
3100/5000 trials finished. Best score: 0.26. This took so far: 57.8 minutes.
3150/5000 trials finished. Best score: 0.26. This took so far: 58.7 minutes.



3450/5000 trials finished. Best score: 0.26. This took so far: 64.3 minutes.




3500/5000 trials finished. Best score: 0.26. This took so far: 65.2 minutes.
3550/5000 trials finished. Best score: 0.26. This took so far: 66.1 minutes.
3600/5000 trials finished. Best score: 0.26. This took so far: 67.0 minutes.




3650/5000 trials finished. Best score: 0.26. This took so far: 67.9 minutes.
3700/5000 trials finished. Best score: 0.26. This took so far: 68.8 minutes.
3750/5000 trials finished. Best score: 0.26. This took so far: 69.8 minutes.
3800/5000 trials finished. Best score: 0.26. This took so far: 70.8 minutes.
3850/5000 trials finished. Best score: 0.26. This took so far: 71.7 minutes.
3900/5000 trials finished. Best score: 0.26. This took so far: 72.6 minutes.
3950/5000 trials finished. Best score: 0.26. This took so far: 73.5 minutes.
4000/5000 trials finished. Best score: 0.26. This took so far: 74.5 minutes.
4050/5000 trials finished. Best score: 0.26. This took so far: 75.4 minutes.
4100/5000 trials finished. Best score: 0.26. This took so far: 76.4 minutes.
4150/5000 trials finished. Best score: 0.26. This took so far: 77.2 minutes.
4200/5000 trials finished. Best score: 0.26. This took so far: 78.2 minutes.
4250/5000 trials finished. Best score: 0.26. This took so far: 79.0 minutes.



Trials finished. Now analysing results. Get back to you shortly. This took: 92.8 minutes.




In [12]:
print(Birch_results['high_score'])
X_testing = pd.DataFrame(MinMaxScaler().fit_transform(test[Birch_results['features']]))
test_labels = Birch(n_clusters=int(Birch_results['num_clusters'])).fit_predict(X_testing)
print(np.unique(test_labels, return_counts=True))

1.5307099475396522
(array([0, 1, 2, 3]), array([43965, 44103, 36222, 36361]))


In [13]:
pd.Series(test_labels).to_csv('Clustering_LaurentLindpointner_Birch.txt', header=False)
pd.Series(Birch_results['features']).to_csv('Clustering_LaurentLindpointner_Birch_VariableList.txt', index=False, header=False)