In [None]:
from sklearn.metrics import silhouette_score, root_mean_squared_error, r2_score
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.ensemble import RandomForestRegressor
from hirarchy import build_tree, get_level_data
from yellowbrick.cluster import KElbowVisualizer
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import chisquare, pearsonr
from sklearn.model_selection import KFold
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from matplotlib import rcParams
from tqdm import tqdm
import seaborn as sns
import pickle as pkl
import pandas as pd
import numpy as np
import warnings
import joblib
import sys
import os

In [None]:
import tensorflow as tf

# Show the GPU device name that TensorFlow reports for this kernel.
gpu_name = tf.test.gpu_device_name()
print(f'GPU Name: {gpu_name}')

In [None]:
# Notebook-wide settings: plot font, silence all warnings, add the working directory to sys.path.
rcParams['font.family'] = 'DejaVu Sans'
warnings.filterwarnings('ignore')
sys.path.append('./')

# Create the output directories under ./submission/morgan (no error if they already exist).
for out_dir in (
    './submission/morgan/results',
    './submission/morgan/data',
    './submission/morgan/data/train',
    './submission/morgan/data/test',
    './submission/morgan/data/folds',
    './submission/morgan/model',
    './submission/morgan/tmp',
):
    os.makedirs(out_dir, exist_ok=True)

In [None]:
# Load the Morgan fingerprint table and name the unnamed CSV column 'CID'.
morgan = pd.read_csv('data/raw/Morgan_Fingerprint.csv').rename(columns={'Unnamed: 0': 'CID'})
cids = morgan['CID']
# Everything after the first column is used as the fingerprint matrix
# (assumes the first column is the CID column — TODO confirm against the CSV).
features = morgan.iloc[:, 1:]

In [None]:
# Reduce the fingerprint matrix to 20 components with truncated SVD.
# random_state is pinned because TruncatedSVD's default 'randomized' solver is
# stochastic: without a seed the components, and every table derived from them
# (similarities, clusters, X frames), change from one run to the next.
turn_svd = TruncatedSVD(n_components=20, random_state=42)
turn_svd.fit(features)
# One row per molecule, indexed by CID, so later cells can select rows with .loc[cid_list].
features_20 = pd.DataFrame(turn_svd.transform(features), index=cids)
features_20

In [None]:
# Pairwise cosine similarity between the reduced fingerprints, labelled by CID on both axes.
# Note: despite the variable name, the stored values are similarities, not distances.
pairwise_cosine = cosine_similarity(features_20)
features_20_distances = pd.DataFrame(pairwise_cosine, columns=cids, index=cids)
features_20_distances

In [None]:
# Elbow analysis: fit KMeans across the k range given to the visualizer to choose the cluster count.
km = KMeans(random_state=42)
visualizer = KElbowVisualizer(km, k=(2,10))
visualizer.fit(features_20_distances) # best at 4 (the value used for the final KMeans fit below)
visualizer.show()

In [None]:
# Final clustering on the cosine-similarity matrix with 4 clusters.
km = KMeans(random_state=42, n_clusters=4)
features_20_km = km.fit(features_20_distances)  # fit() hands back the fitted estimator
labels = features_20_km.labels_

In [None]:
# One row per molecule: its CID and the KMeans cluster it was assigned to.
cluster_labels_df = pd.DataFrame({'CID': cids}).assign(clusters=labels)
cluster_labels_df

In [None]:
# Number of molecules assigned to each cluster.
cluster_labels_df['clusters'].value_counts()

In [None]:
centers = km.cluster_centers_
# km.transform maps every molecule to its distance from each cluster centre.
# The column count is taken from the fitted model instead of a literal 4, so
# this cell keeps working if n_clusters is changed in the KMeans cell.
cluster_center_df = pd.DataFrame(
    km.transform(features_20_distances),
    index=cids,
    columns=[f'distance_to_center_{i}' for i in range(km.n_clusters)],
)
cluster_center_df

In [None]:
def correct_mix_def(row):
    '''
    Clean the component CIDs of one mixture-definition row.

    Every column from the third one onward is treated as a CID slot:
    - CIDs listed in `invalid_cids` are replaced by their mapped value (0).
    - CIDs listed in `replace_cid` are two CIDs fused into one number; the
      slot receives the first CID and the LAST column of the row receives the
      second one (whatever was stored in the last column is overwritten).

    The row is copied before editing. `DataFrame.apply` does not support
    functions that mutate the object they are given, so the input row is
    never modified.

    Parameters
    ----------
    row : pd.Series
        One row of the mixture-definition table.

    Returns
    -------
    pd.Series
        A corrected copy of `row`.

    Raises
    ------
    ValueError
        If a fused CID sits in the last column itself, so there is no other
        slot to hold its second half.
    '''
    invalid_cids = {
        78605: 0,
        25137858: 0,
        19789253: 0,
        66328: 0,
    }
    replace_cid = {
        11002307: [11002, 307]
    }

    fixed = row.copy()
    last_index = fixed.index[-1]
    for idx in fixed.index[2:]:
        # Read the value once: the original re-read the slot after overwriting
        # it, which could look up the wrong key.
        value = fixed.loc[idx]
        if value in invalid_cids:
            fixed.loc[idx] = invalid_cids[value]
        elif value in replace_cid:
            if idx == last_index:
                raise ValueError(
                    f'cannot split fused CID {value!r}: it occupies the last slot {idx!r}'
                )
            first_cid, second_cid = replace_cid[value]
            fixed.loc[last_index] = second_cid
            fixed.loc[idx] = first_cid
    return fixed

In [None]:
# Load the mixture definitions and clean the CID slots row by row.
mixers_def = pd.read_csv('data/raw/Mixure_Definitions_Training_set.csv')
mixers_def = mixers_def.apply(correct_mix_def, axis=1)
# Group by the column name itself rather than a one-element list: with a
# length-1 list grouper, recent pandas versions expect get_group() to receive a
# 1-tuple and deprecate plain keys such as get_group('Snitz 1'). That warning
# is hidden here because warnings are globally ignored.
mixers_def_group = mixers_def.groupby('Dataset')
mixers_def

In [None]:
# Load the mixture-pair values and drop every row that has a missing entry.
mixers_val = pd.read_csv('data/raw/TrainingData_mixturedist.csv').dropna()
mixers_val

In [None]:
def _mixture_cids(group, label):
    '''Return the non-empty CIDs of the first row in `group` whose 'Mixture Label' equals `label`.'''
    rows = group[group['Mixture Label'] == label]
    if rows.empty:
        # Same exception type the bare [0] lookup used to raise, but with a usable message.
        raise IndexError(f'no mixture with label {label!r} in the selected dataset')
    # The first two columns are skipped; the remaining ones are the CID slots.
    slots = np.array(rows)[0][2:]
    # 0 marks an empty slot; NaN is dropped as well so it cannot reach .loc lookups.
    return [cid for cid in slots if cid != 0 and not pd.isna(cid)]


def retrive_mixer_cids(gr_nm, mx_1, mx_2):
    '''
    Look up the component CIDs of two mixtures from the same dataset.

    Parameters
    ----------
    gr_nm : key passed to `mixers_def_group.get_group` (a 'Dataset' value)
    mx_1, mx_2 : values matched against the 'Mixture Label' column

    Returns
    -------
    (list, list)
        The CIDs of mixture `mx_1` and of mixture `mx_2`, empty slots removed.

    Raises
    ------
    IndexError
        If one of the mixture labels is not present in the dataset.
    '''
    group = mixers_def_group.get_group(gr_nm)
    return _mixture_cids(group, mx_1), _mixture_cids(group, mx_2)

In [None]:
def get_cluster_counts(ids, n_clusters=4):
    '''
    Get the counts of each cluster for the given ids

    Parameters
    ----------
    ids : iterable of CIDs; rows of `cluster_labels_df` whose 'CID' is in
        `ids` are counted (each matching row once).
    n_clusters : int, default 4
        Cluster labels 0 .. n_clusters-1 are always present in the result,
        with a count of 0 when no id falls into them.

    Returns
    -------
    dict
        Mapping cluster label -> number of matching molecules.
    '''
    member_clusters = cluster_labels_df.loc[cluster_labels_df['CID'].isin(ids), 'clusters']
    # Convert the value_counts Series directly: this does not depend on the
    # name pandas gives the counts column ('count' only exists from pandas 2.0),
    # and avoids a row-by-row iterrows loop.
    counts = member_clusters.value_counts().to_dict()
    for label in range(n_clusters):
        counts.setdefault(label, 0)
    return counts

In [None]:
# Integer key -> per-molecule table (each one is indexed by CID).
compute_type = {1: features_20, 2: features_20_distances, 3: cluster_center_df}
# Same integer keys -> the name of the table they select.
compute_type_name = {1: 'features_20', 2: 'features_20_distances', 3: 'cluster_center_df'}

In [None]:
def get_features(test1, test2, compute_type_key):
    '''
    Build one mean vector per mixture from the selected table.

    The rows of `compute_type[compute_type_key]` labelled by the CIDs in
    `test1` (resp. `test2`) are averaged column-wise.

    Returns a pair of numpy arrays: (mean vector of test1, mean vector of test2).
    '''
    table = compute_type[compute_type_key]
    mean_1, mean_2 = (
        np.array(table.loc[cid_list].mean(axis=0)) for cid_list in (test1, test2)
    )
    return mean_1, mean_2

In [None]:
# Smoke test on one mixture pair: mixtures 1 and 2 of the 'Snitz 1' dataset.
test1, test2 = retrive_mixer_cids(gr_nm='Snitz 1', mx_1=1, mx_2=2)
features1, features2 = get_features(test1, test2, compute_type_key=1)

In [None]:
def get_chisquare_val(test1, test2):
    '''
    p-value of scipy's chisquare on the cluster counts of two mixtures.

    The per-cluster counts of both mixtures are stacked into a two-row table
    and passed to `chisquare` with axis=None.
    '''
    counts = pd.DataFrame([get_cluster_counts(test1), get_cluster_counts(test2)])
    # NOTE(review): with axis=None all counts of both rows are pooled into a
    # single sample and compared with equal expected frequencies; this is not
    # a contingency-table test between the two mixtures — confirm it is intended.
    _statistic, p_value = chisquare(counts, axis=None)
    return p_value

In [None]:
def get_pearsonr_val(test1, test2, compute_type_key):
    '''
    Pearson correlation coefficient between the mean vectors of two mixtures.

    Each mixture is represented by the column-wise mean of its CIDs' rows in
    `compute_type[compute_type_key]`.
    '''
    table = compute_type[compute_type_key]
    mean_1, mean_2 = (
        np.array(table.loc[cid_list].mean(axis=0)) for cid_list in (test1, test2)
    )
    # pearsonr returns (coefficient, p-value); only the coefficient is used.
    return pearsonr(mean_1, mean_2)[0]

In [None]:
# Pearson correlation of the same mixture pair under each of the three compute types.
test1, test2 = retrive_mixer_cids('Snitz 1', 1, 2)
for key in (1, 2, 3):
    print(f'Corr: {get_pearsonr_val(test1, test2, key)}')

In [None]:
def get_cosine_val(test1, test2, compute_type_key):
    '''
    Cosine similarity between the mean vectors of two mixtures.

    Each mixture is represented by the column-wise mean of its CIDs' rows in
    `compute_type[compute_type_key]`.
    '''
    table = compute_type[compute_type_key]
    mean_1 = np.array(table.loc[test1].mean(axis=0))
    mean_2 = np.array(table.loc[test2].mean(axis=0))
    # cosine_similarity works on 2-D inputs, so each vector is passed as a single row.
    similarity = cosine_similarity(mean_1.reshape(1, -1), mean_2.reshape(1, -1))
    return similarity[0][0]

In [None]:
# Cosine similarity of the same mixture pair under each of the three compute types.
test1, test2 = retrive_mixer_cids('Snitz 1', 1, 2)
for key in (1, 2, 3):
    print(f'Cosine Similarity: {get_cosine_val(test1, test2, key)}')

In [None]:
def get_euclidean_val(test1, test2, compute_type_key):
    '''
    Euclidean distance between the mean vectors of two mixtures.

    Each mixture is represented by the column-wise mean of its CIDs' rows in
    `compute_type[compute_type_key]`.
    '''
    table = compute_type[compute_type_key]
    mean_1 = np.array(table.loc[test1].mean(axis=0))
    mean_2 = np.array(table.loc[test2].mean(axis=0))
    difference = mean_1 - mean_2
    return np.linalg.norm(difference)

In [None]:
# Euclidean distance of the same mixture pair under each of the three compute types.
test1, test2 = retrive_mixer_cids('Snitz 1', 1, 2)
for key in (1, 2, 3):
    print(f'Euclidean Distance: {get_euclidean_val(test1, test2, key)}')

In [None]:
# Re-display the mixture-pair table that make_x_df iterates over.
mixers_val

In [None]:
def make_x_df(x_df, compute_type_key):
    '''
    Make the x_df for the given compute type

    For every row of `mixers_val` one output row is built with, in order:
    Dataset, Mixture 1, Mixture 2, the mean vector of mixture 1, the mean
    vector of mixture 2, chi-square p-value, Pearson correlation, cosine
    similarity, euclidean distance and the 'Experimental Values' entry.

    Parameters
    ----------
    x_df : pd.DataFrame
        Frame whose columns define the output layout; the new rows are
        appended after any rows it already holds.
    compute_type_key : key of `compute_type` selecting the per-molecule table.

    Returns
    -------
    pd.DataFrame
        `x_df` plus one row per row of `mixers_val`; every column except the
        first is cast to float.
    '''
    # Collect plain row lists and build the frame once at the end. Calling
    # pd.concat inside the loop copies the whole frame on every iteration
    # (quadratic cost), and the old np.array(row) step turned every number
    # into a string before casting it back to float.
    rows = []
    for _, row in tqdm(mixers_val.iterrows(), total=mixers_val.shape[0]):
        dataset = row['Dataset']
        mix1 = row['Mixture 1']
        mix2 = row['Mixture 2']
        expected = row['Experimental Values']

        test1, test2 = retrive_mixer_cids(dataset, mix1, mix2)

        chi2 = get_chisquare_val(test1, test2)
        pearsonr_val = get_pearsonr_val(test1, test2, compute_type_key)
        cosine_val = get_cosine_val(test1, test2, compute_type_key)
        euclidean_val = get_euclidean_val(test1, test2, compute_type_key)

        features1, features2 = get_features(test1, test2, compute_type_key)

        rows.append([
            dataset, mix1, mix2,
            *features1,
            *features2,
            chi2, pearsonr_val, cosine_val, euclidean_val, expected,
        ])

    new_rows = pd.DataFrame(rows, columns=x_df.columns)
    if len(x_df):
        x_df = pd.concat([x_df, new_rows], ignore_index=True)
    else:
        # Nothing to append to: skip the concat with an empty frame.
        x_df = new_rows

    # Every column except the first (Dataset) is numeric.
    x_df[x_df.columns[1:]] = x_df[x_df.columns[1:]].astype(float)

    return x_df

# X:
Types of X:
- Cluster Centers. Shape: $(500, 4*2+1+1+1+1+1)$
- Feature 20. Shape: $(500, 20*2+1+1+1+1+1)$
- Feature 20 Distances. Shape: $(500, 169*2+1+1+1+1+1)$

Other included columns $(1+1+1+1+1)$ (the shapes above do not count the three identifier columns `Dataset`, `Mixture 1`, `Mixture 2`):
- chi-square p-value between two mixtures on cluster distribution
- Pearson correlation between two mixtures on the given compute type
- cosine similarity between two mixtures on the given compute type
- Euclidean distance between two mixtures on the given compute type
- expected: the `Experimental Values` entry of the mixture pair from the training data

In [None]:
def _x_columns(n_dims):
    '''Column names of an X frame whose per-mixture vectors have `n_dims` entries.'''
    id_cols = ['Dataset', 'Mixture 1', 'Mixture 2']
    vector_cols = [f'm{i}_d{j}' for i in range(2) for j in range(n_dims)]
    tail_cols = ['chi2', 'pearsonr', 'cosine', 'euclidean', 'expected']
    return id_cols + vector_cols + tail_cols


# Empty frames that only fix the column layout; make_x_df fills them in.
cols_cluster_center = _x_columns(4)
X_cluster_center = pd.DataFrame(columns=cols_cluster_center)

cols_features_20 = _x_columns(20)
X_features_20 = pd.DataFrame(columns=cols_features_20)

cols_features_20_distances = _x_columns(169)
X_features_20_distances = pd.DataFrame(columns=cols_features_20_distances)

In [None]:
# Pass a zero-row slice (same columns) instead of the frame itself: make_x_df
# appends to what it is given, so re-running this cell would otherwise add a
# second copy of every row to the previous result.
X_cluster_center = make_x_df(X_cluster_center.iloc[:0], 3)
X_cluster_center

In [None]:
# Pass a zero-row slice (same columns) instead of the frame itself: make_x_df
# appends to what it is given, so re-running this cell would otherwise add a
# second copy of every row to the previous result.
X_features_20 = make_x_df(X_features_20.iloc[:0], 1)
X_features_20

In [None]:
# Pass a zero-row slice (same columns) instead of the frame itself: make_x_df
# appends to what it is given, so re-running this cell would otherwise add a
# second copy of every row to the previous result.
X_features_20_distances = make_x_df(X_features_20_distances.iloc[:0], 2)
X_features_20_distances

In [None]:
# Saves
# Pairwise X tables (written without the index).
X_cluster_center.to_csv('submission/morgan/data/X_cluster_center.csv', index=False)
X_features_20.to_csv('submission/morgan/data/X_features_20.csv', index=False)
X_features_20_distances.to_csv('submission/morgan/data/X_features_20_distances.csv', index=False)
# Per-molecule tables (index kept in the file).
features_20.to_csv('submission/morgan/data/features_20.csv')
features_20_distances.to_csv('submission/morgan/data/features_20_distances.csv')
cluster_center_df.to_csv('submission/morgan/data/cluster_center_df.csv')
cluster_labels_df.to_csv('submission/morgan/data/cluster_labels_df.csv')
# Fitted models. The files are opened in `with` blocks so the handles are
# flushed and closed right after the dump instead of being left open.
with open('submission/morgan/model/km.pkl', 'wb') as km_file:
    pkl.dump(km, km_file)
with open('submission/morgan/model/turn_svd.pkl', 'wb') as svd_file:
    pkl.dump(turn_svd, svd_file)
mixers_def.to_csv('submission/morgan/data/mixers_def.csv', index=False)