In [50]:
import random
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.model_selection import train_test_split

In [153]:
# functions
def add_address_of_data(given_address): # could be useful for easily allowing others to use this file
    return "".join([given_address, "/visit_meaning_vectors/visit_meanings.csv"])

# calculating Euclidean distance
def calculate_prob_of_visit_with_sorting(given_visit, given_pca):
    # assuming given visit is a meaning vector, with number of variables same as number of PCs

    given_pca_sorted = given_pca.sort_values(by=["Labels"]) # sorting given PCA list by labels
    prev_label = 0
    current_label = 0
    lowest_dist_to_visit = float('inf')
    current_labels_dist = 0
    total_dist = 0
    lowest_dist_to_visits_label = 0


    for x in range(len(given_pca_sorted)):
        current_label = given_pca_sorted.at[given_pca_sorted.index[x], "Labels"] # get current label

        # extracting required variables of the point in PCA data
        temp_point = given_pca_sorted.iloc[x]
        temp_point = temp_point.tolist()
        temp_point = temp_point[:-1]

        if current_label >= 0: # to skip "-1" labels
            if current_label == prev_label: current_labels_dist += np.linalg.norm(np.array(given_visit) - np.array(temp_point)) # performing the euclidean distance calculation
            else: # have arrived to next label in the sorted PCA, so check final values
                if current_labels_dist < lowest_dist_to_visit:
                    lowest_dist_to_visit = current_labels_dist
                    lowest_dist_to_visits_label = current_label
                prev_label = current_label
                total_dist += current_labels_dist
                current_labels_dist = 0 # since we are going to next label, reset distance

    if total_dist == 0: print(len(given_visit))
    return 1 - (lowest_dist_to_visits_label / total_dist), lowest_dist_to_visits_label - 1

# calculating Euclidean distance and assumes that "given_pca" is already sorted
def calculate_prob_of_visit_without_sorting(given_visit, given_pca_sorted):
    # assuming given visit is a meaning vector, with number of variables same as number of PCs

    prev_label = 0
    current_label = 0
    lowest_dist_to_visit = float('inf')
    current_labels_dist = 0
    total_dist = 0
    lowest_dist_to_visits_label = 0

    for x in range(0, len(given_pca_sorted)):
        current_label = given_pca_sorted.at[given_pca_sorted.index[x], "Labels"] # get current label

        # extracting required variables of the point in PCA data
        temp_point = given_pca_sorted.iloc[x]
        temp_point = temp_point.tolist()
        temp_point = temp_point[:-1]

        if current_label >= 0: # to skip "-1" labels
            if current_label == prev_label: current_labels_dist += np.linalg.norm(np.array(given_visit) - np.array(temp_point)) # performing the euclidean distance calculation
            else: # have arrived to next label in the sorted PCA, so check final values
                if current_labels_dist < lowest_dist_to_visit:
                    lowest_dist_to_visit = current_labels_dist
                    lowest_dist_to_visits_label = current_label
                prev_label = current_label
                total_dist += current_labels_dist
                current_labels_dist = 0 # since we are going to next label, reset distance

    if total_dist == 0: print(len(given_visit))
    return 1 - (lowest_dist_to_visits_label / total_dist), lowest_dist_to_visits_label

def get_avg_pca(given_pca):
    given_pca_sorted = given_pca.sort_values(by=["Labels"]) # sorting given PCA list by labels
    list_of_avg_values = []
    prev_label = 0
    current_labels_dist = [0] * (len(given_pca.columns) - 1)
    current_label_count = 0
    for x in range(0, len(given_pca_sorted)):
        current_label = given_pca_sorted.at[given_pca_sorted.index[x], "Labels"] # get current label


        if current_label >= 0: # to skip "-1" labels
            current_label_count += 1

            if current_label == prev_label:
                # extracting required variables of the point in PCA data
                temp_point = given_pca_sorted.iloc[x]
                temp_point = temp_point.tolist()
                temp_point = temp_point[:-1]

                current_labels_dist = [current_labels_dist[y] + temp_point[y] for y in range (len(current_labels_dist))]
            else: # have arrived to next label in the sorted PCA, so check final values
                prev_label = current_label
                list_of_avg_values.append([current_labels_dist[z] / current_label_count for z in range (len(current_labels_dist))])
                # resetting
                current_labels_dist = [0] * (len(given_pca.columns)-1)
                current_label_count = 0

    temp_df = pd.DataFrame(list_of_avg_values, columns=[b for b in range(0, len(given_pca.columns)-1)])
    return temp_df

# calculating Euclidean distance and assumes that "given_pca" is already sorted and averaged
def calculate_prob_of_visit_with_sorted_avg_pca(given_visit, given_avg_pca_sorted):
    # assuming given visit is a meaning vector, with number of variables same as number of PCs

    lowest_dist_to_visit = float('inf')
    total_dist = 0
    lowest_dist_to_visits_label = 0

    for x in range(0, len(given_avg_pca_sorted)): # each index is a label
        # extracting required variables of the point in PCA data
        temp_point = given_avg_pca_sorted.iloc[x]
        temp_point = temp_point.tolist()

        current_labels_dist = np.linalg.norm(np.array(given_visit) - np.array(temp_point)) # performing the euclidean distance calculation

        if current_labels_dist < lowest_dist_to_visit:
            lowest_dist_to_visit = current_labels_dist
            lowest_dist_to_visits_label = x

        total_dist += current_labels_dist

    if total_dist == 0: print(len(given_visit))
    return 1 - (lowest_dist_to_visits_label / total_dist), lowest_dist_to_visits_label

In [52]:
# meaning vectors will be made of several hundreds of variables and each cluster will have a set of data points (each data point containing values of each of these variables (so a meaning vector)

# main data
my_address = "C:/Users/dnaen/APG_data"  # only this has to be modified
df = pd.read_csv(add_address_of_data(my_address))

In [53]:
# will be used to get the labels
combined_address = "".join([my_address, "/cluster_paths.csv"])
df_with_labels = pd.read_csv(combined_address)

df_with_labels.head()

Unnamed: 0,visit_id,cluster_label,path
0,33888[1],-1,"[3, 1, 3, 1, 13, 956]"
1,445844[2],0,"[1, 14, 18, 14, 31, 14, 21]"
2,39476[1],1,"[188, 187, 3, 1, 7]"
3,671874[9],-1,"[188, 1, 14, 5, 16, 12, 22, 7, 13, 1, 12, 22]"
4,381389[1],-1,"[3, 1, 12, 7, 13, 20, 1, 12, 22, 20, 1, 13, 88..."


In [54]:
# dividing the dataset into a feature set and corresponding labels
features_X = df.pop("meaning_vectors")
labels_Y = df_with_labels.pop("cluster_label")


In [55]:
features_X = features_X.to_frame(name="meaning_vectors") # converting from series to df

In [56]:
# expanding features df, because currently there is only one column where each row contains a list of meaning values, to make it work for PCA method defined in sklearn library, it needs to be a df where each cell is only one number
features_X_expanded = features_X.meaning_vectors.str.split(",",expand=True,)
features_X_expanded.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,[-2.8568043961120506,-1.0411711915039288,1.0061674131898883,-1.1113399198600664,-1.655285350981589,-2.6691645604368404,-2.5958565738627417,-0.1800041119591187,3.400636765062071,-2.8970298264139283,...,-1.0407846027233123,-1.3534519248437211,-0.7829219487934165,0.8125388988660006,-0.90483542617853,-3.1732080388866635,1.325712796877335,0.99422387889872,1.094814822799679,0.2922618730065183]
1,[-0.37217951978468866,-0.6032857196871719,-0.2644388258666698,-0.705210610800887,-0.5024737471499354,1.1758432209879477,0.2028001229748211,-0.1665542381449563,1.1064652689149002,-0.2352754099935789,...,-0.0742195997210637,-0.6932625302516809,0.963516385545068,-0.683989136256496,-1.828101435391904,0.127727237623023,-0.3031440234714888,1.156684679867159,0.8138702224222183,0.7501007141180425]
2,[0.8520023563924629,-0.74428102453283,-0.3087686735290868,0.7550820699425076,-1.1884070250271683,0.2594626318878004,-0.5789710086976391,-0.4396557466653106,0.2754557052112137,-0.3950977929958402,...,-0.9679174543001748,-0.0124008305979463,0.7944702992178544,0.2147520861282463,-0.4608931170258705,-0.2407134760513881,0.6540681256201899,1.0477228156685898,0.5418776481298473,0.4532574043021911]
3,[0.13135490096017363,-0.8527024690877495,-0.3277147787074725,0.7094480944134656,-1.225388474398729,0.2570751190759703,-0.7784168996772194,-0.5117655031068278,0.8066044909496284,-0.557091680900335,...,-0.7360540829119613,-0.1618150391705337,0.6513718346307573,0.1054713497080715,-0.4870473429087532,-0.5749379154936738,0.8275773326331273,0.7831383562317353,0.3331744460311919,0.4599801233564802]
4,[-0.18643798272444848,-1.4383675231123787,-0.4327137724151131,-0.6960175697949025,-0.8957982682654766,-1.691897993858623,-1.4967544962443398,0.2701244439415525,2.301723363677365,-0.8436691699799425,...,-0.3968408996437739,-1.8327583531723195,-0.0013573005007014,0.147503005825377,-0.2018465669491506,-1.1137789443869652,-0.4976740609206737,0.3075755553414648,1.7229589170293227,-0.8132785194235608]
5,[0.8017517267105219,-0.7876907524983324,-0.332988247874923,0.830129793077704,-1.1531769196160255,0.3498837970615013,-0.5916411215336989,-0.5064417526467957,0.2931870839908932,-0.4113058651529327,...,-0.9471900352710836,0.0358734592319908,0.8264904988214079,0.2228689429216546,-0.4840687867477915,-0.2409167885491525,0.62657057032324,1.1048656925778593,0.4838400432639818,0.4945056179621481]
6,[0.8520023563924629,-0.74428102453283,-0.3087686735290868,0.7550820699425076,-1.1884070250271683,0.2594626318878004,-0.5789710086976391,-0.4396557466653106,0.2754557052112137,-0.3950977929958402,...,-0.9679174543001748,-0.0124008305979463,0.7944702992178544,0.2147520861282463,-0.4608931170258705,-0.2407134760513881,0.6540681256201899,1.0477228156685898,0.5418776481298473,0.4532574043021911]
7,[1.5249221499351044,-1.4037420917759558,-1.1786559693686518,-0.445124266985017,-0.4981005783143915,-0.8423023980848011,-0.4965743094008069,0.9325504485647284,1.5277713403101256,0.3663611175518556,...,0.0573146075698068,-1.787841300602305,0.4044431956806383,0.0998980113545788,0.2666563225729433,0.2626319755389393,-1.204812690464013,-0.1830069937048942,2.119338408425937,-1.201881756483654]
8,[1.0858290488108364,-1.2705733838081774,-1.446306172712969,-0.2581839376364062,-1.0839809963039524,-0.9847417019354108,-1.4829461836657758,1.206394662729999,1.386778089035639,0.8785372346622753,...,-0.19981773414601,-2.2441943592574685,0.1573503588486243,0.0200570787809089,-0.080270900645012,0.5338387721314871,-0.6411247097572574,-0.0794679897298436,1.3821247479600585,-1.3505309405548123]
9,[1.627093028977404,-1.324024480073738,-1.796552349819176,0.1300174020788752,-0.941459070768687,-0.5552228530119238,-1.3417394500338924,1.1032342875930838,0.9995642579052516,1.251835273194916,...,-0.0601341036883135,-2.0734854983174844,0.4632465093783834,-0.1052143931207994,0.0358612345821526,0.9616596651355548,-0.8581958482899991,0.0326572849620776,1.2022286501795751,-1.3443378096728382]


In [57]:
len(features_X_expanded.columns)

100

In [58]:
# since we have expanded a string of list, we also have to remove "[" and "]" from first and last column
fixed_first_column = features_X_expanded[0].str.replace("[","")
fixed_last_column = features_X_expanded[99].str.replace("]","")

features_X_expanded[0] = fixed_first_column
features_X_expanded[99] = fixed_last_column

features_X_expanded.head()

  fixed_first_column = features_X_expanded[0].str.replace("[","")
  fixed_last_column = features_X_expanded[99].str.replace("]","")


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-2.8568043961120506,-1.0411711915039288,1.0061674131898883,-1.1113399198600664,-1.655285350981589,-2.6691645604368404,-2.5958565738627417,-0.1800041119591187,3.400636765062071,-2.8970298264139283,...,-1.0407846027233123,-1.3534519248437211,-0.7829219487934165,0.8125388988660006,-0.90483542617853,-3.1732080388866635,1.325712796877335,0.99422387889872,1.094814822799679,0.2922618730065183
1,-0.3721795197846886,-0.6032857196871719,-0.2644388258666698,-0.705210610800887,-0.5024737471499354,1.1758432209879477,0.2028001229748211,-0.1665542381449563,1.1064652689149002,-0.2352754099935789,...,-0.0742195997210637,-0.6932625302516809,0.963516385545068,-0.683989136256496,-1.828101435391904,0.127727237623023,-0.3031440234714888,1.156684679867159,0.8138702224222183,0.7501007141180425
2,0.8520023563924629,-0.74428102453283,-0.3087686735290868,0.7550820699425076,-1.1884070250271683,0.2594626318878004,-0.5789710086976391,-0.4396557466653106,0.2754557052112137,-0.3950977929958402,...,-0.9679174543001748,-0.0124008305979463,0.7944702992178544,0.2147520861282463,-0.4608931170258705,-0.2407134760513881,0.6540681256201899,1.0477228156685898,0.5418776481298473,0.4532574043021911
3,0.1313549009601736,-0.8527024690877495,-0.3277147787074725,0.7094480944134656,-1.225388474398729,0.2570751190759703,-0.7784168996772194,-0.5117655031068278,0.8066044909496284,-0.557091680900335,...,-0.7360540829119613,-0.1618150391705337,0.6513718346307573,0.1054713497080715,-0.4870473429087532,-0.5749379154936738,0.8275773326331273,0.7831383562317353,0.3331744460311919,0.4599801233564802
4,-0.1864379827244484,-1.4383675231123787,-0.4327137724151131,-0.6960175697949025,-0.8957982682654766,-1.691897993858623,-1.4967544962443398,0.2701244439415525,2.301723363677365,-0.8436691699799425,...,-0.3968408996437739,-1.8327583531723195,-0.0013573005007014,0.147503005825377,-0.2018465669491506,-1.1137789443869652,-0.4976740609206737,0.3075755553414648,1.7229589170293227,-0.8132785194235608


In [59]:
# Perform PCA on all data
# first create train and test
features_X_expanded["Labels"] = labels_Y # appending labels to not lose their assigned labels when performing data split
train, test = train_test_split(features_X_expanded, test_size=0.2)

# saving labels
train_labels = train.pop("Labels")
features_X_expanded = train

test_labels = test.pop("Labels")

# Scaling features such that they all have a mean of 0 and a variance of 1
scaler = StandardScaler()
scaled_train = scaler.fit_transform(features_X_expanded)

pca = PCA() # can be replaced with "PCA(n_components=2)" but need to check variance ratio first
pca_train = pca.fit_transform(scaled_train)

pca.explained_variance_ratio_ # observing how much each PCA is responsible for the variance

array([2.67436094e-01, 1.38522735e-01, 1.23398167e-01, 8.78706719e-02,
       7.92398468e-02, 3.86371569e-02, 3.36798332e-02, 2.27995672e-02,
       2.01238782e-02, 1.64326961e-02, 1.34371143e-02, 1.32297856e-02,
       1.18166805e-02, 1.09812838e-02, 9.81884140e-03, 8.91401525e-03,
       7.48935237e-03, 7.38501282e-03, 7.04718604e-03, 5.71110649e-03,
       5.30549423e-03, 4.80035576e-03, 4.39012934e-03, 4.05731983e-03,
       3.70182515e-03, 3.46913935e-03, 3.31339869e-03, 2.92080531e-03,
       2.68417073e-03, 2.54992633e-03, 2.39285727e-03, 2.30611020e-03,
       2.11745014e-03, 1.98971377e-03, 1.74529967e-03, 1.57725165e-03,
       1.51758151e-03, 1.42356224e-03, 1.36693560e-03, 1.22390893e-03,
       1.20126941e-03, 1.07048150e-03, 1.04657333e-03, 9.56930646e-04,
       9.17216574e-04, 8.86532352e-04, 8.09206170e-04, 7.57266610e-04,
       7.41189798e-04, 6.97078236e-04, 6.61805194e-04, 6.13954555e-04,
       5.85811206e-04, 5.72936776e-04, 5.34902017e-04, 4.94177296e-04,
      

In [60]:
# pca_train_allPCs_df = pd.DataFrame(pca_train, columns = [*range(0, pca_train.shape[1])])

In [61]:
# extracting all PCs data to cvs file
# pca_train_allPCs_df.to_csv("C:/Users/dnaen/APG_data/pca_train_allPCs.csv", index=False)

In [62]:
pca = PCA(n_components=2) # to be able to do Euclidean distance calculation set to 2
pca_train_2PCs = pca.fit_transform(scaled_train)
pca_train_2PCs_df = pd.DataFrame(pca_train_2PCs, columns = ['PC1','PC2'])
pca_train_2PCs_df["Labels"] = train_labels.values # can just append it since row order is not affected

In [63]:
# setting up test
scaled_test = scaler.transform(test)
pca_test_2PCs = pca.transform(scaled_test)

pca_test_2PCs_df = pd.DataFrame(pca_test_2PCs, columns = ['PC1','PC2'])
pca_test_2PCs_df["Labels"] = test_labels.values # can just append it since row order is not affected

In [64]:
pca_test_2PCs_df.head()

Unnamed: 0,PC1,PC2,Labels
0,-5.924999,0.348279,-1
1,-2.71192,2.977129,-1
2,1.307778,-4.317033,-1
3,14.969915,-3.218908,13
4,2.783498,-1.978736,-1


In [65]:
# to extract file to csv
# pca_data_2PCs_df.to_csv("C:/Users/dnaen/APG_data/pca_data_2PCs.csv", index=False)

# to extract from csv
# pca_data_2PCs_df = pd.read_csv(".../pca_data_2PCs.csv")

In [131]:
# to use at visit prob. calculation later
pca_train_2PCs_df_sorted = pca_train_2PCs_df.sort_values(by=["Labels"])
avg_pca_train_2PCs_df = get_avg_pca(pca_train_2PCs_df_sorted)
avg_pca_train_2PCs_df.head()

Unnamed: 0,0,1
0,-4.975416,-1.530899
1,-4.424312,-0.232859
2,0.348942,-4.525183
3,-1.971355,-4.359
4,-0.873455,2.556172


In [157]:
# experiment 1 - done for 2 PCs going through all PCA
accurate_estimation_prob = 0
false_estimation_prob = 0

total_accurate_prob = 0
total_false_prob = 0

# row size (number of data points)
row_size = len(pca_test_2PCs_df.axes[0])
experiment_size = 200
random_list = [] # for experiment
for y in range(0, experiment_size):
    random_list.append(random.randint(0, row_size))

# only using "pca_test_2comp_df_sorted" such that labels match
for x in random_list:
    label = pca_test_2PCs_df.at[pca_test_2PCs_df.index[x], "Labels"]
    if label >= 0:
        current_visit = pca_test_2PCs_df.iloc[x]
        current_visit = current_visit.tolist()
        current_visit = current_visit[:-1]

        estimated_prob, estimated_label = calculate_prob_of_visit_with_sorting(current_visit, pca_train_2PCs_df_sorted)

        if estimated_label == label:
            total_accurate_prob += 1
        else:
            total_false_prob += 1

accurate_estimation_prob = total_accurate_prob / experiment_size
false_estimation_prob = total_false_prob / experiment_size

print("Experiment size:")
print(experiment_size)
print("Accurate prediction with prob.")
print(accurate_estimation_prob)
print("False prediction with prob.")
print(false_estimation_prob)

Experiment size:
200
Accurate prediction with prob.
0.255
False prediction with prob.
0.235


In [None]:
"""
Can save results here
"""

In [152]:
# experiment 2 - done for 2 PCs with averaged PCA
accurate_estimation_prob = 0
false_estimation_prob = 0

total_accurate_prob = 0
total_false_prob = 0

# row size (number of data points)
row_size = len(pca_test_2PCs_df.axes[0])
experiment_size = row_size
random_list = [] # for experiment

for x in range(experiment_size):
    label = pca_test_2PCs_df.at[pca_test_2PCs_df.index[x], "Labels"]
    if label >= 0:
        current_visit = pca_test_2PCs_df.iloc[x]
        current_visit = current_visit.tolist()
        current_visit = current_visit[:-1]
        estimated_prob, estimated_label = calculate_prob_of_visit_with_sorted_avg_pca(current_visit, avg_pca_train_2PCs_df) # visit extracted from test data, pca extracted from train data

        if estimated_label == label:
            total_accurate_prob += 1
        else:
            total_false_prob += 1

accurate_estimation_prob = total_accurate_prob / experiment_size
false_estimation_prob = total_false_prob / experiment_size

print("Experiment size:")
print(experiment_size)
print("Accurate prediction with prob.")
print(accurate_estimation_prob)
print("False prediction with prob.")
print(false_estimation_prob)

Experiment size:
19948
Accurate prediction with prob.
0.48280529376378584
False prediction with prob.
0.009274112693001805


In [None]:
"""
- with split -
Experiment size:
1438474
Accurate prediction with prob.
0.1430056135838301
False prediction with prob.
0.05483585046664002
- without split -
Experiment size:
1438474
Accurate prediction with prob.
0.6983720247985018
False prediction with prob.
0.2959629440643348
"""