In [1]:
import random
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# functions
def add_address_of_data(given_address): # could be useful for easily allowing others to use this file
    return "".join([given_address, "/visit_meaning_vectors/visit_meanings.csv"])

# calculating Euclidean distance
def calculate_prob_of_visit_with_sorting(given_visit, given_pca):
    # assuming given visit is a meaning vector, with number of variables same as number of PCs

    given_pca_sorted = given_pca.sort_values(by=["Labels"]) # sorting given PCA list by labels
    prev_label = 0
    current_label = 0
    lowest_dist_to_visit = 0
    current_labels_dist = 0
    total_dist = 0
    lowest_dist_to_visits_label = 0

    for x in range(0, len(given_pca_sorted)):
        current_label = given_pca_sorted.at[given_pca_sorted.index[x], "Labels"] # get current label

        # extracting required variables of the point in PCA data
        temp_point = given_pca_sorted.iloc[x]
        temp_point = temp_point.tolist()
        temp_point = temp_point[:-1]

        if current_label >= 0: # to skip "-1" labels
            if current_label == prev_label: current_labels_dist += np.linalg.norm(np.array(given_visit) - np.array(temp_point)) # performing the euclidean distance calculation
            else: # have arrived to next label in the sorted PCA, so check final values
                if current_labels_dist < lowest_dist_to_visit:
                    lowest_dist_to_visit = current_labels_dist
                    lowest_dist_to_visits_label = current_label
                prev_label = current_label
                total_dist += current_labels_dist
                current_labels_dist = 0 # since we are going to next label, reset distance

    if total_dist == 0: print(len(given_visit))
    return 1 - (lowest_dist_to_visits_label / total_dist), lowest_dist_to_visits_label

# calculating Euclidean distance and assumes that "given_pca" is already sorted
def calculate_prob_of_visit_without_sorting(given_visit, given_pca_sorted):
    # assuming given visit is a meaning vector, with number of variables same as number of PCs

    prev_label = 0
    current_label = 0
    lowest_dist_to_visit = 0
    current_labels_dist = 0
    total_dist = 0
    lowest_dist_to_visits_label = 0

    for x in range(0, len(given_pca_sorted)):
        current_label = given_pca_sorted.at[given_pca_sorted.index[x], "Labels"] # get current label

        # extracting required variables of the point in PCA data
        temp_point = given_pca_sorted.iloc[x]
        temp_point = temp_point.tolist()
        temp_point = temp_point[:-1]

        if current_label >= 0: # to skip "-1" labels
            if current_label == prev_label: current_labels_dist += np.linalg.norm(np.array(given_visit) - np.array(temp_point)) # performing the euclidean distance calculation
            else: # have arrived to next label in the sorted PCA, so check final values
                if current_labels_dist < lowest_dist_to_visit:
                    lowest_dist_to_visit = current_labels_dist
                    lowest_dist_to_visits_label = current_label
                prev_label = current_label
                total_dist += current_labels_dist
                current_labels_dist = 0 # since we are going to next label, reset distance

    if total_dist == 0: print(len(given_visit))
    return 1 - (lowest_dist_to_visits_label / total_dist), lowest_dist_to_visits_label

def get_avg_pca(given_pca):
    given_pca_sorted = given_pca.sort_values(by=["Labels"]) # sorting given PCA list by labels
    list_of_avg_values = []
    prev_label = 0
    current_labels_dist = [0] * (len(given_pca.columns) - 1)
    current_label_count = 0
    for x in range(0, len(given_pca_sorted)):
        current_label = given_pca_sorted.at[given_pca_sorted.index[x], "Labels"] # get current label


        if current_label >= 0: # to skip "-1" labels
            current_label_count += 1

            if current_label == prev_label:
                # extracting required variables of the point in PCA data
                temp_point = given_pca_sorted.iloc[x]
                temp_point = temp_point.tolist()
                temp_point = temp_point[:-1]

                current_labels_dist = [current_labels_dist[y] + temp_point[y] for y in range (0, len(current_labels_dist))]
            else: # have arrived to next label in the sorted PCA, so check final values
                prev_label = current_label
                list_of_avg_values.append([current_labels_dist[z] / current_label_count for z in range (0, len(current_labels_dist))])
                # resetting
                current_labels_dist = [0] * (len(given_pca.columns)-1)
                current_label_count = 0

    temp_df = pd.DataFrame(list_of_avg_values, columns=[b for b in range(0, len(given_pca.columns)-1)])
    return temp_df

# calculating Euclidean distance and assumes that "given_pca" is already sorted and averaged
def calculate_prob_of_visit_with_sorted_avg_pca(given_visit, given_avg_pca_sorted):
    # assuming given visit is a meaning vector, with number of variables same as number of PCs

    lowest_dist_to_visit = 0
    current_labels_dist = 0
    total_dist = 0
    lowest_dist_to_visits_label = 0

    for x in range(0, len(given_avg_pca_sorted)): # each index is a label
        # extracting required variables of the point in PCA data
        temp_point = given_avg_pca_sorted.iloc[x]
        temp_point = temp_point.tolist()
        temp_point = temp_point[:-1]

        current_labels_dist += np.linalg.norm(np.array(given_visit) - np.array(temp_point)) # performing the euclidean distance calculation

        if current_labels_dist < lowest_dist_to_visit:
            lowest_dist_to_visit = current_labels_dist
            lowest_dist_to_visits_label = x

        total_dist += current_labels_dist
        current_labels_dist = 0 # since we are going to next label, reset distance

    if total_dist == 0: print(len(given_visit))
    return 1 - (lowest_dist_to_visits_label / total_dist), lowest_dist_to_visits_label

In [3]:
# meaning vectors will be made of several hundreds of variables and each cluster will have a set of data points (each data point containing values of each of these variables (so a meaning vector)

# main data
my_address = "C:/Users/dnaen/APG_data"  # only this has to be modified
df = pd.read_csv(add_address_of_data(my_address))

In [4]:
# will be used to get the labels
combined_address = "".join([my_address, "/cluster_paths.csv"])
df_with_labels = pd.read_csv(combined_address)

df_with_labels.head()

Unnamed: 0,visit_id,cluster_label,path
0,0[1],0,"[188, 1557, 3, 1, 13, 14, 21, 16, 14, 18, 14, ..."
1,5[1],0,"[1557, 3, 1, 13, 1, 1559, 12, 1559, 17, 1556]"
2,9[1],0,"[1557, 3, 86, 3, 86, 3, 92, 3, 7, 19, 14, 18, ..."
3,11[1],0,"[188, 228, 1557, 3, 1, 12, 7, 20, 1, 7, 1, 12,..."
4,12[4],18,"[1557, 1, 17, 12, 17, 13]"


In [5]:
# dividing the dataset into a feature set and corresponding labels
features_X = df.pop("meaning_vectors")
labels_Y = df_with_labels.pop("cluster_label")


In [6]:
features_X = features_X.to_frame(name="meaning_vectors") # converting from series to df

In [7]:
# expanding features df, because currently there is only one column where each row contains a list of meaning values, to make it work for PCA method defined in sklearn library, it needs to be a df where each cell is only one number
features_X_expanded = features_X.meaning_vectors.str.split(",",expand=True,)
features_X_expanded.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,[-1.0013970414243027,0.1041066298691812,-0.7069594929319728,-0.3670925902659877,-2.1300366871439413,0.5969171592180543,-0.0327198851583636,0.2224920680443176,2.027881099002104,0.827870212403462,...,0.6548517055350448,-0.3846879959843642,1.1044825047930154,0.3155223956828208,-2.8847950848931903,-0.7553962498890037,-1.4720945990902772,2.653286573843067,0.1943617721266645,0.1434588478106824]
1,[-0.22001658113151915,-2.2507345794274003,-0.3933474576639214,-0.2204712098870463,-0.7654681601639308,-1.142924618070034,-1.86187248369162,-2.3556492318840334,0.4819600693957551,0.1885965864764373,...,-2.935719571288854,-1.4991282068852751,1.0062288988970691,-0.8182551957518548,-0.0362307701217957,-1.7267140207545708,0.4275333946298442,-0.2442145154380059,1.433941992453667,-1.8641315484083514]
2,[1.597802667104453,-1.3318458039597452,-1.7876896274176677,0.1531999657748357,-0.9184955153229892,-0.5461559025055277,-1.3570081626634485,1.0627681533667157,0.9851826480709566,1.2266712267690123,...,-0.0887488194966829,-2.042117278138647,0.441273266539929,-0.1112707156981247,0.013890977112452,0.95533802751127,-0.8402656332947769,0.0617006935175403,1.174434637245242,-1.3317815829510649]
3,[-0.08318374597228116,-0.8729878666888199,0.0410371538527755,0.4616454791874244,-1.2901850509770023,-0.3269047963256664,-1.1119495148239311,-0.4324641976679138,1.0586961520473588,-1.0272654039747169,...,-1.0450595649908427,-0.2168291385770548,0.5135529541165453,0.2495246288044326,-0.4754690515821106,-0.8941292800735631,0.7727549164157642,1.0482704489062364,0.7197991172110915,0.45537440922683853]
4,[-2.879218311253632,-1.2799126645257008,1.4575141945832106,-1.166744513134016,-1.3042470403141233,-2.959889254844384,-3.275919495479953,-0.8904921384608795,3.356320868826463,-2.58905896196911,...,-1.66761397197074,-1.561016979377544,-0.6728326083361991,0.4044155336332753,-0.7633771138154553,-3.723181198331464,1.7113150440592182,0.6782706184706131,1.3848466663654022,-0.3254715591507867]
5,[-0.8369180614741538,-0.9360479342720076,-1.4600495143301906,0.6728528936018906,-2.0424073705869854,0.8780426693652943,0.6503694747009278,-0.7550821150290333,0.987214469378856,-1.8657573549489024,...,-0.7416060416975647,-0.0395262391210806,0.7452824610817166,1.3450202088510912,-1.0320636113872823,-0.3669002450228776,-0.3993312144790031,1.434885437273656,0.2288742855726015,1.6715811412887076]
6,[0.41522979898676204,-1.159737958953214,-0.249581765241873,-0.1362331756666125,-0.0028182256505899,-0.113359035790069,-1.5764256339857223,-2.286969566939029,0.7884043577574544,-0.8258271395134206,...,0.9086259165979,-0.9989015212295888,-0.6314352109646598,-2.632272249209213,-1.091078694284572,-0.0611419039837773,0.013034119396266,0.2367848419878538,1.3449782613104728,-0.538909557924523]
7,[-2.9608902166864195,-1.0956738286472023,1.5058700002833445,-1.4171404701882868,-1.330844679769079,-3.228021664124193,-2.914351798926398,-0.1537190887094223,3.7869134416421,-3.0848926771574683,...,-1.0075266551915474,-1.431580054894582,-0.9603913965546478,0.6739031904247236,-0.7635178779790973,-3.653489183589961,1.4316337850813414,0.8270008493919788,1.434033033769334,0.030364470567248808]
8,[-0.21947748986132684,-2.2178541207713884,-0.4156390531330097,-0.2294401256894283,-0.7795311621539934,-1.1547154513449471,-1.80518214707916,-2.2811334928079656,0.5249316969983312,0.2087900403505005,...,-2.856059562474998,-1.5353509528367395,0.9520496397599718,-0.7985087925155984,-0.0435670477377368,-1.7113681682832265,0.4324085498406598,-0.2619146641405508,1.4221927650774926,-1.854721102828428]
9,[1.636401249004699,-1.3215432693000375,-1.799365303324645,0.122686487993441,-0.9487439002267276,-0.5580854809242989,-1.336901155508226,1.116053957525558,1.0041238951464682,1.2598132746366242,...,-0.0510357690559979,-2.083416129378853,0.4702529315839726,-0.1032964432999453,0.0428464453708502,0.9636512408517388,-0.8638822929503234,0.0234637973451601,1.211031653527771,-1.348301502039101]


In [8]:
len(features_X_expanded.columns)

100

In [9]:
# since we have expanded a string of list, we also have to remove "[" and "]" from first and last column
fixed_first_column = features_X_expanded[0].str.replace("[","")
fixed_last_column = features_X_expanded[99].str.replace("]","")

features_X_expanded[0] = fixed_first_column
features_X_expanded[99] = fixed_last_column

features_X_expanded.head()

  fixed_first_column = features_X_expanded[0].str.replace("[","")
  fixed_last_column = features_X_expanded[99].str.replace("]","")


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-1.0013970414243027,0.1041066298691812,-0.7069594929319728,-0.3670925902659877,-2.1300366871439413,0.5969171592180543,-0.0327198851583636,0.2224920680443176,2.027881099002104,0.827870212403462,...,0.6548517055350448,-0.3846879959843642,1.1044825047930154,0.3155223956828208,-2.8847950848931903,-0.7553962498890037,-1.4720945990902772,2.653286573843067,0.1943617721266645,0.1434588478106824
1,-0.2200165811315191,-2.2507345794274003,-0.3933474576639214,-0.2204712098870463,-0.7654681601639308,-1.142924618070034,-1.86187248369162,-2.3556492318840334,0.4819600693957551,0.1885965864764373,...,-2.935719571288854,-1.4991282068852751,1.0062288988970691,-0.8182551957518548,-0.0362307701217957,-1.7267140207545708,0.4275333946298442,-0.2442145154380059,1.433941992453667,-1.8641315484083516
2,1.597802667104453,-1.3318458039597452,-1.7876896274176677,0.1531999657748357,-0.9184955153229892,-0.5461559025055277,-1.3570081626634485,1.0627681533667157,0.9851826480709566,1.2266712267690123,...,-0.0887488194966829,-2.042117278138647,0.441273266539929,-0.1112707156981247,0.013890977112452,0.95533802751127,-0.8402656332947769,0.0617006935175403,1.174434637245242,-1.3317815829510649
3,-0.0831837459722811,-0.8729878666888199,0.0410371538527755,0.4616454791874244,-1.2901850509770023,-0.3269047963256664,-1.1119495148239311,-0.4324641976679138,1.0586961520473588,-1.0272654039747169,...,-1.0450595649908427,-0.2168291385770548,0.5135529541165453,0.2495246288044326,-0.4754690515821106,-0.8941292800735631,0.7727549164157642,1.0482704489062364,0.7197991172110915,0.4553744092268385
4,-2.879218311253632,-1.2799126645257008,1.4575141945832106,-1.166744513134016,-1.3042470403141233,-2.959889254844384,-3.275919495479953,-0.8904921384608795,3.356320868826463,-2.58905896196911,...,-1.66761397197074,-1.561016979377544,-0.6728326083361991,0.4044155336332753,-0.7633771138154553,-3.723181198331464,1.7113150440592182,0.6782706184706131,1.3848466663654022,-0.3254715591507867


In [10]:
# Perform PCA on all data
# first create train and test
features_X_expanded["Labels"] = labels_Y # appending labels to not lose their assigned labels when performing data split
train, test = train_test_split(features_X_expanded, test_size=0.2)

# saving labels
train_labels = train.pop("Labels")
features_X_expanded = train

test_labels = test.pop("Labels")

# Scaling features such that they all have a mean of 0 and a variance of 1
scaler = StandardScaler()
scaled_train = scaler.fit_transform(features_X_expanded)

pca = PCA() # can be replaced with "PCA(n_components=2)" but need to check variance ratio first
pca_train = pca.fit_transform(scaled_train)

pca.explained_variance_ratio_ # observing how much each PCA is responsible for the variance

array([2.98996470e-01, 2.11891189e-01, 1.12035474e-01, 7.02827623e-02,
       4.42911795e-02, 3.53171876e-02, 2.66142582e-02, 2.60331257e-02,
       1.82677849e-02, 1.51562176e-02, 1.37380363e-02, 1.08030752e-02,
       9.58648404e-03, 9.04959142e-03, 8.06761604e-03, 7.52823087e-03,
       7.04622432e-03, 6.38222820e-03, 5.27454212e-03, 4.53812788e-03,
       4.16271650e-03, 3.87301135e-03, 3.40864161e-03, 3.13457123e-03,
       2.77584737e-03, 2.48966454e-03, 2.32236824e-03, 2.27391186e-03,
       2.09303313e-03, 1.90118136e-03, 1.80413060e-03, 1.72815189e-03,
       1.58408752e-03, 1.54583189e-03, 1.31452776e-03, 1.24106202e-03,
       1.22713516e-03, 1.15302936e-03, 1.11745119e-03, 9.68696407e-04,
       9.29676107e-04, 8.91753351e-04, 8.48408037e-04, 8.29184205e-04,
       7.37714830e-04, 6.65066222e-04, 6.41614633e-04, 6.16185782e-04,
       5.95594978e-04, 5.52744107e-04, 5.10939463e-04, 5.00563107e-04,
       4.90475347e-04, 4.47907681e-04, 4.29382134e-04, 3.93093167e-04,
      

In [11]:
# pca_train_allPCs_df = pd.DataFrame(pca_train, columns = [*range(0, pca_train.shape[1])])

In [12]:
# extracting all PCs data to cvs file
# pca_train_allPCs_df.to_csv("C:/Users/dnaen/APG_data/pca_train_allPCs.csv", index=False)

In [13]:
pca = PCA(n_components=2) # to be able to do Euclidean distance calculation set to 2
pca_train_2PCs = pca.fit_transform(scaled_train)
pca_train_2PCs_df = pd.DataFrame(pca_train_2PCs, columns = ['PC1','PC2'])
pca_train_2PCs_df["Labels"] = train_labels # can just append it since row order is not affected

In [14]:
# setting up test
scaled_test = scaler.transform(test)
pca_test_2PCs = pca.transform(scaled_test)

pca_test_2PCs_df = pd.DataFrame(pca_test_2PCs, columns = ['PC1','PC2'])
pca_test_2PCs_df["Labels"] = test_labels # can just append it since row order is not affected

In [None]:
# to extract file to csv
# pca_data_2PCs_df.to_csv("C:/Users/dnaen/APG_data/pca_data_2PCs.csv", index=False)

# to extract from csv
# pca_data_2PCs_df = pd.read_csv(".../pca_data_2PCs.csv")

In [15]:
# to use at visit prob. calculation later
pca_train_2PCs_df_sorted = pca_train_2PCs_df.sort_values(by=["Labels"])
avg_pca_train_2PCs_df = get_avg_pca(pca_train_2PCs_df_sorted)
avg_pca_train_2PCs_df.head()

Unnamed: 0,0,1
0,-0.006485,-0.000891
1,-0.079668,0.0511
2,-0.04495,0.07889
3,0.287227,-0.223247
4,0.207624,-0.131277


In [None]:
# experiment 1 - done for 2 PCs going through all PCA
accurate_estimation_prob = 0
false_estimation_prob = 0

total_accurate_prob = 0
total_false_prob = 0

# row size (number of data points)
row_size = len(pca_test_2PCs_df.axes[0])
experiment_size = 2
random_list = [] # for experiment
for y in range(0, experiment_size):
    random_list.append(random.randint(0, row_size))

# only using "pca_test_2comp_df_sorted" such that labels match
for x in random_list:
    label = pca_test_2PCs_df.at[pca_test_2PCs_df.index[x], "Labels"]
    if label >= 0:
        current_visit = pca_test_2PCs_df.iloc[x]
        current_visit = current_visit.tolist()
        current_visit = current_visit[:-1]

        estimated_prob, estimated_label = calculate_prob_of_visit_without_sorting(current_visit, pca_train_2PCs_df_sorted)

        if estimated_label == label:
            total_accurate_prob += estimated_prob
        else:
            total_false_prob += estimated_prob

accurate_estimation_prob = total_accurate_prob / experiment_size
false_estimation_prob = total_false_prob / experiment_size

print("Experiment size:")
print(experiment_size)
print("Accurate prediction with prob.")
print(accurate_estimation_prob)
print("False prediction with prob.")
print(false_estimation_prob)

In [None]:
"""
Can save results here
"""

In [16]:
# experiment 2 - done for 2 PCs with averaged PCA
accurate_estimation_prob = 0
false_estimation_prob = 0

total_accurate_prob = 0
total_false_prob = 0

# row size (number of data points)
row_size = len(pca_test_2PCs_df.axes[0])
experiment_size = row_size
random_list = [] # for experiment

for x in range(0, experiment_size):
    label = pca_test_2PCs_df.at[pca_test_2PCs_df.index[x], "Labels"]
    if label >= 0:
        current_visit = pca_test_2PCs_df.iloc[x]
        current_visit = current_visit.tolist()
        current_visit = current_visit[:-1]
        estimated_prob, estimated_label = calculate_prob_of_visit_with_sorted_avg_pca(current_visit, avg_pca_train_2PCs_df) # visit extracted from test data, pca extracted from train data

        if estimated_label == label:
            total_accurate_prob += estimated_prob
        else:
            total_false_prob += estimated_prob

accurate_estimation_prob = total_accurate_prob / experiment_size
false_estimation_prob = total_false_prob / experiment_size

print("Experiment size:")
print(experiment_size)
print("Accurate prediction with prob.")
print(accurate_estimation_prob)
print("False prediction with prob.")
print(false_estimation_prob)

Experiment size:
100
Accurate prediction with prob.
411.42
False prediction with prob.
157.76


In [None]:
"""
- with split -
Experiment size:
1438474
Accurate prediction with prob.
0.1430056135838301
False prediction with prob.
0.05483585046664002
- without split -
Experiment size:
1438474
Accurate prediction with prob.
0.6983720247985018
False prediction with prob.
0.2959629440643348
"""