In [1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from ast import literal_eval

In [None]:
# meanings will give us several hundreds of variables and each cluster will have a set of data points (each data point containing values of each of these variables)

# Creating 20 data points and 5 variables for each data point (between 10 and 50)
#example_data = np.random.randint(10,50,100).reshape(20,5)
#example_data[0:10:]

# trying on a real data
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'Class']
dataset = pd.read_csv(url, names=names)

dataset.head()

# dividing the dataset into a feature set and corresponding labels
features_X = dataset.drop('Class', 1)
labels_Y = dataset['Class']


In [2]:
def add_address_of_data(given_address): # could be useful for easily allowing others to use this file
    return "".join([given_address, "/visit_meaning_vectors/visit_meanings.csv"])

# main data
my_address = "C:/Users/dnaen/APG_data"  # only this has to be modified
df = pd.read_csv(add_address_of_data(my_address))


In [3]:
# will be used to get the labels
combined_address = "".join([my_address, "/cluster_paths.csv"])
df_with_labels = pd.read_csv(combined_address)

df_with_labels.head()

Unnamed: 0,visit_id,cluster_label,path
0,0[1],0,"[188, 1557, 3, 1, 13, 14, 21, 16, 14, 18, 14, ..."
1,5[1],0,"[1557, 3, 1, 13, 1, 1559, 12, 1559, 17, 1556]"
2,9[1],0,"[1557, 3, 86, 3, 86, 3, 92, 3, 7, 19, 14, 18, ..."
3,11[1],0,"[188, 228, 1557, 3, 1, 12, 7, 20, 1, 7, 1, 12,..."
4,12[4],18,"[1557, 1, 17, 12, 17, 13]"


In [4]:
# dividing the dataset into a feature set and corresponding labels
features_X = df.pop("meaning_vectors")
labels_Y = df_with_labels.pop("cluster_label")


In [15]:
labels_Y.head()

0     0
1     0
2     0
3     0
4    18
Name: cluster_label, dtype: int64

In [5]:
features_X = features_X.to_frame(name="meaning_vectors")

In [6]:
features_X_expanded = features_X.meaning_vectors.str.split(",",expand=True,)
features_X_expanded.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,[-1.0013970414243027,0.1041066298691812,-0.7069594929319728,-0.3670925902659877,-2.1300366871439413,0.5969171592180543,-0.0327198851583636,0.2224920680443176,2.027881099002104,0.827870212403462,...,0.6548517055350448,-0.3846879959843642,1.1044825047930154,0.3155223956828208,-2.8847950848931903,-0.7553962498890037,-1.4720945990902772,2.653286573843067,0.1943617721266645,0.1434588478106824]
1,[-0.22001658113151915,-2.2507345794274003,-0.3933474576639214,-0.2204712098870463,-0.7654681601639308,-1.142924618070034,-1.86187248369162,-2.3556492318840334,0.4819600693957551,0.1885965864764373,...,-2.935719571288854,-1.4991282068852751,1.0062288988970691,-0.8182551957518548,-0.0362307701217957,-1.7267140207545708,0.4275333946298442,-0.2442145154380059,1.433941992453667,-1.8641315484083514]
2,[1.597802667104453,-1.3318458039597452,-1.7876896274176677,0.1531999657748357,-0.9184955153229892,-0.5461559025055277,-1.3570081626634485,1.0627681533667157,0.9851826480709566,1.2266712267690123,...,-0.0887488194966829,-2.042117278138647,0.441273266539929,-0.1112707156981247,0.013890977112452,0.95533802751127,-0.8402656332947769,0.0617006935175403,1.174434637245242,-1.3317815829510649]
3,[-0.08318374597228116,-0.8729878666888199,0.0410371538527755,0.4616454791874244,-1.2901850509770023,-0.3269047963256664,-1.1119495148239311,-0.4324641976679138,1.0586961520473588,-1.0272654039747169,...,-1.0450595649908427,-0.2168291385770548,0.5135529541165453,0.2495246288044326,-0.4754690515821106,-0.8941292800735631,0.7727549164157642,1.0482704489062364,0.7197991172110915,0.45537440922683853]
4,[-2.879218311253632,-1.2799126645257008,1.4575141945832106,-1.166744513134016,-1.3042470403141233,-2.959889254844384,-3.275919495479953,-0.8904921384608795,3.356320868826463,-2.58905896196911,...,-1.66761397197074,-1.561016979377544,-0.6728326083361991,0.4044155336332753,-0.7633771138154553,-3.723181198331464,1.7113150440592182,0.6782706184706131,1.3848466663654022,-0.3254715591507867]
5,[-0.8369180614741538,-0.9360479342720076,-1.4600495143301906,0.6728528936018906,-2.0424073705869854,0.8780426693652943,0.6503694747009278,-0.7550821150290333,0.987214469378856,-1.8657573549489024,...,-0.7416060416975647,-0.0395262391210806,0.7452824610817166,1.3450202088510912,-1.0320636113872823,-0.3669002450228776,-0.3993312144790031,1.434885437273656,0.2288742855726015,1.6715811412887076]
6,[0.41522979898676204,-1.159737958953214,-0.249581765241873,-0.1362331756666125,-0.0028182256505899,-0.113359035790069,-1.5764256339857223,-2.286969566939029,0.7884043577574544,-0.8258271395134206,...,0.9086259165979,-0.9989015212295888,-0.6314352109646598,-2.632272249209213,-1.091078694284572,-0.0611419039837773,0.013034119396266,0.2367848419878538,1.3449782613104728,-0.538909557924523]
7,[-2.9608902166864195,-1.0956738286472023,1.5058700002833445,-1.4171404701882868,-1.330844679769079,-3.228021664124193,-2.914351798926398,-0.1537190887094223,3.7869134416421,-3.0848926771574683,...,-1.0075266551915474,-1.431580054894582,-0.9603913965546478,0.6739031904247236,-0.7635178779790973,-3.653489183589961,1.4316337850813414,0.8270008493919788,1.434033033769334,0.030364470567248808]
8,[-0.21947748986132684,-2.2178541207713884,-0.4156390531330097,-0.2294401256894283,-0.7795311621539934,-1.1547154513449471,-1.80518214707916,-2.2811334928079656,0.5249316969983312,0.2087900403505005,...,-2.856059562474998,-1.5353509528367395,0.9520496397599718,-0.7985087925155984,-0.0435670477377368,-1.7113681682832265,0.4324085498406598,-0.2619146641405508,1.4221927650774926,-1.854721102828428]
9,[1.636401249004699,-1.3215432693000375,-1.799365303324645,0.122686487993441,-0.9487439002267276,-0.5580854809242989,-1.336901155508226,1.116053957525558,1.0041238951464682,1.2598132746366242,...,-0.0510357690559979,-2.083416129378853,0.4702529315839726,-0.1032964432999453,0.0428464453708502,0.9636512408517388,-0.8638822929503234,0.0234637973451601,1.211031653527771,-1.348301502039101]


In [7]:
fixed_first_column = features_X_expanded[0].str.replace("[","")
fixed_last_column = features_X_expanded[99].str.replace("]","")

features_X_expanded[0] = fixed_first_column
features_X_expanded[99] = fixed_last_column

features_X_expanded.head()

  fixed_first_column = features_X_expanded[0].str.replace("[","")
  fixed_last_column = features_X_expanded[99].str.replace("]","")


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,99.1
0,-1.0013970414243027,0.1041066298691812,-0.7069594929319728,-0.3670925902659877,-2.1300366871439413,0.5969171592180543,-0.0327198851583636,0.2224920680443176,2.027881099002104,0.827870212403462,...,-0.3846879959843642,1.1044825047930154,0.3155223956828208,-2.8847950848931903,-0.7553962498890037,-1.4720945990902772,2.653286573843067,0.1943617721266645,0.1434588478106824],0.1434588478106824
1,-0.2200165811315191,-2.2507345794274003,-0.3933474576639214,-0.2204712098870463,-0.7654681601639308,-1.142924618070034,-1.86187248369162,-2.3556492318840334,0.4819600693957551,0.1885965864764373,...,-1.4991282068852751,1.0062288988970691,-0.8182551957518548,-0.0362307701217957,-1.7267140207545708,0.4275333946298442,-0.2442145154380059,1.433941992453667,-1.8641315484083514],-1.8641315484083516
2,1.597802667104453,-1.3318458039597452,-1.7876896274176677,0.1531999657748357,-0.9184955153229892,-0.5461559025055277,-1.3570081626634485,1.0627681533667157,0.9851826480709566,1.2266712267690123,...,-2.042117278138647,0.441273266539929,-0.1112707156981247,0.013890977112452,0.95533802751127,-0.8402656332947769,0.0617006935175403,1.174434637245242,-1.3317815829510649],-1.3317815829510649
3,-0.0831837459722811,-0.8729878666888199,0.0410371538527755,0.4616454791874244,-1.2901850509770023,-0.3269047963256664,-1.1119495148239311,-0.4324641976679138,1.0586961520473588,-1.0272654039747169,...,-0.2168291385770548,0.5135529541165453,0.2495246288044326,-0.4754690515821106,-0.8941292800735631,0.7727549164157642,1.0482704489062364,0.7197991172110915,0.45537440922683853],0.4553744092268385
4,-2.879218311253632,-1.2799126645257008,1.4575141945832106,-1.166744513134016,-1.3042470403141233,-2.959889254844384,-3.275919495479953,-0.8904921384608795,3.356320868826463,-2.58905896196911,...,-1.561016979377544,-0.6728326083361991,0.4044155336332753,-0.7633771138154553,-3.723181198331464,1.7113150440592182,0.6782706184706131,1.3848466663654022,-0.3254715591507867],-0.3254715591507867


In [10]:
    # Perform PCA on all data

# Scaling features such that they all have a mean of 0 and a variance of 1
scaler = StandardScaler()
scaled_data = scaler.fit_transform(features_X_expanded)

pca = PCA() # can be replaced with "PCA(n_components=2)" but need to check variance ratio first
pca_data = pca.fit_transform(scaled_data)

pca.explained_variance_ratio_ # observing how much each PCA is responsible for the variance

array([2.99052111e-01, 2.11635274e-01, 1.12139899e-01, 7.03316791e-02,
       4.43235614e-02, 3.53286958e-02, 2.65834053e-02, 2.60338620e-02,
       1.82591112e-02, 1.51380968e-02, 1.37404036e-02, 1.07784921e-02,
       9.58772469e-03, 9.05525877e-03, 8.07615460e-03, 7.52654514e-03,
       7.07935086e-03, 6.38050206e-03, 5.27969026e-03, 4.54148326e-03,
       4.16694148e-03, 3.86960991e-03, 3.40682486e-03, 3.13813599e-03,
       2.77692621e-03, 2.48912408e-03, 2.32455065e-03, 2.27586505e-03,
       2.09114190e-03, 1.90919109e-03, 1.80432504e-03, 1.72603346e-03,
       1.58407774e-03, 1.54590261e-03, 1.31708167e-03, 1.23876183e-03,
       1.23052628e-03, 1.15343776e-03, 1.11717690e-03, 9.73371330e-04,
       9.32470025e-04, 8.92737715e-04, 8.46773012e-04, 8.30122007e-04,
       7.40074730e-04, 6.65547573e-04, 6.40088476e-04, 6.15663814e-04,
       5.95952510e-04, 5.52631049e-04, 5.11234427e-04, 5.02037727e-04,
       4.91942279e-04, 4.46952075e-04, 4.28733407e-04, 3.93364657e-04,
      

In [48]:
pca = PCA(n_components=2)
pca_data_2comp = pca.fit_transform(scaled_data)
pca_data_2comp_df = pd.DataFrame(pca_data_2comp, columns = ['PC1','PC2'])
pca_data_2comp_df["Labels"] = labels_Y

In [50]:
pca_data_2comp_df_sorted = pca_data_2comp_df.sort_values(by=["Labels"])
pca_data_2comp_df_sorted.head(10)

Unnamed: 0,PC1,PC2,Labels
297926,-2.671005,-0.582621,-1
1392337,-1.088543,-1.647219,-1
1341028,-6.194049,-2.545746,-1
180382,-2.528904,-2.998015,-1
607497,-2.604695,-2.612323,-1
992916,-3.428668,-2.844442,-1
1154659,-4.379559,-1.489158,-1
398164,-2.210939,-2.109961,-1
1102740,-4.489199,1.498144,-1
580377,-2.288275,-2.246409,-1


In [72]:
# calculating Euclidean distance
def calculate_prob_of_visit(given_visit, given_pca_sorted):
    # assuming given visit is a meaning vector, with number of variables same as number of PCs

    # given_pca_sorted = given_pca.sort_values(by=["Labels"]) # sorting given PCA list by labels
    prev_label = 0
    current_label = 0
    lowest_dist_to_visit = 0
    current_labels_dist = 0
    total_dist = 0
    lowest_dist_to_visits_label = 0

    for x in range(0, len(given_pca_sorted)):
        current_label = given_pca_sorted.at[given_pca_sorted.index[x], "Labels"] # get current label

        # extracting required variables of the point in PCA data
        temp_point = given_pca_sorted.iloc[x]
        temp_point = temp_point.tolist()
        temp_point = temp_point[:-1]

        if current_label >= 0: # to skip "-1" labels
            if current_label == prev_label: current_labels_dist += np.linalg.norm(np.array(given_visit) - np.array(temp_point))
            else: # have arrived to next label in the sorted PCA
                if current_labels_dist < lowest_dist_to_visit:
                    lowest_dist_to_visit = current_labels_dist
                    lowest_dist_to_visits_label = current_label
                prev_label = current_label
                total_dist += current_labels_dist
                current_labels_dist = 0 # since we are going to next label, reset distance

    if total_dist == 0: print(len(given_visit))
    return 1 - (lowest_dist_to_visits_label / total_dist), lowest_dist_to_visits_label

In [74]:
# experiment
accurate_estimation_prob = 0
false_estimation_prob = 0

total_accurate_prob = 0
total_false_prob = 0

# row size
row_size = len(pca_data_2comp_df_sorted.axes[0])

for x in range(0, len(pca_data_2comp_df_sorted.axes[0])):
    label = pca_data_2comp_df_sorted.at[pca_data_2comp_df_sorted.index[x], "Labels"]
    if label >= 0:
        current_visit = pca_data_2comp_df_sorted.iloc[x]
        current_visit = current_visit.tolist()
        current_visit = current_visit[:-1]

        estimated_prob, estimated_label = calculate_prob_of_visit(current_visit, pca_data_2comp_df_sorted)

        if estimated_label == label:
            total_accurate_prob += estimated_prob
        else:
            total_false_prob += estimated_prob

accurate_estimation_prob = total_accurate_prob / row_size
false_estimation_prob = total_false_prob / row_size

print("Accurate prediction with prob.")
print(accurate_estimation_prob)
print("False prediction with prob.")
print(false_estimation_prob)

KeyboardInterrupt: 

In [None]:
"""
Keeping it just in case
"""
# ----------------------------------------
classifier = DecisionTreeClassifier()
classifier.fit(pca_data, labels_Y) # train classifier

# assuming we got new data
newdata = []

# just transforming to pca, no re-fit again needed
scaled_new_data = scaler.transform(newdata)
pca_new_data = pca.transform(scaled_new_data)

pred_labels = classifier.predict_proba(pca_new_data)

# ---------------------------------------------

# Splitting the dataset into the training set and test set such that it can be used for classification
X_train, X_test, y_train, y_test = train_test_split(features_X, labels_Y, test_size=0.2, random_state=0)

# Scaling features such that they all have a mean of 0 and a variance of 1
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

pca = PCA() # can be replaced with "PCA(n_components=2)" if data is too much
pca_X_train = pca.fit_transform(scaled_X_train)
pca_X_test = pca.transform(scaled_X_test)

pca.explained_variance_ratio_ # observing how much each PCA is responsible for the variance

# Training, Making Predictions and Performance Evaluation
classifier = RandomForestClassifier(max_depth=2, random_state=0)
classifier.fit(pca_X_train, y_train)

# Predicting the test set results and making performance evaluation
y_pred = classifier.predict(pca_X_test)

cm = confusion_matrix(y_test, y_pred)
print("Accuracy:")
print(accuracy_score(y_test, y_pred))
