In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import random as rn
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


In [None]:
# Project folder that contains ToyDataset.csv. Defaults to the current working
# directory so the notebook runs top to bottom; replace it with the directory
# of your project folder if the notebook lives elsewhere.
path = "."
# Sub-folder that intermediate CSVs are read from and figures are written to.
PATH = path + "/Results"

# exist_ok makes the cell safe to re-run; makedirs also creates missing parents.
os.makedirs(PATH, exist_ok=True)

In [None]:
# Raw records (project folder) and their one-hot encoded counterpart (Results folder);
# the unnamed index column written by a previous to_csv is discarded.
df = pd.read_csv(path+"//ToyDataset.csv").drop(columns=['Unnamed: 0'])
df1 = pd.read_csv(PATH+"//Data one hot encoded.csv").drop(columns=['Unnamed: 0'])
# Client ids in order of first appearance in the file (captured before sorting).
clientID = df["ClientID"].unique()

# Parse both date columns, then order every frame chronologically by entry and exit.
df = (
    df.assign(
        EntryDate=pd.to_datetime(df['EntryDate']),
        ExitDate=pd.to_datetime(df['ExitDate']),
    )
    .sort_values(["EntryDate",'ExitDate'])
    .reset_index(drop=True)
)
df1 = (
    df1.assign(
        EntryDate=pd.to_datetime(df1['EntryDate']),
        ExitDate=pd.to_datetime(df1['ExitDate']),
    )
    .sort_values(["EntryDate",'ExitDate'])
    .reset_index(drop=True)
)

In [None]:
# Normalized TF-IDF table, addressed as tf_idf_norm.loc[step].loc[row label].
tf_idf_norm = (
    pd.read_csv(PATH+"//Normalized TF_IDF for project type and target for each step of effective path.csv")
    .set_index(["Step",'index'])
)



In [None]:
# Grouping of the numeric living-situation / exit-destination codes into outcome
# categories. The five lists are disjoint and together cover codes 1-31 and 99.
# NOTE(review): the meaning of each individual code is not visible in this
# notebook -- confirm against the dataset's data dictionary.
goal = [10,11]  # analysed below as the "ultimate goal" outcome
closer = [3,19,20,21,22,23,25,26,28,31]  # combined with `trans` below ("closer or transitional")
trans = [1,2,4,5,6,12,13,14,15,18,27,29]
no_progress = [7,16]  # combined with `hard_to_judge` below ("unsuccessful")
hard_to_judge = [8,9,17,24,30,99]

In [None]:
#functions:

# separating out data
def data_separation(df,df1,target):
    """Keep the records of every client whose last ``Destination`` is in ``target``.

    Clients are taken from the module-level ``clientID`` array. For each one,
    the value of ``Destination`` in the last row of ``df`` (in ``df``'s current
    row order) decides whether the client is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Records with at least ``ClientID`` and ``Destination`` columns.
    df1 : pandas.DataFrame
        Companion frame with a ``ClientID`` column; filtered with the same clients.
    target : container
        Destination codes that qualify a client.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The selected rows of ``df`` and ``df1`` with a fresh 0..n-1 index, or
        two empty frames when no client qualifies.
    """
    selected, selected1 = [], []
    for client in clientID:
        temp = df[df["ClientID"]==client]
        # A client without any row in df has no last destination to test.
        if temp.empty:
            continue
        if temp['Destination'].iloc[-1] in target:
            selected.append(temp)
            selected1.append(df1[df1["ClientID"]==client])
    # pd.concat rejects an empty list, and callers test `.empty` on the result.
    if not selected:
        return pd.DataFrame(),pd.DataFrame()
    # One concat at the end replaces the removed (and quadratic) DataFrame.append.
    return pd.concat(selected,ignore_index=True),pd.concat(selected1,ignore_index=True)

def data_preparation(df,df1):
    """Parse the date columns of both frames and sort them chronologically.

    Parameters
    ----------
    df, df1 : pandas.DataFrame
        Frames with ``EntryDate`` and ``ExitDate`` columns.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        New frames (the inputs are not modified) whose date columns are
        datetimes, sorted by ``EntryDate`` then ``ExitDate`` with a fresh
        0..n-1 index.
    """
    def _prepare(frame):
        # assign() works on a copy, so the caller's frame is left untouched.
        return (
            frame.assign(
                EntryDate=pd.to_datetime(frame['EntryDate']),
                ExitDate=pd.to_datetime(frame['ExitDate']),
            )
            .sort_values(["EntryDate",'ExitDate'])
            .reset_index(drop=True)
        )

    # Both frames get identical treatment; previously the sorted df1 was
    # assigned to a misspelled name and the unsorted frame was returned.
    return _prepare(df),_prepare(df1)


# recording effective pathway
def effective_pathway(df,df1):
    """Build one row per client describing the distinct project types they used.

    Parameters
    ----------
    df : pandas.DataFrame
        Unused; kept so existing calls keep working.
    df1 : pandas.DataFrame
        Frame with ``ClientID`` and ``ProjectType`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``ClientID``, ``path`` (list of the client's unique
        ``ProjectType`` values in order of first appearance in ``df1``) and
        ``length`` (number of entries in ``path``). Clients appear in order of
        first appearance in ``df1``.
    """
    records = []
    for client in df1["ClientID"].unique():
        project_types = df1.loc[df1['ClientID']==client,'ProjectType'].unique()
        records.append({"ClientID":client,"path":list(project_types),"length":len(project_types)})
    # Building the frame once replaces the removed (and quadratic) DataFrame.append.
    return pd.DataFrame(records,columns=["ClientID","path","length"])
    

# finding the cosine similarity with the ultimate goal
def cosine_similarity_function(pathway,tf_idf_norm):
    """Average, per path length, the similarity of each step to 'ultimate goal'.

    For every client whose path has length k >= 2, step j (1-based) of the path
    is compared with the ``'ultimate goal'`` row of the same step:
    ``tf_idf_norm.loc[j].loc['ultimate goal']`` versus
    ``tf_idf_norm.loc[j].loc[str(path[j-1])]``. The similarities are averaged
    over all clients sharing the same k.

    Parameters
    ----------
    pathway : pandas.DataFrame
        Columns ``ClientID``, ``path`` (sequence of step labels) and ``length``.
        Only the first row of each client is used.
    tf_idf_norm : pandas.DataFrame
        Table whose first index level is the step number and whose second
        level holds the row labels (``'ultimate goal'`` and ``str(label)``).

    Returns
    -------
    pandas.DataFrame
        Index ``step`` holds the path lengths k that occur (k >= 2). Columns
        are ``1..n`` with n the longest path; column c is the step that lies
        c-1 steps before the last one (column 1 is the last step). Cells that
        do not exist for a given k are NaN. Empty when no path has length >= 2.

    Raises
    ------
    KeyError
        If a step number or row label is missing from ``tf_idf_norm``.
    """
    if pathway.empty:
        return pd.DataFrame()

    n = int(np.max(pathway["length"]))
    # One record per client (first row wins), visited once instead of once per k.
    per_client = pathway.drop_duplicates("ClientID")

    # (path length k, steps before the last step) -> similarities of all clients
    similarities = {}
    for length, path in zip(per_client["length"],per_client["path"]):
        k = int(length)
        # paths of length 1 are ignored
        if k < 2:
            continue
        for j in range(1,k+1):
            step_table = tf_idf_norm.loc[j]
            goal_row = pd.DataFrame(step_table.loc['ultimate goal']).T
            step_row = pd.DataFrame(step_table.loc[str(path[j-1])]).T
            # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
            pair = pd.concat([goal_row,step_row],ignore_index=True)
            similarities.setdefault((k,k-j),[]).append(cosine_similarity(pair)[0][1])

    steps = sorted({k for k, _ in similarities})
    cosine_avg = pd.DataFrame(
        np.nan,
        index=pd.Index(steps,name="step"),
        columns=list(range(1,n+1)),
        dtype=float,
    )
    # Single-step .loc writes: chained assignment does not write through under
    # pandas Copy-on-Write.
    for (k, offset), values in similarities.items():
        cosine_avg.loc[k,offset+1] = pd.Series(values,dtype=float).mean()

    return cosine_avg.dropna(how='all')

# plotting the distance for ultimate goal
def plotting_trends(cosine_avg,directory,ticks=None):
    """Plot one similarity curve and its linear trend per row, then save the figure.

    Parameters
    ----------
    cosine_avg : pandas.DataFrame
        One row per path length; each row is drawn as a solid curve labelled
        with its index value.
    directory : str
        File name appended to the module-level ``PATH`` to form the output path.
    ticks : sequence, optional
        Passed to ``plt.axis`` (``[xmin, xmax, ymin, ymax]``). ``None`` or an
        empty sequence keeps matplotlib's automatic limits.

    The figure is written with ``plt.savefig`` and closed; nothing is returned.
    """
    # Fixed colours for the path lengths the analysis expects; any other length
    # falls back to matplotlib's colour cycle instead of an unbound/stale colour.
    step_colors = {2: "r", 3: "b", 4: "g"}

    plt.figure(figsize=(20,10))
    for i in cosine_avg.index:
        row = cosine_avg.loc[i].astype('float')
        (curve,) = plt.plot(row,'o-',color=step_colors.get(i),linewidth=3,label=i)
        color = curve.get_color()

        # Least-squares line through the non-missing values, placed at x = 1..count.
        y = np.array(row[row.notna()],dtype='float')
        x = np.arange(1,len(y)+1,dtype='float')
        if len(y) >= 2:  # a line needs at least two points
            m,b = np.polyfit(x,y,1)
            plt.plot(x,m*x+b,"o--",color=color,linewidth=3)

    if ticks is not None and len(ticks) > 0:
        plt.axis(ticks)
    if len(cosine_avg.index) > 0:
        plt.legend(fontsize=25)
    plt.xticks(fontsize=25)
    plt.yticks(fontsize=25)
    plt.savefig(PATH+directory)
    plt.close()

In [None]:
# clients whose last destination is one of the `goal` codes
df_goal,df_goal1 = data_separation(df,df1,goal)
if not df_goal.empty:
    df_goal,df_goal1 = data_preparation(df_goal,df_goal1)
    # effective pathway per client, then the average similarity per step
    pathway = effective_pathway(df_goal,df_goal1)
    cosine_avg_goal = cosine_similarity_function(pathway,tf_idf_norm)
    directory = "//TF-IDF similarity to the actual exit plot for ultimate goal.png"
    # axis limits: x runs from 4.25 down to 0.75, y from 0.0 to 0.6
    plotting_trends(cosine_avg_goal,directory,ticks=[4.25,0.75,0.0,0.6])
else:
    print('No data in df_goal')
    cosine_avg_goal = pd.DataFrame()

In [None]:
# clients whose last destination is one of the `closer` or `trans` codes
df_closer,df_closer1 = data_separation(df,df1,closer+trans)
if not df_closer.empty:
    df_closer,df_closer1 = data_preparation(df_closer,df_closer1)
    # effective pathway per client, then the average similarity per step
    pathway = effective_pathway(df_closer,df_closer1)
    cosine_avg_closer = cosine_similarity_function(pathway,tf_idf_norm)
    directory = "//TF-IDF similarity to the actual exit plot for closer or transitional.png"
    # axis limits: x runs from 4.25 down to 0.75, y from 0.1 to 0.65
    plotting_trends(cosine_avg_closer,directory,ticks=[4.25,0.75,0.1,0.65])
else:
    print('No data in df_closer')
    cosine_avg_closer = pd.DataFrame()

In [None]:
# clients whose last destination is one of the `hard_to_judge` or `no_progress` codes
df_unsuccess,df_unsuccess1 = data_separation(df,df1,hard_to_judge+no_progress)
if not df_unsuccess.empty:
    df_unsuccess,df_unsuccess1 = data_preparation(df_unsuccess,df_unsuccess1)
    # effective pathway per client, then the average similarity per step
    pathway = effective_pathway(df_unsuccess,df_unsuccess1)
    cosine_avg_unsuccess = cosine_similarity_function(pathway,tf_idf_norm)
    directory = "//TF-IDF similarity to the actual exit plot for unsuccessful.png"
    # axis limits: x runs from 4.25 down to 0.75, y from -0.1 to 0.9
    plotting_trends(cosine_avg_unsuccess,directory,ticks=[4.25,0.75,-0.1,0.9])
else:
    print('No data in df_unsuccess')
    cosine_avg_unsuccess = pd.DataFrame()

In [None]:
def _plot_mean_trend(cosine_avg,color,label,name):
    """Draw the column-wise mean of ``cosine_avg`` and its linear trend on the current figure.

    Prints ``"No data in <name>"`` and returns ``None`` when the frame is
    empty; otherwise returns the plotted mean Series.
    """
    if cosine_avg.empty:
        print("No data in " + name)
        return None
    # astype(float) keeps mean() well defined when the frame has object dtype.
    mean_by_step = cosine_avg.astype(float).mean(axis=0)
    plt.plot(mean_by_step,'o-',color=color,linewidth=3,label=label)
    # polyfit cannot handle NaN and needs at least two points for a line.
    valid = mean_by_step.dropna()
    if len(valid) >= 2:
        x = valid.index.to_numpy(dtype=float)
        m,b = np.polyfit(x,valid.to_numpy(dtype=float),1)
        plt.plot(x,m*x+b,"--",color=color,linewidth=3)
    return mean_by_step


# Mean similarity per step for the three outcome groups on one figure.
plt.figure(figsize=(20,10))
cosine_avg_goal_mean = _plot_mean_trend(cosine_avg_goal,'g',"ultimate goal","cosine_avg_goal")
cosine_avg_closer_mean = _plot_mean_trend(cosine_avg_closer,'y',"closer or transitional","cosine_avg_closer")
cosine_avg_unsuccess_mean = _plot_mean_trend(cosine_avg_unsuccess,'r',"unsuccessful","cosine_avg_unsuccess")

# axis limits: x runs from 4.25 down to 0.75, y from 0.1 to 0.61
plt.axis([4.25,0.75,0.1,0.61])
# a legend without any labelled curve only produces a warning
if any(mean is not None for mean in (cosine_avg_goal_mean,cosine_avg_closer_mean,cosine_avg_unsuccess_mean)):
    plt.legend(fontsize=25)
plt.xticks(fontsize=25)
plt.yticks(fontsize=25)
plt.savefig(PATH+"//TF-IDF mean similarity to ultimate goal comparison.png")
plt.close()
