In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from sklearn.metrics.pairwise import cosine_similarity


In [3]:
# Project folder that contains (or will contain) the "Results" directory.
# Defaults to the current working directory; replace "." with the directory
# of your project folder if the notebook is run from somewhere else.
path = "."
PATH = os.path.join(path, "Results")

# Create the results directory, including any missing parent directories.
# exist_ok=True makes re-running this cell a no-op when it already exists.
os.makedirs(PATH, exist_ok=True)

In [5]:
# Read the one-hot encoded data set from the results folder and discard the
# 'Unnamed: 0' column (presumably the index written by an earlier to_csv —
# TODO confirm).
data_file = PATH+"//Data one hot encoded.csv"
df = pd.read_csv(data_file).drop(columns=['Unnamed: 0'])

# Distinct client identifiers, in order of first appearance in df.
clientID = df["ClientID"].unique()

In [6]:
# Preparing the bipartite network from tripartite network
# Each project type gets one row in F holding the column-wise sums of the
# feature columns over all enrollments of that project type.
A = df[["EnrollmentID","ClientID","ProjectType"]]
M = df.drop(["ProjectType"],axis = 1)
p = np.sort(A['ProjectType'].unique())
P = len(p)

# Identifier / date columns that are removed before summing.
non_feature_cols = ['EnrollmentID', 'ClientID', 'ProjectID', 'EntryDate',
                    'ExitID', 'ExitDate', 'HouseholdID']

# Collect one summed Series per project type and assemble the frame once.
# (DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.)
feature_sums = {}
for project_type in p:
    enrollment_ids = A.loc[A["ProjectType"] == project_type, "EnrollmentID"]
    members = M[M['EnrollmentID'].isin(enrollment_ids)]
    feature_sums[project_type] = members.drop(non_feature_cols, axis=1).sum(axis=0)

# The dict keys become columns; transpose so project types are the rows.
F = pd.concat(feature_sums, axis=1).T
F.index.name = "ProjectType"
    

In [8]:
# calculating tf_idf
# tf  : the summed count stored in F for (project type, feature).
# idf : max(0, log(N / (n + 1))), where N is the number of project types
#       (rows of F) and n is the number of project types whose count for
#       that feature is non-zero.
# Computed on whole columns at once: the former cell-by-cell
# `tf_idf.loc[row][col] = ...` was a chained assignment, which writes to a
# temporary object (and is therefore lost) under pandas Copy-on-Write.
term_counts = F.astype(float)
n_project_types = len(term_counts.index)
doc_freq = (term_counts != 0).sum(axis=0) + 1
idf = np.maximum(0, np.log(n_project_types / doc_freq))
tf_idf = term_counts.mul(idf, axis=1)

# Scale every project-type row to unit Euclidean length. A row whose weights
# are all zero has norm 0; divide it by 1 instead so it stays a zero vector
# rather than turning into NaN (cosine_similarity rejects NaN input).
row_norms = np.sqrt(np.square(tf_idf).sum(axis=1))
safe_norms = row_norms.where(row_norms != 0, 1.0)
tf_idf_norm = tf_idf.div(safe_norms, axis=0)
tf_idf_norm.to_csv(PATH+"//Normalized TF_IDF.csv")

# calculating cosine similarity
# Pairwise cosine similarity between the project-type rows of tf_idf_norm.
# np.triu zeroes every entry below the diagonal, so only the diagonal and the
# upper triangle carry similarity values.
project_types = tf_idf_norm.index
upper_similarity = np.triu(cosine_similarity(tf_idf_norm))
cosine = pd.DataFrame(upper_similarity, index=project_types, columns=project_types)
cosine.to_csv(PATH+"//cosine similarity.csv")

# Annotated heat map of the matrix; the figure is saved and then closed.
heatmap_fig = plt.figure(figsize=(10,8))
sns.heatmap(cosine, annot=True, cmap="YlGnBu")
heatmap_fig.savefig(PATH+"//Cosine similarity heat map.png")
plt.close(heatmap_fig)

# calculating the rank
# `cosine` only carries values on and above the diagonal (np.triu), so a row
# read directly from it sees similarity 0 for every project type to its left
# and ranks those arbitrarily. Mirror the upper triangle to recover the full
# symmetric matrix before ranking. Taking np.triu first makes this correct
# whether `cosine` is upper-triangular or already symmetric.
upper = np.triu(cosine.to_numpy(dtype=float))
similarity = pd.DataFrame(upper + np.triu(upper, k=1).T,
                          index=cosine.index, columns=cosine.columns)

# For every project type: all *other* project types, most similar first.
# The type itself is dropped by label instead of assuming it sorts to
# position 0, and a stable sort keeps tied types in their original order.
ranking = {}
for project_type in similarity.index:
    others = similarity.loc[project_type].drop(project_type)
    ranking[project_type] = others.sort_values(ascending=False, kind="stable").index.tolist()

# One row per project type, columns 1..(number of types - 1) = rank position.
# Built in one step; DataFrame.append was removed in pandas 2.0.
r = pd.DataFrame.from_dict(ranking, orient="index")
r.index.name = "ProjectType"
r.columns = range(1,len(r.columns)+1)
r.to_csv(PATH+"//rank.csv")


In [9]:
# calculating the link probability

# Reload the ranking from disk; after the CSV round trip the rank columns are
# labelled with strings ("1", "2", ...).
r = pd.read_csv(PATH+"//rank.csv")
r = r.set_index("ProjectType")

# computing edge list
# For every client, each pair of consecutive rows in df becomes one edge
# Start -> End between the two project types.
# NOTE(review): "consecutive" means the row order of df; this assumes a
# client's rows are already in chronological order — TODO confirm, df is not
# sorted by EntryDate here.
# groupby(sort=False) visits clients in order of first appearance and keeps
# the row order inside each group, so the edges come out in the same order as
# looping over clientID and filtering df per client, in a single pass.
edge_records = []
for client, project_types in df.groupby("ClientID", sort=False)["ProjectType"]:
    sequence = project_types.tolist()
    for start, end in zip(sequence, sequence[1:]):
        edge_records.append({"ClientID": client, "Start": start, "End": end})

# Build the frame once; DataFrame.append was removed in pandas 2.0 and made
# the per-edge loop quadratic.
reentry = pd.DataFrame(edge_records, columns=["ClientID","Start","End"])
reentry.to_csv(PATH+"//Edge list.csv")



In [10]:
# calculating edge probability 
reentry = pd.read_csv(PATH+"//Edge list.csv").drop(['Unnamed: 0'],axis=1)

# One (Start, End) tuple per edge. .tolist() yields plain Python values, so
# the tuples are written to the CSV as e.g. "(1, 2)" regardless of the
# installed NumPy version.
edges_tup = pd.Series(list(zip(reentry["Start"].tolist(), reentry["End"].tolist())),
                      dtype=object)

# Count every distinct edge in one pass; a dict keeps first-appearance order.
edge_counts = {}
for edge in edges_tup:
    edge_counts[edge] = edge_counts.get(edge, 0) + 1

# Probability of an edge = its count / total number of edges.
# The frame is built once; DataFrame.append was removed in pandas 2.0.
total_edges = len(edges_tup)
edge_prob = pd.DataFrame(
    {"edge": list(edge_counts),
     "prob": [count / total_edges for count in edge_counts.values()]},
    columns=['edge','prob'])
edge_prob.to_csv(PATH+"//Edge Probability.csv")



In [13]:
# calculating rank probability
# P(rank i) = sum over all project types t of the probability of the edge
# (t, i-th ranked project type of t); edges missing from the file add 0.
edge_prob = pd.read_csv(PATH+"//Edge Probability.csv").drop(['Unnamed: 0'],axis=1)

# Lookup from the edge's text form as stored in the CSV (e.g. "(1, 2)") to its
# probability. If an edge were listed twice, the first row wins.
first_rows = edge_prob.drop_duplicates("edge")
prob_by_edge = dict(zip(first_rows["edge"], first_rows["prob"]))

rank_rows = []
for i in range(1,len(r.columns)+1):
    prob = 0
    for j in range(len(r)):
        raw_edge = (r.index[j], r[str(i)].iloc[j])
        # r yields numpy scalars; under NumPy >= 2 their repr inside a tuple
        # is "np.int64(1)", which would never match "(1, 2)" in the CSV.
        # Convert to plain Python values first and fall back to the raw text
        # form in case the CSV was written with numpy scalar reprs.
        plain_edge = tuple(v.item() if isinstance(v, np.generic) else v for v in raw_edge)
        prob = prob + prob_by_edge.get(str(plain_edge), prob_by_edge.get(str(raw_edge), 0))
    rank_rows.append({"rank": i, "prob": prob})

# Build the frame once; DataFrame.append was removed in pandas 2.0.
rank_prob = pd.DataFrame(rank_rows, columns=['rank','prob'])



In [15]:
# plotting the rank probability
# Scatter of P(r) against rank on log-log axes, saved to the results folder.
font_size = 25
fig = plt.figure(figsize=(20,10))
ax = fig.add_subplot(111)
ax.plot(rank_prob['rank'], rank_prob['prob'], 'o', linewidth=3)
ax.set_xlabel("Rank", fontsize=font_size)
ax.set_ylabel("P(r)", fontsize=font_size)
ax.set_xscale('log')
ax.set_yscale('log')
# Tick label sizes are set after the scales, x axis first, then y axis.
for set_tick_labels in (plt.xticks, plt.yticks):
    set_tick_labels(fontsize=font_size)

fig.savefig(PATH+"//P(r)_plot_without_r0_Project_type_log.png")
plt.close(fig)
