## Word2Vector Implementation

In [1]:
# Import required libraries
import pandas as pd
from gensim.models import Word2Vec

In [2]:
# Load the TF-IDF features from the Excel file
features_df = pd.read_csv('top_150_comorbidities_tfidf_features.csv', index_col=0)
display(features_df)

Unnamed: 0_level_0,cholesterol,dyslipidemia,high,disease,hyperlipidemia,gerd,reflux,hypothyroidism,gastroesophageal,gout,...,compression,turp,wall,tb,hypertrophy,orthostatic,parkinsons,vertebral,lower,lymphoma
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.000000,0.0,0.000000,0.141631,0.0,0.176176,0.354382,0.000000,0.187300,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.0,0.000000,0.348228,0.0,0.000000,0.145220,0.000000,0.153505,0.167681,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.000000,0.0,0.000000,0.175700,0.0,0.000000,0.219814,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
516,0.148702,0.0,0.153730,0.000000,0.0,0.000000,0.000000,0.217151,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
517,0.311627,0.0,0.322163,0.298561,0.0,0.371382,0.373523,0.000000,0.394832,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
519,0.000000,0.0,0.000000,0.122648,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
521,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Initializing a Word2Vec model
model = Word2Vec(vector_size=508, window=10, sg=0, hs=0, negative=10, min_count=7, workers=10)

# Build vocabulary from the TF-IDF features
sentences = [list(features.keys()) for _, features in features_df.iterrows()]
model.build_vocab(sentences)

In [4]:
# Train Word2Vec model with TF-IDF weighted vectors
for _, features in features_df.iterrows():
    word_tokens = list(features.keys())
    model.train([word_tokens], total_examples=5, epochs=10, compute_loss=True)

In [5]:
# Export the Word2Vec embeddings to a DataFrame
word2vec_embeddings = pd.DataFrame({word: model.wv[word] for word in model.wv.index_to_key}, index=features_df.index)

# Save the Word2Vec embeddings DataFrame to an Excel file
word2vec_embeddings.to_csv('word2vec_embeddings.csv')

In [6]:
# Load the pt pathways csv file
pt_pathways_v2 = pd.read_csv('Dataset/pt_pathways-v2.csv')

# Load the generated word2vec embeddings file
w2v = pd.read_csv('word2vec_embeddings.csv')

pt_pathways_v2.head()

Unnamed: 0,id,Hospital LOS,Mean HLOS,admission_disposition,age,sex,Std Dev,Hypertension,Chronic cardiac disease (not hypertension),N/A,...,reason_for_admission_Tachypnea [R06.8],reason_for_admission_Viral pneumonia [J12.9],WARD,ICU,Neurology,Imaging,Cardiology,Cancer,Nephrology,Isolation Ward
0,1,21,1,1,74,1,1,1,0,1,...,0,0,1,0,0,0,0,,0,1
1,2,5,0,1,61,0,0,1,0,1,...,0,0,1,0,0,0,0,,0,1
2,3,7,0,1,58,0,0,1,0,1,...,0,0,1,0,0,0,0,,0,0
3,4,9,0,1,94,1,0,1,0,1,...,0,0,1,0,0,0,0,,0,1
4,5,9,0,1,91,1,0,1,1,1,...,0,0,1,0,0,0,0,,0,1


In [7]:
w2v.head()

Unnamed: 0,id,lymphoma,infection,cognitive,knee,psoriasis,remote,insomnia,hernia,tract,...,hypocholesteremia,bronchitis,ischemic,sinusitis,eye,arthroplasty,carcinoma,nephropathy,syncope,cholesterol
0,1,-0.772351,-0.293309,-0.869769,-1.165225,-0.87412,-0.932158,-0.790873,-0.83133,-0.683968,...,-0.608767,-0.455254,-0.777694,-0.679012,-0.688976,-0.836226,-0.584153,-0.657582,-0.642037,-0.945593
1,2,-0.063473,0.218033,-0.345257,-0.326226,-0.205054,-0.285367,-0.02822,-0.026223,0.074483,...,0.291599,0.460366,0.602479,0.694491,0.585712,0.765486,0.723863,0.625036,0.848367,-0.070907
2,3,-1.07243,-0.65094,-0.482729,-0.561663,-0.656281,-0.521548,-0.654662,-0.67437,-0.731612,...,1.154339,1.14094,1.137694,1.416582,1.151307,1.127899,1.298754,1.006759,1.208086,-0.587492
3,4,-0.077095,-0.49193,0.027029,0.110387,-0.070009,-0.157776,-0.133461,-0.332626,-0.279064,...,0.293452,0.35091,0.714539,0.628227,0.653386,0.962031,0.902187,0.96129,1.108696,0.863767
4,5,0.336609,0.044228,0.167877,0.073952,0.226477,0.049177,0.276977,0.158667,0.326051,...,-0.110882,-0.280716,-0.346717,-0.038715,-0.20869,0.006369,-0.08212,0.008698,0.027187,-1.66972


In [8]:
# Merge the dataframes based on patient ID
merged_data = pd.merge(pt_pathways_v2, w2v, on='id')

In [9]:
# Save the merged data to a new CSV file
merged_data.to_csv('pt_pathways-w2v-top150-v2.csv', index=False)