In [2]:
import pickle

# Restore the trained Word2Vec model that was pickled earlier.
# NOTE: pickle can execute arbitrary code while loading -- only open files you trust.
with open('wordembed_model.pkl', 'rb') as model_file:
    wordembed_model = pickle.load(model_file)

# Restore the preprocessed course DataFrame that was pickled earlier.
with open('preprocessed_df.pkl', 'rb') as dataframe_file:
    preprocessed_df = pickle.load(dataframe_file)

In [2]:
# Sanity check of the loaded embeddings: list the tokens most similar to 'aspnet'.
wordembed_model.wv.most_similar('aspnet')


[('spring', 0.9338988661766052),
 ('nodejs', 0.9252668023109436),
 ('mvc', 0.9231239557266235),
 ('framework', 0.9042681455612183),
 ('angular', 0.9036686420440674),
 ('angularjs', 0.8968015313148499),
 ('core', 0.8798183798789978),
 ('net', 0.8759362101554871),
 ('apis', 0.8741264939308167),
 ('wpf', 0.8730732798576355)]

In [4]:
# Preview the first rows of the preprocessed course table.
preprocessed_df.head()

Unnamed: 0,CourseId,CourseTitle_lemmatized,Description_lemmatized,title_desc,title_desc_tokenized,title_desc_cleaned
0,abts-advanced-topics,"[biztalk, 2006, business, process, management]","[course, cover, business, process, management,...",biztalk 2006 business process management cours...,"[biztalk, 2006, business, process, management,...","[biztalk, 2006, business, process, management,..."
1,abts-fundamentals,"[biztalk, 2006, fundamental]","[despite, trend, towards, service-oriented, ar...",biztalk 2006 fundamental despite trend towards...,"[biztalk, 2006, fundamental, despite, trend, t...","[biztalk, 2006, fundamental, despite, trend, t..."
2,agile-team-practice-fundamentals,"[agile, team, practice, scrum]","[course, much, different, course, pluralsight,...",agile team practice scrum course much differen...,"[agile, team, practice, scrum, course, much, d...","[agile, team, practice, scrum, course, much, d..."
3,aspdotnet-advanced-topics,"[asp.net, 3.5, advanced, topic]","[course, cover, advanced, topic, asp.net, 3.5,...",asp.net 3.5 advanced topic course cover advanc...,"[asp.net, 3.5, advanced, topic, course, cover,...","[aspnet, 35, advanced, topic, course, cover, a..."
4,aspdotnet-ajax-advanced-topics,"[asp.net, ajax, advanced, topic]","[course, cover, advanced, topic, asp.net, ajax...",asp.net ajax advanced topic course cover advan...,"[asp.net, ajax, advanced, topic, course, cover...","[aspnet, ajax, advanced, topic, course, cover,..."


In [10]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Rank every course by how similar its text is to one target course.
#
# A course is represented by the word vectors of its cleaned title + description
# tokens, concatenated in token order and zero-padded on the right to a common
# length; courses are then compared with cosine similarity.
# NOTE(review): because the vectors are concatenated rather than averaged, the
# score compares token i of one course with token i of the other, so it depends
# on word order and text length -- confirm this is the intended measure.
TARGET_COURSE_ID = 'abts-advanced-topics'
TOP_N = 20


def embed_tokens(tokens):
    """Return the word vectors of `tokens` concatenated into a single 1-D array."""
    return np.concatenate([wordembed_model.wv[word] for word in tokens])


# Select the target course's token list explicitly. Iterating over
# `.tolist()` of the filtered column would walk over rows, not words.
target_rows = preprocessed_df.loc[
    preprocessed_df['CourseId'] == TARGET_COURSE_ID, 'title_desc_cleaned'
]
if target_rows.empty:
    raise ValueError(f"CourseId {TARGET_COURSE_ID!r} not found in preprocessed_df")
target_course = target_rows.iloc[0]
target_embedding = embed_tokens(target_course)

# Every word vector has the same size, so the longest concatenated embedding is
# simply (largest token count) * (vector size); no need to build each embedding
# just to measure it.
max_len = (
    max(len(tokens) for tokens in preprocessed_df['title_desc_cleaned'])
    * wordembed_model.wv.vector_size
)

target_embedding_padded = pad_sequences(
    [target_embedding], maxlen=max_len, dtype='float32', padding='post'
).reshape(1, max_len)

# Compute the similarity scores between the target course and all other courses in the dataset
similarity_scores = {}
for course_id, course_title_desc in zip(
    preprocessed_df['CourseId'], preprocessed_df['title_desc_cleaned']
):
    course_embedding_padded = pad_sequences(
        [embed_tokens(course_title_desc)], maxlen=max_len, dtype='float32', padding='post'
    ).reshape(1, max_len)
    similarity_scores[course_id] = cosine_similarity(
        target_embedding_padded, course_embedding_padded
    )[0][0]

# Sort the courses by similarity score (highest first) and keep the top TOP_N
similar_courses = sorted(
    similarity_scores.items(), key=lambda item: item[1], reverse=True
)[:TOP_N]

In [11]:
# Display the top-ranked (CourseId, cosine similarity) pairs computed above.
similar_courses

[('abts-advanced-topics', 0.9999999),
 ('bts09-advanced-topics', 0.7992474),
 ('sql-server-bi', 0.46521413),
 ('citrix-xendesktop-7-management', 0.42337668),
 ('internet-explorer-10-introduction', 0.36481237),
 ('configuring-sharepoint-2013-farms-powershell', 0.3574193),
 ('silverlight-ria-services-advanced-topics', 0.34988526),
 ('windows-server-vnext-first-look', 0.3492698),
 ('mse-advanced-topics', 0.33731195),
 ('exchange-online-administration', 0.33448663),
 ('management-strategies-increase-productivity', 0.32744846),
 ('sharepoint-business-services', 0.32334825),
 ('windows-server-2003-active-directory-fundamentals', 0.31869715),
 ('exchange-2013-virtualization', 0.3169784),
 ('yammer-business-professionals', 0.31246552),
 ('wmi-ps', 0.31077844),
 ('wf-advanced-topics', 0.3044471),
 ('add-profit-to-business-by-adding-purpose', 0.3038302),
 ('end-user-security-awareness', 0.30329582),
 ('working-with-entities-in-drupal-7', 0.29937756)]

In [4]:
# (rows, columns) of the preprocessed course table.
preprocessed_df.shape

(8011, 6)

In [8]:
# Length of the longest concatenated course embedding. Every word vector has the
# same size, so this is (largest token count) * (vector size); there is no need
# to build every embedding just to measure it.
max_len = (
    max(len(tokens) for tokens in preprocessed_df['title_desc_cleaned'])
    * wordembed_model.wv.vector_size
)

def get_padded_embedding(course_id: str) -> np.ndarray:
    """Return the zero-padded, concatenated word-vector embedding of one course.

    Looks up the course's `title_desc_cleaned` token list in `preprocessed_df`,
    concatenates the tokens' vectors from `wordembed_model` in token order, and
    pads the result with trailing zeros up to the module-level `max_len`.

    Args:
        course_id: Value of the `CourseId` column identifying the course.

    Returns:
        A float32 array of shape (1, max_len).

    Raises:
        ValueError: If no row of `preprocessed_df` has this `CourseId`.
    """
    matches = preprocessed_df.loc[preprocessed_df['CourseId'] == course_id, 'title_desc_cleaned']
    if matches.empty:
        raise ValueError(f"CourseId {course_id!r} not found in preprocessed_df")
    # Take the token list itself; iterating over `.tolist()` of the filtered
    # column would walk over rows rather than words.
    tokens = matches.iloc[0]
    embedding = np.concatenate([wordembed_model.wv[word] for word in tokens])
    padded_embedding = pad_sequences(
        [embedding], maxlen=max_len, dtype='float32', padding='post'
    ).reshape(1, max_len)
    return padded_embedding

# Placeholder for the course-by-course similarity matrix, sized from the data
# instead of a hard-coded row count.
n_courses = len(preprocessed_df)
similarity_matrix = np.zeros((n_courses, n_courses))

# One padded embedding per course, in DataFrame row order. Built in a single pass
# over the token column, which avoids one full-column CourseId scan per course.
embeddings_matrix = pad_sequences(
    [
        np.concatenate([wordembed_model.wv[word] for word in tokens])
        for tokens in preprocessed_df['title_desc_cleaned']
    ],
    maxlen=max_len,
    dtype='float32',
    padding='post',
)


In [9]:
# Print the first row of embeddings_matrix so it can be compared with target_embedding_padded.
print(embeddings_matrix[0])

[ 0.30212376  0.11837974 -0.03773297 ...  0.          0.
  0.        ]


In [13]:
print(target_embedding_padded)

# The printed arrays are truncated, so compare every element explicitly:
# row 0 of embeddings_matrix must equal the target course's padded embedding.
np.testing.assert_allclose(embeddings_matrix[0], target_embedding_padded[0])

[[ 0.30212376  0.11837974 -0.03773297 ...  0.          0.
   0.        ]]


## Nice :) The first row of the embeddings matrix matches the target embedding computed earlier, so there is no indexing problem; it is safe to calculate the similarities now.

In [None]:

# Pairwise cosine similarity between all rows of embeddings_matrix.
similarity_matrix = cosine_similarity(embeddings_matrix)
