In [2]:
import pickle

# Load the saved Word2Vec model from a pickle file
with open('wordembed_model.pkl', 'rb') as f1:
    wordembed_model = pickle.load(f1)
# Load the datafile from a pickle file
with open('preprocessed_df.pkl', 'rb') as f2:
    preprocessed_df = pickle.load(f2)

In [2]:
wordembed_model.wv.most_similar('aspnet')


[('spring', 0.9338988661766052),
 ('nodejs', 0.9252668023109436),
 ('mvc', 0.9231239557266235),
 ('framework', 0.9042681455612183),
 ('angular', 0.9036686420440674),
 ('angularjs', 0.8968015313148499),
 ('core', 0.8798183798789978),
 ('net', 0.8759362101554871),
 ('apis', 0.8741264939308167),
 ('wpf', 0.8730732798576355)]

In [4]:
preprocessed_df.head()

Unnamed: 0,CourseId,CourseTitle_lemmatized,Description_lemmatized,title_desc,title_desc_tokenized,title_desc_cleaned
0,abts-advanced-topics,"[biztalk, 2006, business, process, management]","[course, cover, business, process, management,...",biztalk 2006 business process management cours...,"[biztalk, 2006, business, process, management,...","[biztalk, 2006, business, process, management,..."
1,abts-fundamentals,"[biztalk, 2006, fundamental]","[despite, trend, towards, service-oriented, ar...",biztalk 2006 fundamental despite trend towards...,"[biztalk, 2006, fundamental, despite, trend, t...","[biztalk, 2006, fundamental, despite, trend, t..."
2,agile-team-practice-fundamentals,"[agile, team, practice, scrum]","[course, much, different, course, pluralsight,...",agile team practice scrum course much differen...,"[agile, team, practice, scrum, course, much, d...","[agile, team, practice, scrum, course, much, d..."
3,aspdotnet-advanced-topics,"[asp.net, 3.5, advanced, topic]","[course, cover, advanced, topic, asp.net, 3.5,...",asp.net 3.5 advanced topic course cover advanc...,"[asp.net, 3.5, advanced, topic, course, cover,...","[aspnet, 35, advanced, topic, course, cover, a..."
4,aspdotnet-ajax-advanced-topics,"[asp.net, ajax, advanced, topic]","[course, cover, advanced, topic, asp.net, ajax...",asp.net ajax advanced topic course cover advan...,"[asp.net, ajax, advanced, topic, course, cover...","[aspnet, ajax, advanced, topic, course, cover,..."


In [10]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Get the embeddings for each course title and description
target_course = preprocessed_df.loc[preprocessed_df['CourseId'] == 'abts-advanced-topics']['title_desc_cleaned'].tolist()
target_embedding = np.concatenate([wordembed_model.wv[word] for word in target_course])
max_len = max([len(np.concatenate([wordembed_model.wv[word] for word in row['title_desc_cleaned']])) for _, row in preprocessed_df.iterrows()])
target_embedding_padded = pad_sequences([target_embedding.reshape(1,-1).T], maxlen=max_len, dtype='float32', padding='post')
target_embedding_padded = target_embedding_padded.reshape(1,max_len)
# Compute the similarity scores between the target course and all other courses in the dataset
similarity_scores = {}
for index, row in preprocessed_df.iterrows():
    course_id = row['CourseId']
    course_title_desc = row['title_desc_cleaned']
    course_embedding = np.concatenate([wordembed_model.wv[word] for word in course_title_desc])
    course_embedding_padded = pad_sequences([course_embedding], maxlen=max_len, dtype='float32', padding='post').reshape(1,max_len)
    similarity_scores[course_id] = cosine_similarity(target_embedding_padded, course_embedding_padded)[0][0]
    
# Sort the courses by similarity score and return the top n courses
similar_courses = sorted(similarity_scores.items(), key=lambda x: x[1], reverse=True)[:20]

In [11]:
similar_courses

[('abts-advanced-topics', 0.9999999),
 ('bts09-advanced-topics', 0.7992474),
 ('sql-server-bi', 0.46521413),
 ('citrix-xendesktop-7-management', 0.42337668),
 ('internet-explorer-10-introduction', 0.36481237),
 ('configuring-sharepoint-2013-farms-powershell', 0.3574193),
 ('silverlight-ria-services-advanced-topics', 0.34988526),
 ('windows-server-vnext-first-look', 0.3492698),
 ('mse-advanced-topics', 0.33731195),
 ('exchange-online-administration', 0.33448663),
 ('management-strategies-increase-productivity', 0.32744846),
 ('sharepoint-business-services', 0.32334825),
 ('windows-server-2003-active-directory-fundamentals', 0.31869715),
 ('exchange-2013-virtualization', 0.3169784),
 ('yammer-business-professionals', 0.31246552),
 ('wmi-ps', 0.31077844),
 ('wf-advanced-topics', 0.3044471),
 ('add-profit-to-business-by-adding-purpose', 0.3038302),
 ('end-user-security-awareness', 0.30329582),
 ('working-with-entities-in-drupal-7', 0.29937756)]

In [4]:
preprocessed_df.shape

(8011, 6)

In [8]:
max_len = max([len(np.concatenate([wordembed_model.wv[word] for word in row['title_desc_cleaned']])) for _, row in preprocessed_df.iterrows()])

def get_padded_embedding(course_id : str):
    course = preprocessed_df.loc[preprocessed_df['CourseId'] == course_id]['title_desc_cleaned'].tolist()
    embedding = np.concatenate([wordembed_model.wv[word] for word in course])
    padded_embedding = pad_sequences([embedding.reshape(1,-1).T], maxlen=max_len, dtype='float32', padding='post').reshape(1, max_len)
    return padded_embedding

similarity_matrix = np.zeros((8011, 8011))
embeddings_matrix = np.vstack(preprocessed_df['CourseId'].apply(get_padded_embedding))


In [9]:
print(embeddings_matrix[0])

[ 0.30212376  0.11837974 -0.03773297 ...  0.          0.
  0.        ]


In [13]:
print(target_embedding_padded)

[[ 0.30212376  0.11837974 -0.03773297 ...  0.          0.
   0.        ]]


## Nice :) the embeddings match so there are no indexes problem or something like that, safe to calculate similarities now

In [14]:

similarity_matrix = cosine_similarity(embeddings_matrix)


In [16]:
print(similarity_matrix[0])

[0.9999998  0.1222927  0.09713293 ... 0.09917957 0.12804884 0.0645486 ]


In [17]:
import pandas as pd 
course_ids = preprocessed_df['CourseId'].tolist()
similarity_df = pd.DataFrame(similarity_matrix, index=course_ids, columns=course_ids)

In [18]:
similarity_df.head()

Unnamed: 0,abts-advanced-topics,abts-fundamentals,agile-team-practice-fundamentals,aspdotnet-advanced-topics,aspdotnet-ajax-advanced-topics,aspdotnet-ajax-fundamentals,aspdotnet-ajax-jscript,aspdotnet-data,aspdotnet-fundamentals,aspdotnet-mvc,...,css-grid-bootstrap-4-creating-site,designing-implementing-managing-vmware-vsan-production,aws-amazon-rds,microsoft-cognitive-services-bing-entity-search,microsoft-azure-stack-solutions-architectural-patterns,nunit-moq-mocking,identity-access-management-aws-users,secure-coding-using-components-known-vulnerabilities,aws-s3-implementing,identity-access-management-aws-roles-groups
abts-advanced-topics,1.0,0.122293,0.097133,0.173345,0.140116,0.127896,0.094861,0.107397,0.114121,0.113606,...,0.067519,0.128133,0.084191,0.098118,0.112382,0.070193,0.130953,0.09918,0.128049,0.064549
abts-fundamentals,0.122293,0.999999,0.171751,0.13049,0.098779,0.243477,0.070059,0.189584,0.228489,0.214744,...,0.176717,0.212376,0.143262,0.187703,0.17231,0.169284,0.190094,0.225933,0.198061,0.147549
agile-team-practice-fundamentals,0.097133,0.171751,1.0,0.127953,0.11671,0.198002,0.139012,0.18003,0.227555,0.171845,...,0.222326,0.20101,0.181796,0.203916,0.223771,0.176844,0.21622,0.211967,0.208391,0.214533
aspdotnet-advanced-topics,0.173345,0.13049,0.127953,1.0,0.754955,0.169838,0.545065,0.160963,0.173508,0.175885,...,0.080996,0.108015,0.109575,0.092359,0.090069,0.077786,0.113325,0.112522,0.135488,0.089197
aspdotnet-ajax-advanced-topics,0.140116,0.098779,0.11671,0.754955,1.0,0.141311,0.528729,0.150439,0.149767,0.165245,...,0.097408,0.094236,0.125073,0.111571,0.104162,0.076494,0.127439,0.10833,0.114829,0.099378


In [22]:
target_course_id = 'abts-advanced-topics'
target_course_similarity = similarity_df.loc[target_course_id]
top_10_similar_courses = target_course_similarity.sort_values(ascending=False).head(11)[1:]
top_10_similar_courses

bts09-advanced-topics                           0.799248
sql-server-bi                                   0.465214
citrix-xendesktop-7-management                  0.423377
internet-explorer-10-introduction               0.364812
configuring-sharepoint-2013-farms-powershell    0.357419
silverlight-ria-services-advanced-topics        0.349885
windows-server-vnext-first-look                 0.349270
mse-advanced-topics                             0.337312
exchange-online-administration                  0.334487
management-strategies-increase-productivity     0.327449
Name: abts-advanced-topics, dtype: float32

In [25]:
file_name = 'courses_similarity_df.pkl'
similarity_df.to_pickle(file_name)


In [24]:
#we might wanna return a list later as a response for the api
n = 20
top_n_similar_courses_with_scores = target_course_similarity.sort_values(ascending=False).head(n + 1)[1:]
top_n_similar_courses_with_scores = top_n_similar_courses_with_scores.to_dict()
top_n_similar_courses_with_scores = list(top_n_similar_courses_with_scores.items())
top_n_similar_courses_with_scores


[('bts09-advanced-topics', 0.7992475032806396),
 ('sql-server-bi', 0.4652140736579895),
 ('citrix-xendesktop-7-management', 0.42337659001350403),
 ('internet-explorer-10-introduction', 0.3648124039173126),
 ('configuring-sharepoint-2013-farms-powershell', 0.35741934180259705),
 ('silverlight-ria-services-advanced-topics', 0.3498851954936981),
 ('windows-server-vnext-first-look', 0.3492697775363922),
 ('mse-advanced-topics', 0.33731192350387573),
 ('exchange-online-administration', 0.33448660373687744),
 ('management-strategies-increase-productivity', 0.3274485170841217),
 ('sharepoint-business-services', 0.3233482539653778),
 ('windows-server-2003-active-directory-fundamentals', 0.3186971843242645),
 ('exchange-2013-virtualization', 0.31697842478752136),
 ('yammer-business-professionals', 0.31246551871299744),
 ('wmi-ps', 0.3107783794403076),
 ('wf-advanced-topics', 0.30444714426994324),
 ('add-profit-to-business-by-adding-purpose', 0.30383020639419556),
 ('end-user-security-awareness'

In [3]:
import pandas as pd
file_name1 = 'courses_similarity_df.pkl'
file_name2 = 'preprocessed_df.pkl'
similarity_df = pd.read_pickle(file_name)
preprocessed_df = pd.read_pickle(file_name2)

## This is to calculate groups of similarities, as in clusters of similar courses, spectralclustering uses distance , eigenvalues and eigenvectors to detect and cluster courses with relatively similar similarities, who basically has close eigenvectors & eigenvalues based on the distances

In [5]:
import pandas as pd
from sklearn.cluster import SpectralClustering

# spectral clustering uses distances, which are 1- similarities, a similarity of 1 means a distance of 0 
distance_matrix = 1 - similarity_df

# Perform Spectral Clustering using the distance matrix
n_clusters = 5
clustering_model = SpectralClustering(n_clusters=n_clusters, affinity='precomputed')
clusters = clustering_model.fit_predict(distance_matrix)

preprocessed_df['spectral_clusters'] = clusters

[2.51678247e-15 8.61787728e-06 9.14472771e-06 9.06637352e-06
 1.21702606e-05 8.46262893e-06]
not reaching the requested tolerance 1e-05.
  _, diffusion_map = lobpcg(


In [6]:
file_name = 'preprocessed_df_withclusters.pkl'
preprocessed_df.to_pickle(file_name)

In [9]:
preprocessed_df.head()

Unnamed: 0,CourseId,CourseTitle_lemmatized,Description_lemmatized,title_desc,title_desc_tokenized,title_desc_cleaned,spectral_clusters
0,abts-advanced-topics,"[biztalk, 2006, business, process, management]","[course, cover, business, process, management,...",biztalk 2006 business process management cours...,"[biztalk, 2006, business, process, management,...","[biztalk, 2006, business, process, management,...",1
1,abts-fundamentals,"[biztalk, 2006, fundamental]","[despite, trend, towards, service-oriented, ar...",biztalk 2006 fundamental despite trend towards...,"[biztalk, 2006, fundamental, despite, trend, t...","[biztalk, 2006, fundamental, despite, trend, t...",3
2,agile-team-practice-fundamentals,"[agile, team, practice, scrum]","[course, much, different, course, pluralsight,...",agile team practice scrum course much differen...,"[agile, team, practice, scrum, course, much, d...","[agile, team, practice, scrum, course, much, d...",3
3,aspdotnet-advanced-topics,"[asp.net, 3.5, advanced, topic]","[course, cover, advanced, topic, asp.net, 3.5,...",asp.net 3.5 advanced topic course cover advanc...,"[asp.net, 3.5, advanced, topic, course, cover,...","[aspnet, 35, advanced, topic, course, cover, a...",4
4,aspdotnet-ajax-advanced-topics,"[asp.net, ajax, advanced, topic]","[course, cover, advanced, topic, asp.net, ajax...",asp.net ajax advanced topic course cover advan...,"[asp.net, ajax, advanced, topic, course, cover...","[aspnet, ajax, advanced, topic, course, cover,...",2


In [8]:
import pickle

# Save the model to a file
with open("spectral_clustering_model.pkl", "wb") as f:
    pickle.dump(clustering_model, f)


this will compress the model harder to hopefully be small enough to be zipped and pushed to github

In [10]:
import joblib

# Save the model with a higher compression level
joblib.dump(clustering_model, "spectral_clustering_model_hardcompress.pkl", compress=9)


['spectral_clustering_model_hardcompress.pkl']

### this uploads the spectral clustering model as an s3 object

In [None]:
import boto3

s3 = boto3.client('s3')
model_filename = 'spectral_clustering_model_hardcompress.pkl'
bucket_name = 'your-bucket-name'

# Upload the model to S3
with open(model_filename, 'rb') as file:
    s3.upload_fileobj(file, bucket_name, model_filename)

This is just to load the saved model 

In [None]:
with open("spectral_clustering_model.pkl", "rb") as f:
    loaded_clustering_model = pickle.load(f)


In [11]:
similarity_df.shape

(8011, 8011)

## This is to do the same thing but with DBSCAN, difference is DBSCAN doesn't predefine a number of clusters, it just clusters those close enough to form a 'neighbourhood', and doesn't consider outliers or those who don't really belong

In [14]:
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN

# DBSCAN also uses distances, which are 1- similarities, a similarity of 1 means a distance of 0 
distance_matrix = 1 - similarity_df

# Clip negative values to zero, this provided problems since some similarities with courses themselves are 1.0001 so it provides negative distance which raises an error
distance_matrix_clipped = np.clip(distance_matrix, 0, None)

# Perform DBSCAN using the distance matrix
eps = 0.32  
min_samples = 3  
dbscan_clustering_model = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed')
dbscan_clusters = dbscan_clustering_model.fit_predict(distance_matrix_clipped)

preprocessed_df['dbscan_clusters'] = dbscan_clusters

In [17]:
preprocessed_df.tail(200)

Unnamed: 0,CourseId,CourseTitle_lemmatized,Description_lemmatized,title_desc,title_desc_tokenized,title_desc_cleaned,spectral_clusters,dbscan_clusters
7811,microsoft-azure-information-architecture-requi...,"[identify, information, architecture, requirem...","[core, information, architecture, thorough, kn...",identify information architecture requirement ...,"[identify, information, architecture, requirem...","[identify, information, architecture, requirem...",1,-1
7812,administering-amazon-aurora-amazon-rds,"[administering, amazon, aurora, amazon, rds]","[aws, reinventing, relational, database, lands...",administering amazon aurora amazon rds aws rei...,"[administering, amazon, aurora, amazon, rds, a...","[administering, amazon, aurora, amazon, rds, a...",1,-1
7813,microsoft-azure-test-strategy-designing,"[microsoft, azure, developer, :, designing, te...","[act, building, test, strategy, confusing, ., ...",microsoft azure developer : designing test str...,"[microsoft, azure, developer, :, designing, te...","[microsoft, azure, developer, , designing, tes...",0,-1
7814,embracing-change-agile,"[embracing, change, :, staying, agile, midst, ...","[change, everywhere, seems, come, u, every, di...",embracing change : staying agile midst change ...,"[embracing, change, :, staying, agile, midst, ...","[embracing, change, , staying, agile, midst, c...",3,-1
7815,microsoft-azure-one-time-costs-estimating-recu...,"[estimating, one-time, recurring, cost, micros...","[frequent, question, fired, azure, solution, a...",estimating one-time recurring cost microsoft a...,"[estimating, one-time, recurring, cost, micros...","[estimating, onetime, recurring, cost, microso...",3,-1
...,...,...,...,...,...,...,...,...
8006,nunit-moq-mocking,"[mocking, moq, nunit]","[writing, effective, unit, test, tricky, depen...",mocking moq nunit writing effective unit test ...,"[mocking, moq, nunit, writing, effective, unit...","[mocking, moq, nunit, writing, effective, unit...",2,-1
8007,identity-access-management-aws-users,"[identity, access, management, aws, :, user]","[core, identity, access, management, (, iam, )...",identity access management aws : user core ide...,"[identity, access, management, aws, :, user, c...","[identity, access, management, aws, , user, co...",3,-1
8008,secure-coding-using-components-known-vulnerabi...,"[secure, coding, :, using, component, known, v...","[know, old, component, using, date, ,, contain...",secure coding : using component known vulnerab...,"[secure, coding, :, using, component, known, v...","[secure, coding, , using, component, known, vu...",1,-1
8009,aws-s3-implementing,"[implementing, amazon, s3, storage, aws]","[aws, s3, one, fundamental, service, offered, ...",implementing amazon s3 storage aws aws s3 one ...,"[implementing, amazon, s3, storage, aws, aws, ...","[implementing, amazon, s3, storage, aws, aws, ...",3,-1


In [19]:
import joblib

# Save the model with a higher compression level
joblib.dump(dbscan_clustering_model, "dbscan_clustering_model.pkl")

file_name = 'preprocessed_df_with_all_clusters.pkl'
preprocessed_df.to_pickle(file_name)

In [20]:
spectralclusters_grouped_data = preprocessed_df.groupby('spectral_clusters')
print(spectralclusters_grouped_data.head())

                            CourseId  \
0               abts-advanced-topics   
1                  abts-fundamentals   
2   agile-team-practice-fundamentals   
3          aspdotnet-advanced-topics   
4     aspdotnet-ajax-advanced-topics   
5        aspdotnet-ajax-fundamentals   
6             aspdotnet-ajax-jscript   
7                     aspdotnet-data   
8             aspdotnet-fundamentals   
9                      aspdotnet-mvc   
10     aspdotnet-mvc-advanced-topics   
11                    aspdotnet-mvc2   
12                aspdotnet-security   
13                    aspdotnet-undo   
14                azure-fundamentals   
15             bts09-advanced-topics   
16                   bts09-custom-ms   
17                bts09-fundamentals   
18                btsr2-fundamentals   
19      clojure-concurrency-tutorial   
20                  clr-fundamentals   
24   dotnet-distributed-architecture   
33                  mse-fundamentals   
35                  patterns-library   


## To recommend the most representative member of each cluster, we can find the course that has the smallest average distance to all other courses within the same cluster.

In [None]:
import pandas as pd
from sklearn.cluster import SpectralClustering, DBSCAN


def select_most_representative_course(group, distance_matrix):
    avg_distances = group.apply(lambda x: distance_matrix.loc[x.name, group.index].mean(), axis=1)
    most_representative_index = avg_distances.idxmin()
    return group.loc[most_representative_index]

# For Spectral Clustering
diverse_recommendations_spectral = preprocessed_df.groupby('spectral_clusters').apply(
    lambda group: select_most_representative_course(group, distance_matrix)
)

# For DBSCAN
diverse_recommendations_dbscan = preprocessed_df.groupby('dbscan_clusters').apply(
    lambda group: select_most_representative_course(group, distance_matrix)
)


this loads things for testing without rerunning the whole notebook

In [4]:
import pandas as pd

courses_data = pd.read_pickle('preprocessed_df_withclusters.pkl')
file_name = 'courses_similarity_df.pkl'
similarity_df = pd.read_pickle(file_name)
distance_matrix = 1 - similarity_df

In [9]:
print((distance_matrix.index))


Index(['abts-advanced-topics', 'abts-fundamentals',
       'agile-team-practice-fundamentals', 'aspdotnet-advanced-topics',
       'aspdotnet-ajax-advanced-topics', 'aspdotnet-ajax-fundamentals',
       'aspdotnet-ajax-jscript', 'aspdotnet-data', 'aspdotnet-fundamentals',
       'aspdotnet-mvc',
       ...
       'css-grid-bootstrap-4-creating-site',
       'designing-implementing-managing-vmware-vsan-production',
       'aws-amazon-rds', 'microsoft-cognitive-services-bing-entity-search',
       'microsoft-azure-stack-solutions-architectural-patterns',
       'nunit-moq-mocking', 'identity-access-management-aws-users',
       'secure-coding-using-components-known-vulnerabilities',
       'aws-s3-implementing', 'identity-access-management-aws-roles-groups'],
      dtype='object', length=8011)


In [10]:
print(courses_data.index)

RangeIndex(start=0, stop=8011, step=1)


In [18]:
courses_data_indexed = courses_data.set_index('CourseId')
print(courses_data_indexed)

                                                                               CourseTitle_lemmatized  \
CourseId                                                                                                
abts-advanced-topics                                   [biztalk, 2006, business, process, management]   
abts-fundamentals                                                        [biztalk, 2006, fundamental]   
agile-team-practice-fundamentals                                       [agile, team, practice, scrum]   
aspdotnet-advanced-topics                                             [asp.net, 3.5, advanced, topic]   
aspdotnet-ajax-advanced-topics                                       [asp.net, ajax, advanced, topic]   
...                                                                                               ...   
nunit-moq-mocking                                                               [mocking, moq, nunit]   
identity-access-management-aws-users                   

In [14]:
distance_matrix.head()

Unnamed: 0,abts-advanced-topics,abts-fundamentals,agile-team-practice-fundamentals,aspdotnet-advanced-topics,aspdotnet-ajax-advanced-topics,aspdotnet-ajax-fundamentals,aspdotnet-ajax-jscript,aspdotnet-data,aspdotnet-fundamentals,aspdotnet-mvc,...,css-grid-bootstrap-4-creating-site,designing-implementing-managing-vmware-vsan-production,aws-amazon-rds,microsoft-cognitive-services-bing-entity-search,microsoft-azure-stack-solutions-architectural-patterns,nunit-moq-mocking,identity-access-management-aws-users,secure-coding-using-components-known-vulnerabilities,aws-s3-implementing,identity-access-management-aws-roles-groups
abts-advanced-topics,1.788139e-07,0.8777073,0.9028671,0.8266551,0.8598844,0.872104,0.905139,0.892603,0.885879,0.886394,...,0.932481,0.871867,0.915809,0.901882,0.887618,0.929807,0.869047,0.90082,0.871951,0.935451
abts-fundamentals,0.8777073,7.152557e-07,0.8282486,0.8695101,0.9012214,0.756523,0.929941,0.810416,0.771511,0.785256,...,0.823283,0.787624,0.856738,0.812297,0.82769,0.830716,0.809906,0.774067,0.801939,0.852451
agile-team-practice-fundamentals,0.9028671,0.8282486,4.172325e-07,0.8720471,0.8832903,0.801998,0.860988,0.81997,0.772445,0.828155,...,0.777674,0.79899,0.818204,0.796084,0.776229,0.823156,0.78378,0.788033,0.791609,0.785467
aspdotnet-advanced-topics,0.8266551,0.8695101,0.8720471,1.788139e-07,0.2450448,0.830162,0.454935,0.839037,0.826492,0.824115,...,0.919004,0.891985,0.890425,0.907641,0.909931,0.922214,0.886675,0.887478,0.864512,0.910803
aspdotnet-ajax-advanced-topics,0.8598844,0.9012214,0.8832903,0.2450448,2.384186e-07,0.858689,0.471271,0.849561,0.850233,0.834755,...,0.902592,0.905764,0.874927,0.888429,0.895838,0.923506,0.872561,0.89167,0.885171,0.900622


In [37]:
def select_most_representative_course(group, distance_matrix):
    avg_distances = group.apply(lambda x: distance_matrix.loc[x.name, group.index].mean(), axis=1)
    most_representative_index = avg_distances.idxmin()
    return pd.Series({'CourseId': most_representative_index})
# For Spectral Clustering
diverse_recommendations_spectral = courses_data_indexed.groupby('spectral_clusters', as_index=False).apply(
    lambda group: select_most_representative_course(group, distance_matrix)
)

In [51]:
diverse_recommendations_list = diverse_recommendations_spectral['CourseId'].tolist()
diverse_recommendations_list
course_dict = {'recommended_diverse_courses': diverse_recommendations_list}


In [52]:
print(course_dict)

{'recommended_diverse_courses': ['skeet-async', 'csharp-6-from-scratch', 'red-hat-enterprise-linux-shell-scripting-fundamentals', 'cpp-fundamentals', 'facial-animation-maya-7-1768']}


In [2]:
with open("diverse_default_recommendations_dict_spectral.pkl", "rb") as f:
    loaded_course_dict = pickle.load(f)

print(loaded_course_dict)

{'recommended_diverse_courses': ['skeet-async', 'csharp-6-from-scratch', 'red-hat-enterprise-linux-shell-scripting-fundamentals', 'cpp-fundamentals', 'facial-animation-maya-7-1768']}


In [54]:
import pickle
with open("diverse_default_recommendations_dict_spectral.pkl", "wb") as f:
    pickle.dump(course_dict, f)

### This uploads the doverse recommendations list into the s3 bucket

In [None]:
import boto3

s3 = boto3.client('s3')
bucket_name = 'your-bucket-name'
courses_filename = 'diverse_default_recommendations_dict_spectral.pkl'

# Upload the recommended courses to S3
with open(courses_filename, 'rb') as file:
    s3.upload_fileobj(file, bucket_name, courses_filename)


In [None]:
# Assuming that we're using AWS S3 Buckets which have versioning enabled 

import re
import pandas as pd
response = s3_client.list_objects_v2(Bucket=bucket_name)
objects = sorted(response['Contents'], key=lambda x: x['LastModified'], reverse=True)

latest_object_key = None
for obj in objects:
    if re.match(r'courses_similarity_df_v.*\.csv', obj['Key']):
        latest_object_key = obj['Key']
        break
if latest_object_key is None:
    raise Exception("No similarity data found in the S3 bucket")



with open("courses_similarity_df_latest.csv", "wb") as download_file:
    s3_client.download_fileobj(bucket_name, latest_object_key, download_file)



similarity_df = pd.read_csv("courses_similarity_df_latest.csv")
