In [1]:
import gensim
import matplotlib.pyplot as plt
import nltk as nltk
import numpy as np
import pandas as pd
import seaborn as sns

from gensim import corpora
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy.spatial.distance import cosine
from scipy.spatial.distance import euclidean

%matplotlib inline

In [2]:
# NOTE(review): presumably a random-state seed; no cell visible in this part of
# the notebook reads it -- confirm it is still needed.
rs = 123

### Calculate the cosine similarity between two example courses


In [3]:
# Two toy course titles used to illustrate cosine similarity on BoW vectors.
# NOTE: `course1` / `course2` are rebound to BoW DataFrames further down the
# notebook, so cells that treat them as strings must run before that point.
course1 = "machine learning for everyone"
course2 = "machine learning for beginners"

In [6]:
# Ordered, de-duplicated vocabulary of both titles. dict.fromkeys keeps the
# first-seen order, whereas a set of strings iterates in an order that can
# change between kernel sessions (string hash randomization), which would make
# the vocabulary and the BoW vectors below non-reproducible.
tokens = list(dict.fromkeys(course1.split() + course2.split()))

In [7]:
# Materialize the vocabulary as a list.
tokens = list(tokens)

In [8]:
tokens

['machine', 'learning', 'for', 'everyone', 'beginners']

Then generate BoW features (token counts) for these two courses (or use the `tokens_dict.doc2bow()` method provided by `gensim`, similar to what we did in the previous lab).


In [9]:
def generate_sparse_bow(course, vocabulary=None):
    """
    Build a binary bag-of-words (BoW) vector for a course text.

    Despite the function name, the result is a dense list: it holds one entry
    per vocabulary token, in vocabulary order.

    Parameters:
    course (str): The course text; it is split on whitespace into words.
    vocabulary (iterable of str, optional): Tokens that define the vector
        dimensions. Duplicates are ignored (first occurrence wins). When
        omitted, the notebook-level `tokens` variable is used.

    Returns:
    list: One int per distinct vocabulary token -- 1 if the token occurs in
    `course`, 0 otherwise.
    """
    if vocabulary is None:
        # Backward-compatible default: fall back to the notebook-level vocabulary.
        vocabulary = tokens

    words = set(course.split())
    # dict.fromkeys de-duplicates while preserving order, so dimension i always
    # corresponds to the i-th distinct vocabulary token (a plain set would not
    # guarantee any particular order).
    return [1 if token in words else 0 for token in dict.fromkeys(vocabulary)]

In [10]:
# Encode both example titles against the shared vocabulary.
bow1, bow2 = map(generate_sparse_bow, (course1, course2))

In [11]:
bow1

[1, 1, 1, 1, 0]

In [12]:
bow2

[1, 1, 1, 0, 1]

From the above cell outputs, we can see the two vectors are very similar. Only two dimensions are different.


Now we can quickly apply the cosine similarity measurement on the two vectors:


In [13]:
# scipy's `cosine` returns a distance; similarity is its complement.
cos_distance = cosine(bow1, bow2)
cos_sim = 1 - cos_distance

In [14]:
# Scale to a percentage first and let the format spec do the rounding:
# `round(x, 2) * 100` reintroduces float error (e.g. 0.57 -> 56.99999999999999).
print(f"The cosine similarity between course `{course1}` and course `{course2}` is {cos_sim * 100:.1f}%")

The cosine similarity between course `machine learning for everyone` and course `machine learning for beginners` is 75.0%


In [16]:
# Euclidean distance between the two BoW vectors, for comparison with the
# cosine measure (`euclidean` is imported in the notebook's top import cell).
euclidean(bow1, bow2)

1.4142135623730951

### TASK: Find similar courses to the course `Machine Learning with Python`

Now you have learned how to calculate cosine similarity between two sample BoW feature vectors. Let's work on some real course BoW feature vectors.


In [17]:
# Load the course BoW table in long format: one row per (course, token) pair,
# with the token's count in the `bow` column.
bows_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML321EN-SkillsNetwork/labs/datasets/courses_bows.csv"
bows_df = pd.read_csv(bows_url)

In [18]:
bows_df.head()

Unnamed: 0,doc_index,doc_id,token,bow
0,0,ML0201EN,ai,2
1,0,ML0201EN,apps,2
2,0,ML0201EN,build,2
3,0,ML0201EN,cloud,1
4,0,ML0201EN,coming,1


In [19]:
# Keep only the columns needed for the similarity computation.
bows_df = bows_df.loc[:, ['doc_id', 'token', 'bow']]

In [20]:
bows_df.head(10)

Unnamed: 0,doc_id,token,bow
0,ML0201EN,ai,2
1,ML0201EN,apps,2
2,ML0201EN,build,2
3,ML0201EN,cloud,1
4,ML0201EN,coming,1
5,ML0201EN,create,1
6,ML0201EN,data,1
7,ML0201EN,developer,1
8,ML0201EN,found,1
9,ML0201EN,fun,1


In [21]:
# Load the course catalogue (COURSE_ID, TITLE, DESCRIPTION).
course_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML321EN-SkillsNetwork/labs/datasets/course_processed.csv"
course_df = pd.read_csv(course_url)

In [22]:
course_df.head()

Unnamed: 0,COURSE_ID,TITLE,DESCRIPTION
0,ML0201EN,robots are coming build iot apps with watson ...,have fun with iot and learn along the way if ...
1,ML0122EN,accelerating deep learning with gpu,training complex deep learning models with lar...
2,GPXX0ZG0EN,consuming restful services using the reactive ...,learn how to use a reactive jax rs client to a...
3,RP0105EN,analyzing big data in r using apache spark,apache spark is a popular cluster computing fr...
4,GPXX0Z2PEN,containerizing packaging and running a sprin...,learn how to containerize package and run a ...


In [23]:
course_df[course_df['COURSE_ID'] == 'ML0101ENv3']

Unnamed: 0,COURSE_ID,TITLE,DESCRIPTION
158,ML0101ENv3,machine learning with python,machine learning can be an incredibly benefici...


In [24]:
# Long-format BoW rows of the target course ("machine learning with python").
is_ml_course = bows_df['doc_id'].eq('ML0101ENv3')
ml_course = bows_df.loc[is_ml_course]
ml_course

Unnamed: 0,doc_id,token,bow
2747,ML0101ENv3,course,1
2748,ML0101ENv3,learning,4
2749,ML0101ENv3,machine,3
2750,ML0101ENv3,need,1
2751,ML0101ENv3,get,1
2752,ML0101ENv3,started,1
2753,ML0101ENv3,python,2
2754,ML0101ENv3,tool,1
2755,ML0101ENv3,tools,1
2756,ML0101ENv3,predict,1


We can see the BoW feature vector is in vertical format but normally feature vectors are in horizontal format. One way to transpose the feature vector from vertical to horizontal is to use the Pandas `pivot()` method:


In [25]:
# Long -> wide: one row for the course, one column per token.
# No `values=` is given, so pivot keeps the value column name as an extra
# column level; the result has ('bow', <token>) MultiIndex columns.
ml_courseT = ml_course.pivot(index=['doc_id'], columns='token').reset_index(level=[0])
ml_courseT

Unnamed: 0_level_0,doc_id,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow
token,Unnamed: 1_level_1,beneficial,course,free,future,get,give,hidden,insights,learning,machine,need,predict,python,started,supervised,tool,tools,trends,unsupervised
0,ML0101ENv3,1,1,1,1,1,1,1,1,4,3,1,1,2,1,1,1,1,1,1


To compare the BoWs of any two courses, which normally have a different set of tokens, we need to create a union token set and then transpose them. We have provided a method called `pivot_two_bows` as follows:


In [26]:
def pivot_two_bows(basedoc, comparedoc):
    """
    Pivot two long-format bag-of-words (BoW) tables into one wide table.

    Parameters:
    basedoc (DataFrame): BoW rows of the base document; must contain the
        columns 'doc_id', 'token' and 'bow'. Any other columns are ignored.
    comparedoc (DataFrame): BoW rows of the document to compare against,
        with the same required columns.

    Returns:
    DataFrame: One row per (doc_id, type) pair, where type is 'base' or
    'compare'. The first two columns are 'doc_id' and 'type'; the remaining
    columns are the union of both documents' tokens holding the 'bow' values,
    with 0 where a document does not contain the token. Rows follow the sorted
    order produced by the pivot, which is not necessarily base-first.

    Raises:
    ValueError: If a document lists the same token more than once (raised by
    `DataFrame.pivot`).
    """
    # `assign` returns new frames, so the callers' DataFrames are never mutated.
    base = basedoc.assign(type='base')
    compare = comparedoc.assign(type='compare')

    # Stack both documents, then spread the tokens into columns. Pinning
    # `values='bow'` keeps unrelated columns (e.g. the raw CSV's `doc_index`)
    # from being pivoted as additional, identically named token columns.
    join = pd.concat([base, compare])
    joinT = (
        join.pivot(index=['doc_id', 'type'], columns='token', values='bow')
        .fillna(0)  # token missing from one document -> count 0
        .reset_index()
    )

    # Plain column labels: drops the leftover 'token' name on the column axis.
    joinT.columns = ['doc_id', 'type'] + list(joinT.columns[2:])

    return joinT

In [27]:
# NOTE: this rebinds `course1` / `course2` from the earlier title strings to
# long-format BoW DataFrames for the two courses being compared.
course1 = bows_df[bows_df['doc_id'] == 'ML0151EN']
course2 = bows_df[bows_df['doc_id'] == 'ML0101ENv3']

In [28]:
course1

Unnamed: 0,doc_id,token,bow
3512,ML0151EN,learn,1
3513,ML0151EN,course,1
3514,ML0151EN,learning,5
3515,ML0151EN,machine,4
3516,ML0151EN,using,1
3517,ML0151EN,r,2
3518,ML0151EN,basics,1
3519,ML0151EN,language,1
3520,ML0151EN,programming,1
3521,ML0151EN,statistical,1


In [29]:
course2

Unnamed: 0,doc_id,token,bow
2747,ML0101ENv3,course,1
2748,ML0101ENv3,learning,4
2749,ML0101ENv3,machine,3
2750,ML0101ENv3,need,1
2751,ML0101ENv3,get,1
2752,ML0101ENv3,started,1
2753,ML0101ENv3,python,2
2754,ML0101ENv3,tool,1
2755,ML0101ENv3,tools,1
2756,ML0101ENv3,predict,1


In [30]:
# Wide table with both courses on the union of their tokens.
# Rows are not necessarily base-first (here the `compare` row is row 0); use
# the `type` column to tell them apart.
bow_vectors = pivot_two_bows(course1, course2)
bow_vectors

Unnamed: 0,doc_id,type,approachable,basics,beneficial,comparison,course,dives,free,future,...,relates,started,statistical,supervised,tool,tools,trends,unsupervised,using,vs
0,ML0101ENv3,compare,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
1,ML0151EN,base,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0


Similarly, we can use the cosine method to calculate their similarity:


In [31]:
# Columns 0-1 are `doc_id` and `type`; the token counts start at column 2.
first_vec = bow_vectors.iloc[0, 2:]
second_vec = bow_vectors.iloc[1, 2:]
# scipy's `cosine` is a distance, so similarity is its complement.
similarity = 1 - cosine(first_vec, second_vec)
similarity

0.6626221399549089

In [39]:
course_df[course_df['COURSE_ID'] == 'ML0101ENv3']

Unnamed: 0,COURSE_ID,TITLE,DESCRIPTION
158,ML0101ENv3,machine learning with python,machine learning can be an incredibly benefici...


In [40]:
# Minimum cosine similarity for a course to be reported as similar.
threshold = 0.5
# Every distinct course id in the catalogue.
courses = course_df['COURSE_ID'].unique()

In [42]:
# NOTE(review): this catalogue row is replaced by the course's BoW rows when
# `course_ML0101ENv3` is reassigned in the similarity-search cell further down.
course_ML0101ENv3 = course_df[course_df['COURSE_ID'] == 'ML0101ENv3']

In [43]:
course_ML0101ENv3

Unnamed: 0,COURSE_ID,TITLE,DESCRIPTION
158,ML0101ENv3,machine learning with python,machine learning can be an incredibly benefici...


In [47]:
# Print every course whose BoW cosine similarity to the target course
# (ML0101ENv3, "machine learning with python") is at least `threshold`.
threshold = 0.5
target_id = 'ML0101ENv3'
courses = course_df['COURSE_ID'].unique()
course_ML0101ENv3 = bows_df[bows_df['doc_id'] == target_id]

# Boolean mask drops the target itself from the candidates.
for course_id in courses[courses != target_id]:
    course = bows_df[bows_df['doc_id'] == course_id]
    if course.empty:
        # The catalogue lists a course with no BoW rows: the pivot would then
        # contain a single row and `.iloc[1]` would raise IndexError.
        continue
    bow_vectors = pivot_two_bows(course, course_ML0101ENv3)
    # Columns 0-1 are `doc_id`/`type`; cosine() is a distance, hence `1 -`.
    similarity = 1 - cosine(bow_vectors.iloc[0, 2:], bow_vectors.iloc[1, 2:])
    if similarity >= threshold:
        print(course_id)

ML0109EN
ML0151EN
excourse46
excourse47
excourse60


In [49]:
np.delete(courses, np.where(courses == 'ML0101ENv3'))

array(['ML0201EN', 'ML0122EN', 'GPXX0ZG0EN', 'RP0105EN', 'GPXX0Z2PEN',
       'CNSC02EN', 'DX0106EN', 'GPXX0FTCEN', 'RAVSCTEST1', 'GPXX06RFEN',
       'GPXX0SDXEN', 'CC0271EN', 'WA0103EN', 'DX0108EN', 'GPXX0PICEN',
       'DAI101EN', 'GPXX0W7KEN', 'GPXX0QR3EN', 'BD0145EN', 'HCC105EN',
       'DE0205EN', 'DS0132EN', 'OS0101EN', 'DS0201EN', 'BENTEST4',
       'CC0210EN', 'PA0103EN', 'HCC104EN', 'GPXX0A1YEN', 'TMP0105EN',
       'PA0107EN', 'DB0113EN', 'PA0109EN', 'PHPM002EN', 'GPXX03HFEN',
       'RP0103', 'RP0103EN', 'BD0212EN', 'GPXX0IBEN', 'SECM03EN',
       'SC0103EN', 'GPXX0YXHEN', 'RP0151EN', 'TA0105', 'SW0201EN',
       'TMP0106', 'GPXX0BUBEN', 'ST0201EN', 'ST0301EN', 'SW0101EN',
       'TMP0101EN', 'DW0101EN', 'BD0143EN', 'WA0101EN', 'GPXX04HEEN',
       'BD0141EN', 'CO0401EN', 'ML0122ENv1', 'BD0151EN', 'TA0106EN',
       'TMP107', 'ML0111EN', 'GPXX048OEN', 'CO0201EN', 'GPXX01DCEN',
       'GPXX04XJEN', 'GPXX0JZ4EN', 'GPXX0ZYVEN', 'GPXX0ZMZEN',
       'GPXX0742EN', 'GPXX0KV4EN', 