# Jaccard similarity method

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

df_courses = pd.read_csv("Coursera.csv")
df_courses

Unnamed: 0,Course Name,University,Difficulty Level,Course Rating,Course URL,Course Description,Skills
0,Write A Feature Length Screenplay For Film Or ...,Michigan State University,Beginner,4.8,https://www.coursera.org/learn/write-a-feature...,Write a Full Length Feature Film Script In th...,Drama Comedy peering screenwriting film D...
1,Business Strategy: Business Model Canvas Analy...,Coursera Project Network,Beginner,4.8,https://www.coursera.org/learn/canvas-analysis...,"By the end of this guided project, you will be...",Finance business plan persona (user experien...
2,Silicon Thin Film Solar Cells,Ecole Polytechnique,Advanced,4.1,https://www.coursera.org/learn/silicon-thin-fi...,This course consists of a general presentation...,chemistry physics Solar Energy film lambda...
3,Finance for Managers,IESE Business School,Intermediate,4.8,https://www.coursera.org/learn/operational-fin...,"When it comes to numbers, there is always more...",accounts receivable dupont analysis analysis...
4,Retrieve Data using Single-Table SQL Queries,Coursera Project Network,Beginner,4.6,https://www.coursera.org/learn/single-table-sq...,In this course you will learn how to effective...,Data Analysis select (sql) database manageme...
...,...,...,...,...,...,...,...
3517,"Capstone: Retrieving, Processing, and Visualiz...",University of Michigan,Beginner,4.6,https://www.coursera.org/learn/python-data-vis...,"In the capstone, students will build a series ...",Databases syntax analysis web Data Visuali...
3518,Patrick Henry: Forgotten Founder,University of Virginia,Intermediate,4.9,https://www.coursera.org/learn/henry,"Give me liberty, or give me death: Remembering...",retirement Causality career history of the ...
3519,Business intelligence and data analytics: Gene...,Macquarie University,Advanced,4.6,https://www.coursera.org/learn/business-intell...,Megatrends heavily influence today's organisat...,analytics tableau software Business Intellig...
3520,Rigid Body Dynamics,Korea Advanced Institute of Science and Techno...,Beginner,4.6,https://www.coursera.org/learn/rigid-body-dyna...,"This course teaches dynamics, one of the basic...",Angular Mechanical Design fluid mechanics F...


In [2]:
# we need to decide (maybe if you face some issues) whether the id will be start with 0 or 1.
df_courses.insert(0, 'id', range(1, len(df_courses) + 1))
df_courses

Unnamed: 0,id,Course Name,University,Difficulty Level,Course Rating,Course URL,Course Description,Skills
0,1,Write A Feature Length Screenplay For Film Or ...,Michigan State University,Beginner,4.8,https://www.coursera.org/learn/write-a-feature...,Write a Full Length Feature Film Script In th...,Drama Comedy peering screenwriting film D...
1,2,Business Strategy: Business Model Canvas Analy...,Coursera Project Network,Beginner,4.8,https://www.coursera.org/learn/canvas-analysis...,"By the end of this guided project, you will be...",Finance business plan persona (user experien...
2,3,Silicon Thin Film Solar Cells,Ecole Polytechnique,Advanced,4.1,https://www.coursera.org/learn/silicon-thin-fi...,This course consists of a general presentation...,chemistry physics Solar Energy film lambda...
3,4,Finance for Managers,IESE Business School,Intermediate,4.8,https://www.coursera.org/learn/operational-fin...,"When it comes to numbers, there is always more...",accounts receivable dupont analysis analysis...
4,5,Retrieve Data using Single-Table SQL Queries,Coursera Project Network,Beginner,4.6,https://www.coursera.org/learn/single-table-sq...,In this course you will learn how to effective...,Data Analysis select (sql) database manageme...
...,...,...,...,...,...,...,...,...
3517,3518,"Capstone: Retrieving, Processing, and Visualiz...",University of Michigan,Beginner,4.6,https://www.coursera.org/learn/python-data-vis...,"In the capstone, students will build a series ...",Databases syntax analysis web Data Visuali...
3518,3519,Patrick Henry: Forgotten Founder,University of Virginia,Intermediate,4.9,https://www.coursera.org/learn/henry,"Give me liberty, or give me death: Remembering...",retirement Causality career history of the ...
3519,3520,Business intelligence and data analytics: Gene...,Macquarie University,Advanced,4.6,https://www.coursera.org/learn/business-intell...,Megatrends heavily influence today's organisat...,analytics tableau software Business Intellig...
3520,3521,Rigid Body Dynamics,Korea Advanced Institute of Science and Techno...,Beginner,4.6,https://www.coursera.org/learn/rigid-body-dyna...,"This course teaches dynamics, one of the basic...",Angular Mechanical Design fluid mechanics F...


## Jaccard Similarity method

To calculate the Jaccard Similarity for all pairs of courses in your dataset, you'll need to compare each course with every other course.Given that Coursera.csv has 3522 rows, this means a large number of comparisons (over 6 million pairs).

$$ 
\text{Jaccard Similarity} = \frac{\text{Size of Intersection}}{\text{Size of Union}}
$$

In [54]:
df_skills = df_courses[['Skills']]
df_skills

Unnamed: 0,Skills
0,Drama Comedy peering screenwriting film D...
1,Finance business plan persona (user experien...
2,chemistry physics Solar Energy film lambda...
3,accounts receivable dupont analysis analysis...
4,Data Analysis select (sql) database manageme...
...,...
3517,Databases syntax analysis web Data Visuali...
3518,retirement Causality career history of the ...
3519,analytics tableau software Business Intellig...
3520,Angular Mechanical Design fluid mechanics F...


In [55]:
def data_preprocessing(df):
    df_cleaned = df.copy()
    #removing paranthesis from skills columns 
    df_cleaned['Skills'] = df_cleaned['Skills'].str.replace('(',' ')
    df_cleaned['Skills'] = df_cleaned['Skills'].str.replace(')',' ')
    
    return df_cleaned

In [56]:
df_skills_cleaned = data_preprocessing(df_skills)
df_skills_cleaned

Unnamed: 0,Skills
0,Drama Comedy peering screenwriting film D...
1,Finance business plan persona user experien...
2,chemistry physics Solar Energy film lambda...
3,accounts receivable dupont analysis analysis...
4,Data Analysis select sql database manageme...
...,...
3517,Databases syntax analysis web Data Visuali...
3518,retirement Causality career history of the ...
3519,analytics tableau software Business Intellig...
3520,Angular Mechanical Design fluid mechanics F...


In [57]:
# Splitting the skills string into a set of individual skills
df_skills_cleaned['Skills'] = df_skills_cleaned['Skills'].str.split().apply(set)
df_skills_cleaned

Unnamed: 0,Skills
0,"{creative, screenwriting, Drama, peering, Writ..."
1,"{project, business-strategy, experience, perso..."
2,"{lambda, silicon, film, Energy, electronics, e..."
3,"{Leadership, analysis, Financial, accounts, Ma..."
4,"{table, retrieval, web, database, information-..."
...,...
3517,"{Databases, syntax, analysis, Computer, python..."
3518,"{book, Dependent, retirement, And, of, Influen..."
3519,"{Dispersion, Intelligence, analysis, Forecasti..."
3520,"{robotics, lecture, Mechanical, dynamics, ener..."


In [58]:
# Define a function to calculate Jaccard Similarity
def jaccard_similarity(set1, set2):
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    return len(intersection) / len(union) if union else 0

In [59]:
# Calculate Jaccard Similarity for all course pairs
n = len(df_skills_cleaned)
similarity_matrix = pd.DataFrame(index=range(n), columns=range(n))

In [60]:
import time
# Start the timer
start_time = time.time()

for i in range(n):
    for j in range(n):
        if i != j:
            similarity_matrix.iloc[i, j] = jaccard_similarity(df_skills_cleaned['Skills'].iloc[i], df_skills_cleaned['Skills'].iloc[j])
            
# Stop the timer
end_time = time.time()

# Calculate and print the elapsed time
elapsed_time = end_time - start_time
print(f"Time taken to compute the similarity matrix: {elapsed_time} seconds")

Time taken to compute the similarity matrix: 1579.100442647934 seconds


In [61]:
similarity_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3512,3513,3514,3515,3516,3517,3518,3519,3520,3521
0,,0.0,0.034483,0.0,0.0,0.0,0.033333,0.0,0.034483,0.0,...,0.0,0.0,0.076923,0.0,0.0,0.0,0.033333,0.0,0.0,0.0
1,0.0,,0.0,0.064516,0.0,0.034483,0.185185,0.0,0.068966,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.064516,0.0,0.0
2,0.034483,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.068966,0.0
3,0.0,0.064516,0.0,,0.027778,0.0,0.03125,0.0,0.032258,0.0,...,0.060606,0.0,0.0,0.026316,0.0,0.066667,0.0,0.096774,0.0,0.0
4,0.0,0.0,0.0,0.027778,,0.0,0.0,0.0,0.0,0.0,...,0.055556,0.0,0.0,0.05,0.027778,0.09375,0.0,0.057143,0.0,0.057143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3517,0.0,0.0,0.0,0.066667,0.09375,0.0,0.0,0.066667,0.0,0.0,...,0.222222,0.0,0.0,0.027778,0.066667,,0.0,0.066667,0.0,0.0
3518,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068966,0.0,...,0.030303,0.0,0.074074,0.0,0.0,0.0,,0.0,0.0,0.0
3519,0.0,0.064516,0.0,0.096774,0.057143,0.0,0.03125,0.0,0.066667,0.0,...,0.166667,0.0,0.0,0.0,0.030303,0.066667,0.0,,0.0,0.0
3520,0.0,0.0,0.068966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.068966,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


In [62]:
# Or exporting as a Pickle file for better performance with large matrices
similarity_matrix.to_pickle('similarity_matrix.pkl')
df_courses.to_pickle('df_courses.pkl')

In [63]:
# Get the first 5 more similar courses of a specific course
def get_similar_courses(similarity_matrix, course_index, top_n=5):
    # Extract the similarity values for the given course
    similar_courses = similarity_matrix.iloc[course_index]

    # Sort the courses by similarity (excluding the course itself)
    most_similar = similar_courses.sort_values(ascending=False).drop(course_index)

    # Return the indices of the top N similar courses
    return most_similar.head(top_n).index

# Example usage
course_index = 104  # Change this to the index of the course you're interested in
top_similar_courses = get_similar_courses(similarity_matrix, course_index, top_n=5)

# Print the indices of the top similar courses
print("Top 5 similar courses for course index", course_index, ":", top_similar_courses)

Top 5 similar courses for course index 104 : Index([3330, 3423, 3000, 452, 1022], dtype='int64')


In [64]:
# Here, I'm assuming there isn't one and creating a simple sequential ID
df_courses['CourseID'] = range(1, len(df_courses) + 1)

# Create a new DataFrame with just the IDs and Course Names
df_course_mapping = df_courses[['CourseID', 'Course Name']].copy()

# Display the first few rows of the mapping DataFrame
df_course_mapping.head()

Unnamed: 0,CourseID,Course Name
0,1,Write A Feature Length Screenplay For Film Or ...
1,2,Business Strategy: Business Model Canvas Analy...
2,3,Silicon Thin Film Solar Cells
3,4,Finance for Managers
4,5,Retrieve Data using Single-Table SQL Queries


In [65]:
def get_course_names_from_indices(indices, df_mapping):
    course_names = []
    for idx in indices:
        # Retrieve and append the course name using the index
        course_names.append(df_mapping.iloc[idx]['Course Name'])
    return course_names

# Retrieve the course names for the top similar courses
top_course_names = get_course_names_from_indices(top_similar_courses, df_courses)

print(f"Top 5 similar courses for course at index {course_index} ({df_courses.iloc[course_index]['Course Name']}):")
for name in top_course_names:
    print(name)

Top 5 similar courses for course at index 104 (Business Writing):
High-Impact Business Writing
High-Impact Business Writing
Business English: Meetings
Business English: Capstone Project
Take Your English Communication Skills to the Next Level
