In [14]:
# Importing libraries
# Pandas for data manipulation
# TfidfVectorizer to convert text to vector
# linear_kernel to compute the dot product of two vectors
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [15]:
# importing the dataset 
# Path : data/data.csv
df = pd.read_csv('../data/udemy_courses.csv')

In [16]:
#  Data Visualization
df.head()

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5,2017-01-18T20:58:58Z,Business Finance
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39.0,2017-03-09T16:34:20Z,Business Finance
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5,2016-12-19T19:26:30Z,Business Finance
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3.0,2017-05-30T20:07:24Z,Business Finance
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2.0,2016-12-13T14:57:18Z,Business Finance


In [17]:
# Types of course levels
df['level'].unique()

array(['All Levels', 'Intermediate Level', 'Beginner Level',
       'Expert Level'], dtype=object)

In [18]:
# Now we will use TF-IDF vectorizer to convert text to vector
# TF-IDF is an abbreviation for Term Frequency Inverse Document Frequency.
# This is very common algorithm to transform text into a meaningful representation of numbers which is used to fit machine algorithm for prediction.

# Removing all english stop words such as 'the', 'a'
tf = TfidfVectorizer(analyzer="word", ngram_range=(1, 2), min_df=0, stop_words="english")
tf

In [19]:
# Combine relevant features into a single text column
df['features'] = df['level'] + ' ' + df['course_title'] 
df['features']

0           All Levels Ultimate Investment Banking Course
1       All Levels Complete GST Course & Certification...
2       Intermediate Level Financial Modeling for Busi...
3       All Levels Beginner to Pro - Financial Analysi...
4       Intermediate Level How To Maximize Your Profit...
                              ...                        
3673    All Levels Learn jQuery from Scratch - Master ...
3674    Beginner Level How To Design A WordPress Websi...
3675             All Levels Learn and Build using Polymer
3676    All Levels CSS Animations: Create Amazing Effe...
3677    Beginner Level Using MODX CMS to Build Website...
Name: features, Length: 3678, dtype: object

In [20]:
# Now we will fit and transform the data using TF-IDF vectorized data
tfidf_matrix = tf.fit_transform(df["features"])
print("Shape :", tfidf_matrix.shape)
tfidf_matrix

Shape : (3678, 15106)


<3678x15106 sparse matrix of type '<class 'numpy.float64'>'
	with 44760 stored elements in Compressed Sparse Row format>

In [21]:
# Computing the cosine similarity on Tfidf matrix
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [39]:
# creating a mapping of course title to index number
# here we will use course_difficulty and features to find the specific difficulty and subject 
user_difficulty = input("Enter the difficulty of course ( 'All Levels', 'Intermediate Level', 'Beginner Level','Expert Level' )  : ")
user_subject = input("Enter the subject of course : ")
user_course_index = df.index[(df['level'] == user_difficulty) & (df['features'].str.contains(user_subject))].tolist()

# User course index is a list of all the courses that matches the user difficulty and subject

In [54]:
# Here we are comparing all the courses with the user course index and average the similarity scores for each course for each user course index

similarity_scores = []
for i in user_course_index:
    similarity_scores.append(list(enumerate(cosine_similarities[i]))) # Append a list of tuples to similarity_scores

# Now similarity_scores is a list of lists of tuples
# Each inner list contains tuples of the form (index, similarity_score)
# We want to average the similarity scores for each course 

sums = {}
counts = {}

for inner_list in similarity_scores:
    for tup in inner_list:
        index, score = tup
        sums[index] = sums.get(index, 0) + score
        counts[index] = counts.get(index, 0) + 1

avg_similarity_scores = [(index, sums[index] / counts[index]) for index in sums]


In [55]:
# Here we are sorting the weighted_scores list in descending order
similar_courses = sorted(avg_similarity_scores, key=lambda x: x[1], reverse=True)
similar_courses 


[(3343, 0.3918599485832597),
 (2681, 0.37103492446962966),
 (3333, 0.3100167447486889),
 (3160, 0.2903441431151472),
 (30, 0.28864484034687515),
 (3194, 0.2625846334470548),
 (3138, 0.2193883768562261),
 (2497, 0.21594193729798236),
 (3284, 0.19753654607074664),
 (147, 0.17061954215365097),
 (3392, 0.1503419518891245),
 (2528, 0.14469029469853434),
 (2570, 0.14234538806036456),
 (2960, 0.12575149500952595),
 (3200, 0.12468921685121104),
 (536, 0.12386971457812872),
 (2859, 0.11747437668917693),
 (2607, 0.11594092836779103),
 (2660, 0.11594092836779103),
 (14, 0.11461672013448813),
 (3192, 0.11364854333249984),
 (334, 0.11223521019397083),
 (3203, 0.10544431119532384),
 (762, 0.10311751307443483),
 (3197, 0.09905203487475196),
 (2494, 0.09631318733493448),
 (2962, 0.09545982308502743),
 (863, 0.09405520853030507),
 (2507, 0.09200020355107481),
 (3326, 0.09050288828251056),
 (3018, 0.0901080687208737),
 (3312, 0.08999575089039534),
 (2707, 0.08530322156197638),
 (3123, 0.0852902244721548

In [57]:
# Printing the top 5 similar courses
print("Recommended Courses:")
for i,score in similar_courses[1:6]:
    print(f"{df['course_title'][i]} - Similarity Score: {score:.2f}")
    print(df.iloc[i])
    print(i)
    print("-------------------------------------------")


Recommended Courses:
Python for Beginners: Python Programming Language | Tutorial - Similarity Score: 0.37
course_id                                                         477702
course_title           Python for Beginners: Python Programming Langu...
url                                 https://www.udemy.com/python-course/
is_paid                                                             True
price                                                                150
num_subscribers                                                     6153
num_reviews                                                          125
num_lectures                                                          84
level                                                     Beginner Level
content_duration                                                     5.0
published_timestamp                                 2015-06-14T18:18:57Z
subject                                                  Web Development
features         