In [2]:
# Importing libraries
# Pandas for data manipulation
# TfidfVectorizer to convert text to vector
# linear_kernel to compute the dot product of two vectors
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [5]:
# Load the Udemy course catalogue into a DataFrame.
# Path (relative to this notebook): ../data/udemy_courses.csv
df = pd.read_csv(filepath_or_buffer='../data/udemy_courses.csv')

In [6]:
# Preview the first five rows to check that the file loaded as expected
df.head(n=5)

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5,2017-01-18T20:58:58Z,Business Finance
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39.0,2017-03-09T16:34:20Z,Business Finance
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5,2016-12-19T19:26:30Z,Business Finance
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3.0,2017-05-30T20:07:24Z,Business Finance
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2.0,2016-12-13T14:57:18Z,Business Finance


In [7]:
# Distinct difficulty levels that occur in the catalogue
pd.unique(df['level'])

array(['All Levels', 'Intermediate Level', 'Beginner Level',
       'Expert Level'], dtype=object)

In [8]:
# Build the TF-IDF (Term Frequency - Inverse Document Frequency) vectorizer
# that will turn each course's text into a numeric vector.
# TF-IDF weights a term by how often it occurs in a document, discounted by
# how many documents contain it.

# analyzer="word", ngram_range=(1, 2): features are single words and word pairs.
# stop_words="english": drop common English words such as 'the', 'a'.
# min_df=1: keep every term that occurs in at least one document. The previous
# value, the integer 0, filtered nothing either, but scikit-learn releases
# that validate parameters reject it (an integer min_df must be >= 1).
tf = TfidfVectorizer(analyzer="word", ngram_range=(1, 2), min_df=1, stop_words="english")
tf

In [21]:
# Build the text that will be vectorized: "<level> <course_title>" per course
df['features'] = df['level'].str.cat(df['course_title'], sep=' ')
df['features']

0           All Levels Ultimate Investment Banking Course
1       All Levels Complete GST Course & Certification...
2       Intermediate Level Financial Modeling for Busi...
3       All Levels Beginner to Pro - Financial Analysi...
4       Intermediate Level How To Maximize Your Profit...
                              ...                        
3673    All Levels Learn jQuery from Scratch - Master ...
3674    Beginner Level How To Design A WordPress Websi...
3675             All Levels Learn and Build using Polymer
3676    All Levels CSS Animations: Create Amazing Effe...
3677    Beginner Level Using MODX CMS to Build Website...
Name: features, Length: 3678, dtype: object

In [22]:
# Learn the vocabulary from the features text and encode every course as a
# TF-IDF row vector (one row per course, one column per term)
course_texts = df["features"]
tfidf_matrix = tf.fit_transform(course_texts)
print(f"Shape : {tfidf_matrix.shape}")
tfidf_matrix

Shape : (3678, 15106)


<3678x15106 sparse matrix of type '<class 'numpy.float64'>'
	with 44760 stored elements in Compressed Sparse Row format>

In [23]:
# Pairwise similarity between every pair of courses.
# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel is the cosine similarity.
# With the second argument omitted, linear_kernel compares the matrix with itself.
cosine_similarities = linear_kernel(tfidf_matrix)

In [24]:
# Pick the course that the recommendations will be based on: the first course
# whose level equals the requested difficulty and whose features text
# ("<level> <course_title>") contains the requested subject.
# Surrounding whitespace typed by the user is removed before matching.
user_difficulty = input("Enter the difficulty of course ( 'All Levels', 'Intermediate Level', 'Beginner Level','Expert Level' )  : ").strip()
user_subject = input("Enter the subject of course : ").strip()

# regex=False: treat the subject as literal text, so input such as "C++" or
#              "(" is not parsed as a regular expression.
# case=False:  "python" should also match "Python".
# na=False:    rows without features text count as non-matching.
matching_rows = (df['level'] == user_difficulty) & df['features'].str.contains(
    user_subject, case=False, regex=False, na=False
)

# Work with row positions rather than index labels, because the result is
# used to select a row of the similarity matrix.
matching_positions = matching_rows.to_numpy().nonzero()[0]
if len(matching_positions) == 0:
    # Fail with an explicit message instead of an IndexError from [0]
    raise ValueError(
        f"No course found with difficulty {user_difficulty!r} and subject {user_subject!r}"
    )
user_course_index = int(matching_positions[0])


In [25]:
# Pair every course's row position with its similarity to the selected course
similarity_scores = [
    (position, score)
    for position, score in enumerate(cosine_similarities[user_course_index])
]

In [27]:
# Rank the (position, score) pairs from most similar to least similar
similar_courses = list(similarity_scores)
similar_courses.sort(key=lambda pair: pair[1], reverse=True)


In [28]:
# Print the 5 courses most similar to the selected one.
# The selected course is excluded by its position instead of by dropping the
# first ranked entry: when another course has identical features text (tied
# score), the stable sort can rank that course first, and slicing [1:6] would
# then skip it and recommend the selected course to itself.
top_matches = [(i, score) for i, score in similar_courses if i != user_course_index][:5]

print("Recommended Courses:")
for i, score in top_matches:
    # i is a row position (it comes from enumerating a similarity row), so
    # use .iloc for both the title and the full record.
    course = df.iloc[i]
    print(f"{course['course_title']} - Similarity Score: {score:.2f}")
    print(course)
    print("-------------------------------------------")


Recommended Courses:
Python Algo Stock Trading: Automate Your Trading! - Similarity Score: 0.41
course_id                                                        1170894
course_title           Python Algo Stock Trading: Automate Your Trading!
url                    https://www.udemy.com/algorithmic-stock-tradin...
is_paid                                                             True
price                                                                 95
num_subscribers                                                     1165
num_reviews                                                           21
num_lectures                                                          41
level                                                     Beginner Level
content_duration                                                     2.5
published_timestamp                                 2017-05-28T23:41:03Z
subject                                                 Business Finance
features               Begin