### Dataset link: https://github.com/DataThinkers/Course-Recommender-System/blob/main/udemy_courses.csv

### Import the library

In [1]:
import numpy as np
import pandas as pd

### Load the dataset

In [2]:
df = pd.read_csv('udemy_courses.csv')

In [3]:
# first row of the dataframe
df.head(1)

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5,2017-01-18T20:58:58Z,Business Finance


In [4]:
# shape of the dataset
df.shape

(3678, 12)

In [5]:
# checking for null values
df.isnull().sum()

course_id              0
course_title           0
url                    0
is_paid                0
price                  0
num_subscribers        0
num_reviews            0
num_lectures           0
level                  0
content_duration       0
published_timestamp    0
subject                0
dtype: int64

In [6]:
# checking for duplicates
df.duplicated().any()

True

In [7]:
# duplicates
df[df.duplicated()]

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject
787,837322,Essentials of money value: Get a financial Life !,https://www.udemy.com/essentials-of-money-value/,True,20,0,0,20,All Levels,0.616667,2016-05-16T18:28:30Z,Business Finance
788,1157298,Introduction to Forex Trading Business For Beg...,https://www.udemy.com/introduction-to-forex-tr...,True,20,0,0,27,Beginner Level,1.5,2017-04-23T16:19:01Z,Business Finance
894,1035638,Understanding Financial Statements,https://www.udemy.com/understanding-financial-...,True,25,0,0,10,All Levels,1.0,2016-12-15T14:56:17Z,Business Finance
1100,1084454,CFA Level 2- Quantitative Methods,https://www.udemy.com/cfa-level-2-quantitative...,True,40,0,0,35,All Levels,5.5,2017-07-02T14:29:35Z,Business Finance
1473,185526,MicroStation - Células,https://www.udemy.com/microstation-celulas/,True,20,0,0,9,Beginner Level,0.616667,2014-04-15T21:48:55Z,Graphic Design
2561,28295,Learn Web Designing & HTML5/CSS3 Essentials in...,https://www.udemy.com/build-beautiful-html5-we...,True,75,43285,525,24,All Levels,4.0,2013-01-03T00:55:31Z,Web Development


In [8]:
# dropping the duplicates
df = df.drop_duplicates()

In [9]:
# checking for shape after dropping the duplicates
df.shape

(3672, 12)

### Popularity Based Recommendation System

In [10]:
def popularity_based_recommendation(data,top_n=5):
    
    # Calculate popularity score for each course
    df['popularity_score'] = 0.6 * df['num_subscribers'] + 0.4 * df['num_reviews']
    
    # Sort courses by popularity score in descending order
    df_sorted = df.sort_values(by='popularity_score', ascending=False)
    
    # Return the recommended courses (course titles and popularity scores)
    recommended_courses = df_sorted[['course_title', 'popularity_score']].head(top_n)
    
    return recommended_courses

In [11]:
popularity_based_recommendation(df)

Unnamed: 0,course_title,popularity_score
2827,Learn HTML5 Programming From Scratch,164805.4
3032,Coding for Entrepreneurs Basic,96729.0
3230,The Web Developer Bootcamp,83928.4
3232,The Complete Web Developer Course 2.0,77672.0
2783,Build Your First Website in 1 Week with HTML5 ...,74544.2


### Content Based Recommendation System

In [12]:
import neattext.functions as nfx

In [13]:
df['course_title'].head()

0                   Ultimate Investment Banking Course
1    Complete GST Course & Certification - Grow You...
2    Financial Modeling for Business Analysts and C...
3    Beginner to Pro - Financial Analysis in Excel ...
4         How To Maximize Your Profits Trading Options
Name: course_title, dtype: object

In [14]:
# removing the stopwords and special characters
df['course_title'] = df['course_title'].apply(nfx.remove_stopwords)
df['course_title'] = df['course_title'].apply(nfx.remove_special_characters)

In [15]:
df['course_title'].head()

0                   Ultimate Investment Banking Course
1    Complete GST Course  Certification  Grow Practice
2     Financial Modeling Business Analysts Consultants
3          Beginner Pro  Financial Analysis Excel 2017
4                     Maximize Profits Trading Options
Name: course_title, dtype: object

In [16]:
# combining two features
df['title_subject'] = df['course_title'] +' '+df['subject']

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

In [18]:
cv = CountVectorizer(max_features=3000)
vectors = cv.fit_transform(df['title_subject']).toarray()

In [19]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [20]:
# shape of the vectors
vectors.shape

(3672, 3000)

In [21]:
# count of the selected features
len(cv.get_feature_names_out())

3000

#### Cosine Similarity

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

In [23]:
similarity = cosine_similarity(vectors)

In [24]:
sorted(list(enumerate(similarity[0])),reverse=True,key=lambda x:x[1])[1:6]

[(39, 0.7715167498104596),
 (240, 0.6666666666666669),
 (417, 0.6666666666666669),
 (418, 0.6172133998483676),
 (657, 0.6172133998483676)]

In [25]:
def recommend(course):
    # let's featch the index
    course_index = df[df['course_title']==course].index[0]
    distances = similarity[course_index]
    courses_list = sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:6]
    for i in courses_list:
        print(df.iloc[i[0]]['course_title'])

In [26]:
recommend("know HTML Learn HTML Basics")

WordPress Development Beginners
Wordpress Theme Development Beginners
Wordpress beginners Build Websites Fast Coding
Website Coding WordPress  Web Skills
Kids Coding  Beginners CSS
