In [1]:
# Importing libraries
# Pandas for data manipulation
# TfidfVectorizer to convert text to vector
# linear_kernel to compute the dot product of two vectors
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
# importing the dataset 
# Path : data/data.csv
df = pd.read_csv('../data/data.csv')

In [3]:
#  Data Visualization
df.head()

Unnamed: 0,course_code,course_title,course_organization,course_Certificate_type,course_rating,course_difficulty,course_students_enrolled
0,134,(ISC)² Systems Security Certified Practitioner...,(ISC)²,SPECIALIZATION,4.7,Beginner,5.3k
1,743,A Crash Course in Causality: Inferring Causal...,University of Pennsylvania,COURSE,4.7,Intermediate,17k
2,874,A Crash Course in Data Science,Johns Hopkins University,COURSE,4.5,Mixed,130k
3,413,A Law Student's Toolkit,Yale University,COURSE,4.7,Mixed,91k
4,635,A Life of Happiness and Fulfillment,Indian School of Business,COURSE,4.8,Mixed,320k


In [4]:

# highest value of course_code and lowest value of course_code
df['course_code'].max(), df['course_code'].min()

(890, 0)

In [5]:
# Types of course_certifications
df['course_Certificate_type'].unique()

array(['SPECIALIZATION', 'COURSE', 'PROFESSIONAL CERTIFICATE'],
      dtype=object)

In [6]:
# Types of course_difficulty
df['course_difficulty'].unique()

array(['Beginner', 'Intermediate', 'Mixed', 'Advanced'], dtype=object)

In [7]:
# Min max course rating
df['course_rating'].max(), df['course_rating'].min()

(5.0, 3.3)

Here there are 7 Headers ( course_id, course_title,course_organization,course_Certificate_type,course_rating,course_difficulty,course_students_enrolled)
- course Code starts from `0` and ends at `890`.
- course_title is the name of the course.
- course_organization is the organization which is providing the course.
- course_Certificate_type is the type of certificate provided by the course. `Specialization | Course | Professional Certificate`
- course_rating is the rating of the course. `0.0 - 5.0`
- course_difficulty is the difficulty level of the course. `Beginner | Intermediate | Mixed | Advanced`
- course_students_enrolled is the number of students enrolled for the course. 



In [8]:
# Now we will use TF-IDF vectorizer to convert text to vector
# TF-IDF is an abbreviation for Term Frequency Inverse Document Frequency.
# This is very common algorithm to transform text into a meaningful representation of numbers which is used to fit machine algorithm for prediction.

# Removing all english stop words such as 'the', 'a'
tf = TfidfVectorizer(analyzer="word", ngram_range=(1, 2), min_df=0, stop_words="english")
tf

In [9]:
# Combine relevant features into a single text column
df['features'] = df['course_difficulty'] + ' ' + df['course_title'] + ' ' + df['course_organization']
df['features']

0      Beginner (ISC)² Systems Security Certified Pra...
1      Intermediate A Crash Course in Causality:  Inf...
2      Mixed A Crash Course in Data Science Johns Hop...
3          Mixed A Law Student's Toolkit Yale University
4      Mixed A Life of Happiness and Fulfillment Indi...
                             ...                        
886    Intermediate Программирование на Python Mail.R...
887    Mixed Психолингвистика (Psycholinguistics) Sai...
888    Intermediate Разработка интерфейсов: вёрстка и...
889    Intermediate Русский как иностранный Saint Pet...
890    Beginner Финансовые инструменты для частного и...
Name: features, Length: 891, dtype: object

In [10]:
# Now we will fit and transform the data using TF-IDF vectorized data
tfidf_matrix = tf.fit_transform(df["features"])
print("Shape :", tfidf_matrix.shape)
tfidf_matrix

Shape : (891, 5145)


<891x5145 sparse matrix of type '<class 'numpy.float64'>'
	with 12019 stored elements in Compressed Sparse Row format>

In [11]:
# Computing the cosine similarity on Tfidf matrix
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [12]:
cosine_similarities

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00331788],
       [0.        , 1.        , 0.22706319, ..., 0.00912975, 0.01291189,
        0.00250773],
       [0.        , 0.22706319, 1.        , ..., 0.        , 0.00459503,
        0.00360564],
       ...,
       [0.        , 0.00912975, 0.        , ..., 1.        , 0.01130791,
        0.        ],
       [0.        , 0.01291189, 0.00459503, ..., 0.01130791, 1.        ,
        0.00310601],
       [0.00331788, 0.00250773, 0.00360564, ..., 0.        , 0.00310601,
        1.        ]])

In [13]:
# creating a mapping of course title to index number
# here we will use course_difficulty and features to find the specific difficulty and subject
user_difficulty = input("Enter the difficulty of course ( 'Beginner', 'Intermediate', 'Mixed', 'Advanced' )  : ")
user_subject = input("Enter the subject of course : ")
user_course_index = df.index[(df['course_difficulty'] == user_difficulty) & (df['features'].str.contains(user_subject))].tolist()[0]


In [14]:
# Here the user_course_index is a number which represent the input values of user
user_course_index

57

In [15]:
# Here we are comparing all the courses with the user_course_index and storing the similarity scores in a list
similarity_scores = list(enumerate(cosine_similarities[user_course_index]))

In [16]:
similarity_scores

[(0, 0.0),
 (1, 0.01665482956802046),
 (2, 0.005927057231014591),
 (3, 0.005979011054229988),
 (4, 0.0),
 (5, 0.00479137627978091),
 (6, 0.0),
 (7, 0.01765813228297424),
 (8, 0.0),
 (9, 0.017536366546024505),
 (10, 0.017536366546024505),
 (11, 0.01985930299283921),
 (12, 0.0),
 (13, 0.0),
 (14, 0.0),
 (15, 0.0),
 (16, 0.01555239737403904),
 (17, 0.005875255184905013),
 (18, 0.006110858290563248),
 (19, 0.02145816489692866),
 (20, 0.0),
 (21, 0.007745166835048887),
 (22, 0.1233110898003604),
 (23, 0.005557817110552113),
 (24, 0.0),
 (25, 0.004580038374142415),
 (26, 0.02632690704054024),
 (27, 0.0),
 (28, 0.14254959829355016),
 (29, 0.1299408912642287),
 (30, 0.003992719234613952),
 (31, 0.006998132976252353),
 (32, 0.006770241496651969),
 (33, 0.007761841410622288),
 (34, 0.005835060443648006),
 (35, 0.0),
 (36, 0.0),
 (37, 0.02346001216692179),
 (38, 0.03426928097558714),
 (39, 0.023160631063457525),
 (40, 0.00611061100067345),
 (41, 0.007221116708323416),
 (42, 0.0),
 (43, 0.00534037

In [17]:
# Combine similarity scores with course ratings
weighted_scores = [(i, score * df['course_rating'][i]) for i, score in similarity_scores]

In [18]:
# Here we are sorting the weighted_scores list in descending order
similar_courses = sorted(weighted_scores, key=lambda x: x[1], reverse=True)


In [19]:
# This is the list of similar courses
similar_courses

[(57, 4.6),
 (56, 2.3657379773501757),
 (570, 2.2569808698249925),
 (58, 1.7351821482969958),
 (752, 1.4618870757086082),
 (487, 1.3539659369773769),
 (849, 1.2365509320693016),
 (190, 1.2084154008673034),
 (563, 1.1975405775183827),
 (841, 1.1558127086774301),
 (564, 1.1452871303408108),
 (55, 1.1430898462734327),
 (567, 1.1067967528918015),
 (674, 1.0334495204733098),
 (688, 0.9267456032138145),
 (682, 0.9267428638625661),
 (568, 0.9000603609055904),
 (532, 0.8860358460964333),
 (128, 0.8746735927194738),
 (684, 0.8361126049169554),
 (569, 0.8309750359989698),
 (200, 0.8213616162243298),
 (683, 0.812226436567496),
 (595, 0.8051993944040854),
 (520, 0.7993076567867085),
 (763, 0.763775818735983),
 (566, 0.7549190153077534),
 (571, 0.7235914500587473),
 (193, 0.7165632624738657),
 (375, 0.701959826167594),
 (45, 0.6888686003954937),
 (598, 0.6719802619288179),
 (685, 0.6597819226607893),
 (596, 0.6589221743484497),
 (597, 0.6589221743484497),
 (850, 0.6533934456353286),
 (389, 0.653205

In [20]:
# Printing the top 5 similar courses
print("Recommended Courses:")
for i, score in similar_courses[1:6]:
    print(f"{df['course_title'][i]} (Organization: {df['course_organization'][i]}) - Similarity Score: {score:.2f}")
    print(df.iloc[i])
    print("-------------------------------------------")


Recommended Courses:
Applied Data Science with Python (Organization: University of Michigan) - Similarity Score: 2.37
course_code                                                                26
course_title                                 Applied Data Science with Python
course_organization                                    University of Michigan
course_Certificate_type                                        SPECIALIZATION
course_rating                                                             4.5
course_difficulty                                                Intermediate
course_students_enrolled                                                 480k
features                    Intermediate Applied Data Science with Python ...
Name: 56, dtype: object
-------------------------------------------
Machine Learning with Python (Organization: IBM) - Similarity Score: 2.26
course_code                                                           321
course_title                              