In [3]:
# Importing libraries
# Pandas for data manipulation
# TfidfVectorizer to convert text to vector
# linear_kernel to compute the dot product of two vectors
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [4]:
# importing the dataset 
# Path : data/data.csv
df = pd.read_csv('../data/data.csv')

In [10]:
#  Data Visualization
df.head()

Unnamed: 0,course_code,course_title,course_organization,course_Certificate_type,course_rating,course_difficulty,course_students_enrolled
0,134,(ISC)² Systems Security Certified Practitioner...,(ISC)²,SPECIALIZATION,4.7,Beginner,5.3k
1,743,A Crash Course in Causality: Inferring Causal...,University of Pennsylvania,COURSE,4.7,Intermediate,17k
2,874,A Crash Course in Data Science,Johns Hopkins University,COURSE,4.5,Mixed,130k
3,413,A Law Student's Toolkit,Yale University,COURSE,4.7,Mixed,91k
4,635,A Life of Happiness and Fulfillment,Indian School of Business,COURSE,4.8,Mixed,320k


In [8]:

# highest value of course_code and lowest value of course_code
df['course_code'].max(), df['course_code'].min()

(890, 0)

In [12]:
# Types of course_certifications
df['course_Certificate_type'].unique()

array(['SPECIALIZATION', 'COURSE', 'PROFESSIONAL CERTIFICATE'],
      dtype=object)

In [13]:
# Types of course_difficulty
df['course_difficulty'].unique()

array(['Beginner', 'Intermediate', 'Mixed', 'Advanced'], dtype=object)

In [14]:
# Min max course rating
df['course_rating'].max(), df['course_rating'].min()

(5.0, 3.3)

Here there are 7 Headers ( course_id, course_title,course_organization,course_Certificate_type,course_rating,course_difficulty,course_students_enrolled)
- course Code starts from `0` and ends at `890`.
- course_title is the name of the course.
- course_organization is the organization which is providing the course.
- course_Certificate_type is the type of certificate provided by the course. `Specialization | Course | Professional Certificate`
- course_rating is the rating of the course. `0.0 - 5.0`
- course_difficulty is the difficulty level of the course. `Beginner | Intermediate | Mixed | Advanced`
- course_students_enrolled is the number of students enrolled for the course. 



In [20]:
# Now we will use TF-IDF vectorizer to convert text to vector
# TF-IDF is an abbreviation for Term Frequency Inverse Document Frequency.
# This is very common algorithm to transform text into a meaningful representation of numbers which is used to fit machine algorithm for prediction.

# Removing all english stop words such as 'the', 'a'
tf = TfidfVectorizer(analyzer="word", ngram_range=(1, 2), min_df=0, stop_words="english")
tf

In [21]:
# Combine relevant features into a single text column
df['features'] = df['course_difficulty'] + ' ' + df['course_title'] + ' ' + df['course_organization']
df['features']

0      Beginner (ISC)² Systems Security Certified Pra...
1      Intermediate A Crash Course in Causality:  Inf...
2      Mixed A Crash Course in Data Science Johns Hop...
3          Mixed A Law Student's Toolkit Yale University
4      Mixed A Life of Happiness and Fulfillment Indi...
                             ...                        
886    Intermediate Программирование на Python Mail.R...
887    Mixed Психолингвистика (Psycholinguistics) Sai...
888    Intermediate Разработка интерфейсов: вёрстка и...
889    Intermediate Русский как иностранный Saint Pet...
890    Beginner Финансовые инструменты для частного и...
Name: features, Length: 891, dtype: object

In [27]:
# Now we will fit and transform the data using TF-IDF vectorized data
tfidf_matrix = tf.fit_transform(df["features"])
print("Shape :", tfidf_matrix.shape)
tfidf_matrix

Shape : (891, 5145)


<891x5145 sparse matrix of type '<class 'numpy.float64'>'
	with 12019 stored elements in Compressed Sparse Row format>

In [28]:
# Computing the cosine similarity on Tfidf matrix
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [29]:
cosine_similarities

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00331788],
       [0.        , 1.        , 0.22706319, ..., 0.00912975, 0.01291189,
        0.00250773],
       [0.        , 0.22706319, 1.        , ..., 0.        , 0.00459503,
        0.00360564],
       ...,
       [0.        , 0.00912975, 0.        , ..., 1.        , 0.01130791,
        0.        ],
       [0.        , 0.01291189, 0.00459503, ..., 0.01130791, 1.        ,
        0.00310601],
       [0.00331788, 0.00250773, 0.00360564, ..., 0.        , 0.00310601,
        1.        ]])

In [32]:
# creating a mapping of course title to index number
# here we will use course_difficulty and features to find the specific difficulty and subject
user_difficulty = input("Enter the difficulty of course ( 'Beginner', 'Intermediate', 'Mixed', 'Advanced' )  : ")
user_subject = input("Enter the subject of course : ")
user_course_index = df.index[(df['course_difficulty'] == user_difficulty) & (df['features'].str.contains(user_subject))].tolist()[0]


In [33]:
# Here the user_course_index is a number which represent the input values of user
user_course_index

6

In [34]:
# Here we are comparing all the courses with the user_course_index and storing the similarity scores in a list
similarity_scores = list(enumerate(cosine_similarities[user_course_index]))

In [35]:
similarity_scores

[(0, 0.006302404427245335),
 (1, 0.0),
 (2, 0.0),
 (3, 0.0),
 (4, 0.0),
 (5, 0.006920928095185508),
 (6, 1.0000000000000002),
 (7, 0.443935746793803),
 (8, 0.4052676999602514),
 (9, 0.44087448513257965),
 (10, 0.44087448513257965),
 (11, 0.499274462533674),
 (12, 0.009204170768009896),
 (13, 0.006885111222173722),
 (14, 0.006874701160410617),
 (15, 0.0068482524902774575),
 (16, 0.0),
 (17, 0.0),
 (18, 0.008826860667847682),
 (19, 0.0),
 (20, 0.009868498679848212),
 (21, 0.0),
 (22, 0.0),
 (23, 0.008028017492727938),
 (24, 0.0),
 (25, 0.006615659971101126),
 (26, 0.0),
 (27, 0.0),
 (28, 0.0),
 (29, 0.0),
 (30, 0.0),
 (31, 0.0),
 (32, 0.009779310129137247),
 (33, 0.011211631721732875),
 (34, 0.008428483050259557),
 (35, 0.007855967528712694),
 (36, 0.006437170702164567),
 (37, 0.0),
 (38, 0.0),
 (39, 0.0),
 (40, 0.008826503468695283),
 (41, 0.010430579146151774),
 (42, 0.007573127361478521),
 (43, 0.007713936257545344),
 (44, 0.007575629108925544),
 (45, 0.01281158549138861),
 (46, 0.0),

In [44]:
# Combine similarity scores with course ratings
weighted_scores = [(i, score * df['course_rating'][i]) for i, score in similarity_scores]

In [45]:
# Here we are sorting the weighted_scores list in descending order
similar_courses = sorted(weighted_scores, key=lambda x: x[1], reverse=True)


In [39]:
# This is the list of similar courses
similar_courses

[(6, 1.0000000000000002),
 (11, 0.499274462533674),
 (7, 0.443935746793803),
 (9, 0.44087448513257965),
 (10, 0.44087448513257965),
 (8, 0.4052676999602514),
 (211, 0.37273618393626995),
 (787, 0.35762337462471805),
 (723, 0.34230884531052475),
 (788, 0.31533321010651305),
 (162, 0.313696092887491),
 (763, 0.30208699108410436),
 (626, 0.2937369919188723),
 (163, 0.28543101362350415),
 (621, 0.2773987981731122),
 (520, 0.22675714062069036),
 (427, 0.21293544040195947),
 (687, 0.17754523827573485),
 (418, 0.1659334374021575),
 (469, 0.1607849717586794),
 (416, 0.15508957009077518),
 (417, 0.13074727811609355),
 (864, 0.014723355542559049),
 (267, 0.014391027389764569),
 (764, 0.013227964954397096),
 (753, 0.013061118654716236),
 (109, 0.013053590059950191),
 (45, 0.01281158549138861),
 (486, 0.01262801956094225),
 (704, 0.012619636341407416),
 (113, 0.012371704821404228),
 (510, 0.012365706766951949),
 (117, 0.012122801857955496),
 (196, 0.01204774756949154),
 (567, 0.01191272218485649),

In [53]:
# Printing the top 5 similar courses
print("Recommended Courses:")
for i, score in similar_courses[1:6]:
    print(f"{df['course_title'][i]} (Organization: {df['course_organization'][i]}) - Similarity Score: {score:.2f}")
    print(df.iloc[i])
    print("-------------------------------------------")


Recommended Courses:
AI for Medicine (Organization: deeplearning.ai) - Similarity Score: 2.35
course_code                                                           35
course_title                                             AI for Medicine
course_organization                                      deeplearning.ai
course_Certificate_type                                   SPECIALIZATION
course_rating                                                        4.7
course_difficulty                                           Intermediate
course_students_enrolled                                             13k
features                    Intermediate AI for Medicine deeplearning.ai
Name: 11, dtype: object
-------------------------------------------
AI For Medical Treatment (Organization: deeplearning.ai) - Similarity Score: 2.13
course_code                                                               488
course_title                                         AI For Medical Treatment
course_organizat