In [2]:
# Importing libraries
# Pandas for data manipulation
# TfidfVectorizer to convert text to vector
# linear_kernel to compute the dot product of two vectors
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [3]:
# importing the dataset 
# Path : data/data.csv
df = pd.read_csv('../data/data.csv')

In [4]:
#  Data Visualization
df.head()

Unnamed: 0,course_code,course_title,course_organization,course_Certificate_type,course_rating,course_difficulty,course_students_enrolled
0,134,(ISC)² Systems Security Certified Practitioner...,(ISC)²,SPECIALIZATION,4.7,Beginner,5.3k
1,743,A Crash Course in Causality: Inferring Causal...,University of Pennsylvania,COURSE,4.7,Intermediate,17k
2,874,A Crash Course in Data Science,Johns Hopkins University,COURSE,4.5,Mixed,130k
3,413,A Law Student's Toolkit,Yale University,COURSE,4.7,Mixed,91k
4,635,A Life of Happiness and Fulfillment,Indian School of Business,COURSE,4.8,Mixed,320k


In [5]:

# highest value of course_code and lowest value of course_code
df['course_code'].max(), df['course_code'].min()

(890, 0)

In [6]:
# Types of course_certifications
df['course_Certificate_type'].unique()

array(['SPECIALIZATION', 'COURSE', 'PROFESSIONAL CERTIFICATE'],
      dtype=object)

In [7]:
# Types of course_difficulty
df['course_difficulty'].unique()

array(['Beginner', 'Intermediate', 'Mixed', 'Advanced'], dtype=object)

In [8]:
# Min max course rating
df['course_rating'].max(), df['course_rating'].min()

(5.0, 3.3)

Here there are 7 Headers ( course_id, course_title,course_organization,course_Certificate_type,course_rating,course_difficulty,course_students_enrolled)
- course Code starts from `0` and ends at `890`.
- course_title is the name of the course.
- course_organization is the organization which is providing the course.
- course_Certificate_type is the type of certificate provided by the course. `Specialization | Course | Professional Certificate`
- course_rating is the rating of the course. `0.0 - 5.0`
- course_difficulty is the difficulty level of the course. `Beginner | Intermediate | Mixed | Advanced`
- course_students_enrolled is the number of students enrolled for the course. 



In [9]:
# Now we will use TF-IDF vectorizer to convert text to vector
# TF-IDF is an abbreviation for Term Frequency Inverse Document Frequency.
# This is very common algorithm to transform text into a meaningful representation of numbers which is used to fit machine algorithm for prediction.

# Removing all english stop words such as 'the', 'a'
tf = TfidfVectorizer(analyzer="word", ngram_range=(1, 2), min_df=0, stop_words="english")
tf

In [10]:
# Combine relevant features into a single text column
df['features'] = df['course_difficulty'] + ' ' + df['course_title'] + ' ' + df['course_organization']
df['features']

0      Beginner (ISC)² Systems Security Certified Pra...
1      Intermediate A Crash Course in Causality:  Inf...
2      Mixed A Crash Course in Data Science Johns Hop...
3          Mixed A Law Student's Toolkit Yale University
4      Mixed A Life of Happiness and Fulfillment Indi...
                             ...                        
886    Intermediate Программирование на Python Mail.R...
887    Mixed Психолингвистика (Psycholinguistics) Sai...
888    Intermediate Разработка интерфейсов: вёрстка и...
889    Intermediate Русский как иностранный Saint Pet...
890    Beginner Финансовые инструменты для частного и...
Name: features, Length: 891, dtype: object

In [11]:
# Now we will fit and transform the data using TF-IDF vectorized data
tfidf_matrix = tf.fit_transform(df["features"])
print("Shape :", tfidf_matrix.shape)
tfidf_matrix

Shape : (891, 5145)


<891x5145 sparse matrix of type '<class 'numpy.float64'>'
	with 12019 stored elements in Compressed Sparse Row format>

In [12]:
# Computing the cosine similarity on Tfidf matrix
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [13]:
cosine_similarities

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00331788],
       [0.        , 1.        , 0.22706319, ..., 0.00912975, 0.01291189,
        0.00250773],
       [0.        , 0.22706319, 1.        , ..., 0.        , 0.00459503,
        0.00360564],
       ...,
       [0.        , 0.00912975, 0.        , ..., 1.        , 0.01130791,
        0.        ],
       [0.        , 0.01291189, 0.00459503, ..., 0.01130791, 1.        ,
        0.00310601],
       [0.00331788, 0.00250773, 0.00360564, ..., 0.        , 0.00310601,
        1.        ]])

In [23]:
# creating a mapping of course title to index number
# here we will use course_difficulty and features to find the specific difficulty and subject
user_difficulty = input("Enter the difficulty of course ( 'Beginner', 'Intermediate', 'Mixed', 'Advanced' )  : ")
user_subject = input("Enter the subject of course : ")
user_course_index = df.index[(df['course_difficulty'] == user_difficulty) & (df['features'].str.contains(user_subject))].tolist()


In [26]:
# Here the user_course_index is a number which represent the input values of user
user_course_index

[59,
 171,
 188,
 391,
 466,
 513,
 530,
 532,
 682,
 685,
 687,
 688,
 752,
 841,
 851,
 880,
 883]

In [27]:
# Here we are comparing all the courses with the user course index and average the similarity scores for each course for each user course index

similarity_scores = []
for i in user_course_index:
    similarity_scores.append(list(enumerate(cosine_similarities[i]))) # Append a list of tuples to similarity_scores

# Now similarity_scores is a list of lists of tuples
# Each inner list contains tuples of the form (index, similarity_score)
# We want to average the similarity scores for each course 

sums = {}
counts = {}

for inner_list in similarity_scores:
    for tup in inner_list:
        index, score = tup
        sums[index] = sums.get(index, 0) + score
        counts[index] = counts.get(index, 0) + 1

avg_similarity_scores = [(index, sums[index] / counts[index]) for index in sums]


In [28]:
avg_similarity_scores

[(0, 0.0050421292612440795),
 (1, 0.01998041314153795),
 (2, 0.03503427883323451),
 (3, 0.002483290467136594),
 (4, 0.004858819177882467),
 (5, 0.00752699297814028),
 (6, 0.01865434902355419),
 (7, 0.007525124704703345),
 (8, 0.025340057587838777),
 (9, 0.0074732334661157624),
 (10, 0.0074732334661157624),
 (11, 0.008463167518214555),
 (12, 0.007363637051637837),
 (13, 0.0055083137175656735),
 (14, 0.005499985328355962),
 (15, 0.005478825528927208),
 (16, 0.0),
 (17, 0.0024401970594029026),
 (18, 0.009599827848541066),
 (19, 0.0045949489382077335),
 (20, 0.007895121065716763),
 (21, 0.003216836161266541),
 (22, 0.03349337123746533),
 (23, 0.008731030067800589),
 (24, 0.004310897533255047),
 (25, 0.007194992559290151),
 (26, 0.004477688957139217),
 (27, 0.029568075727082473),
 (28, 0.039024468121329516),
 (29, 0.02352013300612893),
 (30, 0.0016583146482485756),
 (31, 0.002906567114537607),
 (32, 0.010635683200390073),
 (33, 0.01512834938678647),
 (34, 0.009166564348473781),
 (35, 0.0062

In [29]:
# Combine similarity scores with course ratings
weighted_scores = [(i, score * df['course_rating'][i]) for i, score in avg_similarity_scores]

In [30]:
# Here we are sorting the weighted_scores list in descending order
similar_courses = sorted(weighted_scores, key=lambda x: x[1], reverse=True)


In [31]:
# This is the list of similar courses
similar_courses

[(688, 0.887916018587727),
 (682, 0.8879145692420612),
 (752, 0.7809483732252473),
 (685, 0.737107395841175),
 (841, 0.7233359673039861),
 (687, 0.710510346961317),
 (171, 0.6567226555889893),
 (188, 0.6521411095315659),
 (391, 0.6453756479696477),
 (466, 0.5803178600333891),
 (487, 0.5777564276256486),
 (513, 0.5658359467686354),
 (851, 0.5556400108891646),
 (684, 0.5394587629548826),
 (56, 0.5391221894398673),
 (57, 0.5164845759764648),
 (849, 0.5147712863599899),
 (59, 0.5083028056888904),
 (532, 0.5029535897907124),
 (880, 0.4907980115132762),
 (190, 0.4752166209746631),
 (883, 0.47166508177143773),
 (570, 0.4674306584850194),
 (530, 0.46160457578485903),
 (128, 0.4530186250455463),
 (674, 0.4417440589445417),
 (850, 0.4380796385535445),
 (203, 0.4123648950494597),
 (683, 0.3948602274315415),
 (58, 0.3715576614997376),
 (508, 0.3438738129792982),
 (45, 0.2714628509852443),
 (864, 0.2688593150050461),
 (858, 0.24636169860360288),
 (686, 0.23467763583624016),
 (486, 0.227037943935981

In [32]:
# Printing the top 5 similar courses
print("Recommended Courses:")
for i, score in similar_courses[1:6]:
    print(f"{df['course_title'][i]} (Organization: {df['course_organization'][i]}) - Similarity Score: {score:.2f}")
    print(df.iloc[i])
    print("-------------------------------------------")


Recommended Courses:
Python Basics (Organization: University of Michigan) - Similarity Score: 0.89
course_code                                                           279
course_title                                                Python Basics
course_organization                                University of Michigan
course_Certificate_type                                            COURSE
course_rating                                                         4.8
course_difficulty                                                Beginner
course_students_enrolled                                             110k
features                    Beginner Python Basics University of Michigan
Name: 682, dtype: object
-------------------------------------------
Statistics with Python (Organization: University of Michigan) - Similarity Score: 0.78
course_code                                                               127
course_title                                           Statistics with Pyth