In [31]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


#helper function:

def get_title_from_index(index):
	return df[df.index == index]["course_title"].values[0]

def get_index_from_title(title):
	return df[df.course_title == title]["index"].values[0]

##################################################

In [69]:

##Step 1: Read CSV File
df = pd.read_csv("course_data.csv")
print(df.columns)
df.reset_index(inplace = True)

df.drop("Unnamed: 0",axis=1,inplace=True)
##Step 2: Select Features
df.describe()
df.info()
df.head()

Index(['Unnamed: 0', 'course_title', 'course_organization',
       'course_Certificate_type', 'course_rating', 'course_difficulty',
       'course_students_enrolled'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   index                     891 non-null    int64  
 1   course_title              891 non-null    object 
 2   course_organization       891 non-null    object 
 3   course_Certificate_type   891 non-null    object 
 4   course_rating             891 non-null    float64
 5   course_difficulty         891 non-null    object 
 6   course_students_enrolled  891 non-null    object 
dtypes: float64(1), int64(1), object(5)
memory usage: 48.9+ KB


Unnamed: 0,index,course_title,course_organization,course_Certificate_type,course_rating,course_difficulty,course_students_enrolled
0,0,(ISC)² Systems Security Certified Practitioner...,(ISC)²,SPECIALIZATION,4.7,Beginner,5.3k
1,1,A Crash Course in Causality: Inferring Causal...,University of Pennsylvania,COURSE,4.7,Intermediate,17k
2,2,A Crash Course in Data Science,Johns Hopkins University,COURSE,4.5,Mixed,130k
3,3,A Law Student's Toolkit,Yale University,COURSE,4.7,Mixed,91k
4,4,A Life of Happiness and Fulfillment,Indian School of Business,COURSE,4.8,Mixed,320k


In [70]:
features = ['course_title','course_organization','course_Certificate_type','course_difficulty']

#Step 3: Create a column in DF which combines all selected features

#for feature in features:
#	df[feature] = df[feature].fillna('')

def combine_features(row):
	try:
		return row['course_title'] +" "+row['course_organization']+" "+row["course_Certificate_type"]+" "+row["course_difficulty"]
	except:
		print("Error:", row)    

df["combined_features"] = df.apply(combine_features,axis=1)
df.columns
print( df["combined_features"].head())

0    (ISC)² Systems Security Certified Practitioner...
1    A Crash Course in Causality:  Inferring Causal...
2    A Crash Course in Data Science Johns Hopkins U...
3    A Law Student's Toolkit Yale University COURSE...
4    A Life of Happiness and Fulfillment Indian Sch...
Name: combined_features, dtype: object


In [82]:
##Step 4: Create count matrix from this new combined column
cv = CountVectorizer()

count_matrix = cv.fit_transform(df["combined_features"])
#print(count_matrix)
##Step 5: Compute the Cosine Similarity based on the count_matrix
cosine_sim = cosine_similarity(count_matrix) 
#print(cosine_sim)


course_user_likes = "Data Science Methodology"

## Step 6: Get index of this course from its title
course_index = get_index_from_title(course_user_likes)

similar_courses =  list(enumerate(cosine_sim[course_index]))
print(similar_courses)

[(0, 0.12309149097933275), (1, 0.29704426289300234), (2, 0.4714045207910318), (3, 0.1543033499620919), (4, 0.11322770341445959), (5, 0.2461829819586655), (6, 0.2721655269759087), (7, 0.12909944487358058), (8, 0.3086066999241838), (9, 0.12909944487358058), (10, 0.12909944487358058), (11, 0.0), (12, 0.1543033499620919), (13, 0.25819888974716115), (14, 0.25819888974716115), (15, 0.25819888974716115), (16, 0.12309149097933275), (17, 0.13608276348795434), (18, 0.13608276348795434), (19, 0.11785113019775795), (20, 0.36514837167011077), (21, 0.1543033499620919), (22, 0.13608276348795434), (23, 0.12909944487358058), (24, 0.09901475429766744), (25, 0.2461829819586655), (26, 0.0), (27, 0.4714045207910318), (28, 0.0), (29, 0.0), (30, 0.09622504486493764), (31, 0.1543033499620919), (32, 0.3086066999241838), (33, 0.1543033499620919), (34, 0.2721655269759087), (35, 0.2721655269759087), (36, 0.2721655269759087), (37, 0.13608276348795434), (38, 0.0), (39, 0.0), (40, 0.3086066999241838), (41, 0.2581988

In [84]:
sorted_similar_courses = sorted(similar_courses,key=lambda x:x[1],reverse=True)
## Step 8: Print titles of first 10 Courses
i=0
for element in sorted_similar_courses[:11]:
		print(get_title_from_index(element[0]))       

Data Science Methodology
Tools for Data Science
What is Data Science?
Databases and SQL for Data Science
IBM Data Science
Python for Data Science and AI
Applied Data Science
Applied Data Science Capstone
Data Analysis with Python
Introduction to Data Science
Data Science Math Skills
