In [None]:
!pip install rake-nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rake-nltk
  Downloading rake_nltk-1.0.6-py3-none-any.whl (9.1 kB)
Installing collected packages: rake-nltk
Successfully installed rake-nltk-1.0.6


In [None]:
import pandas as pd
import numpy as np
from rake_nltk import Rake
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Read the edX course CSV into `df` and preview its first five rows
df = pd.read_csv(filepath_or_buffer='/content/EdX.csv')
df.head(n=5)

Unnamed: 0,Name,University,Difficulty Level,Link,About,Course Description
0,How to Learn Online,edX,Beginner,https://www.edx.org/course/how-to-learn-online,Learn essential strategies for successful onli...,"Designed for those who are new to elearning, t..."
1,Programming for Everybody (Getting Started wit...,The University of Michigan,Beginner,https://www.edx.org/course/programming-for-eve...,"This course is a ""no prerequisite"" introductio...",This course aims to teach everyone the basics ...
2,CS50's Introduction to Computer Science,Harvard University,Beginner,https://www.edx.org/course/cs50s-introduction-...,An introduction to the intellectual enterprise...,"This is CS50x , Harvard University's introduct..."
3,The Analytics Edge,Massachusetts Institute of Technology,Intermediate,https://www.edx.org/course/the-analytics-edge,"Through inspiring examples and stories, discov...","In the last decade, the amount of data availab..."
4,Marketing Analytics: Marketing Measurement Str...,"University of California, Berkeley",Beginner,https://www.edx.org/course/marketing-analytics...,This course is part of a MicroMasters® Program,Begin your journey in a new career in marketin...


In [None]:
# data overview: frame dimensions, column names and number of distinct values per column
print('Rows x Columns : ', df.shape[0], 'x', df.shape[1])
print('Features: ', df.columns.tolist())
# '\n' starts the heading on a fresh line; the missing backslash used to print a literal "n"
print('\nUnique values:')
print(df.nunique())

Rows x Columns :  720 x 6
Features:  ['Name', 'University', 'Difficulty Level', 'Link', 'About', 'Course Description']
nUnique values:
Name                  717
University            102
Difficulty Level        3
Link                  719
About                 698
Course Description    717
dtype: int64


In [None]:
# type of entries, how many missing values/null fields
df.info()
# '\n' separates the total from the info() report; the missing backslash used to print a literal "n"
print('\nMissing values:  ', df.isnull().sum().values.sum())
# last expression: per-column count of null entries
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Name                720 non-null    object
 1   University          720 non-null    object
 2   Difficulty Level    720 non-null    object
 3   Link                720 non-null    object
 4   About               720 non-null    object
 5   Course Description  720 non-null    object
dtypes: object(6)
memory usage: 33.9+ KB
nMissing values:   0


Name                  0
University            0
Difficulty Level      0
Link                  0
About                 0
Course Description    0
dtype: int64

In [None]:
# Delete every character listed in string.punctuation from the Course Description text
import string
df['Course Description'] = df['Course Description'].str.translate(
    # a translation table that maps a character to None removes that character
    str.maketrans({symbol: None for symbol in string.punctuation})
)

In [None]:
import nltk

# Fetch the NLTK resources needed further down: the stop-word corpus and the
# punkt tokenizer models.
# NOTE(review): newer NLTK releases may expect 'punkt_tab' instead of 'punkt' — confirm
# against the installed NLTK version.
nltk.download(info_or_id='stopwords')
nltk.download(info_or_id='punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# to extract key words from Course Description into a list per course
r = Rake()   # a single Rake instance is reused for every description


def _extract_key_words(text):
    """Return the list of words RAKE scored for ``text``."""
    r.extract_keywords_from_text(text)
    # get_word_degrees() returns a dict of word -> degree; only the words (keys) are kept
    return list(r.get_word_degrees())


# Assign the whole column in one step. Writing to the `row` objects yielded by
# df.iterrows() is not guaranteed to reach the DataFrame (pandas may hand out
# copies), so a row-by-row assignment can silently leave Key_words empty.
df['Key_words'] = df['Course Description'].apply(_extract_key_words)

# preview only the first rows instead of dumping the whole frame
df.head()

Unnamed: 0,Name,University,Difficulty Level,Link,About,Course Description,Key_words
0,How to Learn Online,edX,Beginner,https://www.edx.org/course/how-to-learn-online,Learn essential strategies for successful onli...,Designed for those who are new to elearning th...,"[designed, new, elearning, course, prepare, st..."
1,Programming for Everybody (Getting Started wit...,The University of Michigan,Beginner,https://www.edx.org/course/programming-for-eve...,"This course is a ""no prerequisite"" introductio...",This course aims to teach everyone the basics ...,"[course, aims, teach, everyone, basics, progra..."
2,CS50's Introduction to Computer Science,Harvard University,Beginner,https://www.edx.org/course/cs50s-introduction-...,An introduction to the intellectual enterprise...,This is CS50x Harvard Universitys introductio...,"[cs50x, harvard, universitys, introduction, in..."
3,The Analytics Edge,Massachusetts Institute of Technology,Intermediate,https://www.edx.org/course/the-analytics-edge,"Through inspiring examples and stories, discov...",In the last decade the amount of data availabl...,"[last, decade, amount, data, available, organi..."
4,Marketing Analytics: Marketing Measurement Str...,"University of California, Berkeley",Beginner,https://www.edx.org/course/marketing-analytics...,This course is part of a MicroMasters® Program,Begin your journey in a new career in marketin...,"[begin, journey, new, career, marketing, analy..."
...,...,...,...,...,...,...,...
715,Global China: From the Mongols to the Ming,Harvard University,Beginner,https://www.edx.org/course/global-china-from-t...,Explore the impact of the conquest dynasties a...,In the 13th century by force of arms the Mongo...,"[13th, century, force, arms, mongols, created,..."
716,Leaders in Citizen Security and Justice Manage...,Inter-American Development Bank,Intermediate,https://www.edx.org/course/leaders-in-citizen-...,"Learn about the latest in prevention, police a...",The high rates of crime and violence are two o...,"[high, rates, crime, violence, two, main, chal..."
717,Computational Neuroscience: Neuronal Dynamics ...,École polytechnique fédérale de Lausanne,Advanced,https://www.edx.org/course/computational-neuro...,This course explains the mathematical and comp...,What happens in your brain when you make a dec...,"[happens, brain, make, decision, recall, memor..."
718,Cities and the Challenge of Sustainable Develo...,SDG Academy,Beginner,https://www.edx.org/course/cities-and-the-chal...,What is a sustainable city? Learn the basics h...,According to the United Nations urbanization a...,"[according, united, nations, urbanization, pop..."


In [None]:
# to combine the key-word list column(s) into one space-separated sentence per
# course under the Bag_of_words column
columns = ['Key_words']

# Build the column in a single assignment. Writing to the `row` objects yielded by
# df.iterrows() is not guaranteed to update the DataFrame (pandas may return copies).
df['Bag_of_words'] = [
    ' '.join(' '.join(words) for words in word_lists)
    for word_lists in zip(*(df[col] for col in columns))
]

# collapse any run of whitespace to one space, then strip leading/trailing spaces
df['Bag_of_words'] = (
    df['Bag_of_words']
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

# two-column view (course name + text) that is pickled further down
df1 = df[['Name','Bag_of_words']]

In [None]:
# Preview the name / bag-of-words frame; head() keeps the output short instead of
# rendering every row
df1.head()

Unnamed: 0,Name,Bag_of_words
0,How to Learn Online,designed new elearning course prepare strategi...
1,Programming for Everybody (Getting Started wit...,course aims teach everyone basics programming ...
2,CS50's Introduction to Computer Science,cs50x harvard universitys introduction intelle...
3,The Analytics Edge,last decade amount data available organization...
4,Marketing Analytics: Marketing Measurement Str...,begin journey new career marketing analytics l...
...,...,...
715,Global China: From the Mongols to the Ming,13th century force arms mongols created greate...
716,Leaders in Citizen Security and Justice Manage...,high rates crime violence two main challenges ...
717,Computational Neuroscience: Neuronal Dynamics ...,happens brain make decision recall memory last...
718,Cities and the Challenge of Sustainable Develo...,according united nations urbanization populati...


In [None]:
# Vectorise the bag-of-words text into a term-count matrix, then compute the
# cosine similarity between every pair of rows
count = CountVectorizer()
count_matrix = count.fit_transform(df['Bag_of_words'])
# with the second argument omitted, the rows of count_matrix are compared with each other
cosine_sim = cosine_similarity(count_matrix)
print(cosine_sim)

[[1.         0.05063697 0.08890009 ... 0.08377078 0.0451754  0.06282809]
 [0.05063697 1.         0.11575623 ... 0.08483797 0.02287545 0.06362848]
 [0.08890009 0.11575623 1.         ... 0.0638334  0.02294912 0.0638334 ]
 ...
 [0.08377078 0.08483797 0.0638334  ... 1.         0.03784378 0.05263158]
 [0.0451754  0.02287545 0.02294912 ... 0.03784378 1.         0.01892189]
 [0.06282809 0.06362848 0.0638334  ... 0.05263158 0.01892189 1.        ]]


In [None]:
# Course names as a Series; recommend() searches it to find the row of a given course
indices = pd.Series(data=df['Name'])

In [None]:
# this function takes in a course name as input and returns the top recommended (similar) courses

def recommend(name, cosine_sim = cosine_sim, top_n = 5):
    """Return the names of the courses most similar to the course called ``name``.

    Parameters
    ----------
    name : str
        Course name, matched exactly against ``indices``. When several rows
        carry the same name, the first match is used.
    cosine_sim : 2-D array, optional
        Pairwise similarity matrix; row ``i`` holds the scores of course ``i``
        against every course. Defaults to the module-level ``cosine_sim``.
        Assumes its row order matches the row order of ``df`` — TODO confirm
        if ``df`` is ever re-indexed or filtered.
    top_n : int, optional
        Number of recommendations to return (default 5).

    Returns
    -------
    list
        Up to ``top_n`` course names, most similar first.

    Raises
    ------
    IndexError
        If no course is called ``name``.
    """
    matches = indices[indices == name].index
    if len(matches) == 0:
        # same exception type as indexing an empty result, but with a readable message
        raise IndexError('course not found: {!r}'.format(name))
    idx = matches[0]   # row of the input course

    # similarity scores of every course against the input course, in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    # Remove the input course by its own row rather than assuming it is ranked
    # first: with tied scores (e.g. duplicated courses) it may not be at the top.
    score_series = score_series.drop(idx)
    top_indices = score_series.iloc[:top_n].index

    course_names = df['Name']   # looked up once instead of rebuilding a list per iteration
    return [course_names.iloc[i] for i in top_indices]


In [None]:
# Example query: courses most similar to "The Analytics Edge"
recommend(name='The Analytics Edge')

['The Data Science Method',
 'Analyzing Data with Python',
 'Data Science and Machine Learning Capstone Project',
 'Introduction to Probability',
 'Introduction to Probability']

In [None]:
import pickle

In [None]:
# Persist the name / bag-of-words frame. The context manager closes the file
# handle (flushing the data to disk) even if dump() raises.
with open('course_df.pkl', 'wb') as pkl_file:
    pickle.dump(df1, pkl_file)

In [None]:
# Persist the similarity matrix. The context manager closes the file handle
# (flushing the data to disk) even if dump() raises.
with open('cosine_sim.pkl', 'wb') as pkl_file:
    pickle.dump(cosine_sim, pkl_file)

In [None]:
# Read the Coursera course CSV into `df2`.
# NOTE(review): the rendered preview contains '�' replacement characters, so some
# text may have been mis-decoded upstream — confirm the CSV's encoding.
df2 = pd.read_csv('/content/Coursera.csv')
# preview only the first rows instead of dumping the whole frame
df2.head()

Unnamed: 0,Course Name,University,Difficulty Level,Course Rating,Course URL,Course Description,Skills
0,Write A Feature Length Screenplay For Film Or ...,Michigan State University,Beginner,4.8,https://www.coursera.org/learn/write-a-feature...,Write a Full Length Feature Film Script In th...,Drama Comedy peering screenwriting film D...
1,Business Strategy: Business Model Canvas Analy...,Coursera Project Network,Beginner,4.8,https://www.coursera.org/learn/canvas-analysis...,"By the end of this guided project, you will be...",Finance business plan persona (user experien...
2,Silicon Thin Film Solar Cells,�cole Polytechnique,Advanced,4.1,https://www.coursera.org/learn/silicon-thin-fi...,This course consists of a general presentation...,chemistry physics Solar Energy film lambda...
3,Finance for Managers,IESE Business School,Intermediate,4.8,https://www.coursera.org/learn/operational-fin...,"When it comes to numbers, there is always more...",accounts receivable dupont analysis analysis...
4,Retrieve Data using Single-Table SQL Queries,Coursera Project Network,Beginner,4.6,https://www.coursera.org/learn/single-table-sq...,In this course you�ll learn how to effectively...,Data Analysis select (sql) database manageme...
...,...,...,...,...,...,...,...
3517,"Capstone: Retrieving, Processing, and Visualiz...",University of Michigan,Beginner,4.6,https://www.coursera.org/learn/python-data-vis...,"In the capstone, students will build a series ...",Databases syntax analysis web Data Visuali...
3518,Patrick Henry: Forgotten Founder,University of Virginia,Intermediate,4.9,https://www.coursera.org/learn/henry,"�Give me liberty, or give me death:� Rememberi...",retirement Causality career history of the ...
3519,Business intelligence and data analytics: Gene...,Macquarie University,Advanced,4.6,https://www.coursera.org/learn/business-intell...,�Megatrends� heavily influence today�s organis...,analytics tableau software Business Intellig...
3520,Rigid Body Dynamics,Korea Advanced Institute of Science and Techno...,Beginner,4.6,https://www.coursera.org/learn/rigid-body-dyna...,"This course teaches dynamics, one of the basic...",Angular Mechanical Design fluid mechanics F...


In [None]:
# Read the Udacity course CSV into `df3`
df3 = pd.read_csv('/content/Udacity.csv')
# preview only the first rows instead of dumping the whole frame
df3.head()

Unnamed: 0,Name,School,Difficulty Level,Rating,Link,About
0,Data Engineer,School of Data Science,Intermediate,4.6,https://www.udacity.com//course/data-engineer-...,Data Engineering is the foundation for the new...
1,Data Scientist,School of Data Science,Advanced,4.7,https://www.udacity.com//course/data-scientist...,"Build effective machine learning models, run d..."
2,Data Analyst,School of Data Science,Intermediate,4.6,https://www.udacity.com//course/data-analyst-n...,"Use Python, SQL, and statistics to uncover ins..."
3,C++,School of Autonomous Systems,Intermediate,4.6,https://www.udacity.com//course/c-plus-plus-na...,Get hands-on experience by building five real-...
4,Product Manager,School of Product Management,Beginner,4.7,https://www.udacity.com//course/product-manage...,Envision and execute the development of indust...
...,...,...,...,...,...,...
258,Front-End Interview Prep,Career Advancement,Intermediate,,https://www.udacity.com//course/front-end-inte...,Answer front-end technical and behavioral inte...
259,Full-Stack Interview Prep,Career Advancement,Intermediate,,https://www.udacity.com//course/full-stack-int...,Answer common full stack and web security inte...
260,Data Structures & Algorithms in Swift,Career Advancement,Intermediate,,https://www.udacity.com//course/data-structure...,Review and practice the skills technical inter...
261,iOS Interview Prep,Career Advancement,Intermediate,,https://www.udacity.com//course/ios-interview-...,Answer iOS and mobile development interview qu...


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m61.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.1/82.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.3/184.3 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m85.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m164.8/164.8 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for validators (setup.py) ... [?25l[?25hdone


SyntaxError: ignored