## **RECOMMENDER SYSTEM**

**Importing Libraries**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from operator import itemgetter

**Fetching Dataset**

In [2]:
data=pd.read_csv("/content/UCoursera_Courses.csv")
data.drop(["Unnamed: 0"],axis=1,inplace=True) 

**Cleaning the dataset**

In [3]:
#removing stopwords such as "of, is, the, etc."
nltk.download('stopwords')
stopwords=nltk.corpus.stopwords.words('english')
def clean_text(txt):
    txt="".join([c for c in txt if c not in string.punctuation])
    tokens=re.split('\W+',txt)
    txt=[word for word in tokens if word not in stopwords]
    return txt

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
#breaking word into its normal form
nltk.download('wordnet')
wn=nltk.WordNetLemmatizer()
def lemm(token_txt):
    text=[wn.lemmatize(word) for word in token_txt]
    return text

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**Calling upper functions for the columns of the dataset**

In [5]:
# data['course_title_l']= data['course_title'].apply(lambda x: remove(x))
# data['course_organization_l']= data['course_organization'].apply(lambda x: remove(x))
data['course_title_l']=data['course_title'].apply(lambda x:clean_text(x))
data['course_organization_l']= data['course_organization'].apply(lambda x: clean_text(x))
data['course_title_l']=data['course_title_l'].apply(lambda x: lemm(x))
data['course_organization_l']=data['course_organization_l'].apply(lambda x: lemm(x))

In [6]:
data['course_Certificate_type']=data['course_Certificate_type'].mask(data['course_Certificate_type']=='PROFESSIONAL CERTIFICATE','PROFESSIONAL_CERTIFICATE')

In [7]:
data.head()

Unnamed: 0,course_title,course_organization,course_Certificate_type,course_rating,course_difficulty,course_students_enrolled,course_title_l,course_organization_l
0,(ISC)² Systems Security Certified Practitioner...,(ISC)²,SPECIALIZATION,4.7,Beginner,5.3k,"[ISC², Systems, Security, Certified, Practitio...",[ISC²]
1,A Crash Course in Causality: Inferring Causal...,University of Pennsylvania,COURSE,4.7,Intermediate,17k,"[A, Crash, Course, Causality, Inferring, Causa...","[University, Pennsylvania]"
2,A Crash Course in Data Science,Johns Hopkins University,COURSE,4.5,Mixed,130k,"[A, Crash, Course, Data, Science]","[Johns, Hopkins, University]"
3,A Law Student's Toolkit,Yale University,COURSE,4.7,Mixed,91k,"[A, Law, Students, Toolkit]","[Yale, University]"
4,A Life of Happiness and Fulfillment,Indian School of Business,COURSE,4.8,Mixed,320k,"[A, Life, Happiness, Fulfillment]","[Indian, School, Business]"


**Creating a soup which contains all necessary information to recommend**

In [8]:
def create_soup(x):
    return ' '.join(x['course_title_l']) + ' ' + ' '.join(x['course_organization_l']) + ' ' + x['course_Certificate_type'] + ' ' + x['course_difficulty']

data['soup'] = data.apply(create_soup, axis=1)

**Asking user input**

In [9]:
def get_course():
    course = input("What Course are you interested in?")
    courses = " ".join(["".join(n.split()) for n in course.lower().split(',')])
    return course

def get_searchTerms():
    searchTerms = "" 
    courses = get_course()
    if courses != 'skip':
        searchTerms = courses
    return searchTerms

In [10]:
def make_recommendation(data=data):
    #grabbing the new wordsoup from the user
    searchTerms = get_searchTerms()
    # searchTerms = remove(searchTerms)
    searchTerms = clean_text(searchTerms)
    searchTerms = lemm(searchTerms)

    #creating a copy of the last row of the dataset, which we will use to input the user's input
    new_row = data.iloc[-1,:].copy() 
    new_row.iloc[-1] = " ".join(searchTerms)#adding the input to our new row
    data = data.append(new_row) #adding the new row to the dataset
    
    #Vectorizing the entire matrix
    count = CountVectorizer(stop_words='english')
    count_matrix = count.fit_transform(data['soup'])  

    #running pairwise cosine similarity 
    cosine_sim2 = cosine_similarity(count_matrix, count_matrix) #getting a similarity matrix
    
    #sorting cosine similarities by highest to lowest
    sim_scores = list(enumerate(cosine_sim2[-1,:]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    print(sim_scores)

    #matching the similarities to the course titles
    ranked_titles = []
    for i in range(1, 5):
        indx = sim_scores[i][0]
        if (sim_scores[i][1] != 0):
            ranked_titles.append([data['course_title'].iloc[indx], data['course_rating'].iloc[indx], data['course_students_enrolled'].iloc[indx],data['course_Certificate_type'].iloc[indx]])
    return ranked_titles

In [11]:
make_recommendation()

What Course are you interested in?data science
[(891, 0.9999999999999998), (864, 0.6324555320336758), (54, 0.5773502691896258), (198, 0.5773502691896258), (486, 0.5773502691896258), (825, 0.5773502691896258), (55, 0.5345224838248487), (196, 0.5345224838248487), (208, 0.5345224838248487), (661, 0.5345224838248487), (687, 0.5345224838248487), (56, 0.4999999999999999), (197, 0.4999999999999999), (299, 0.4999999999999999), (368, 0.4999999999999999), (420, 0.4999999999999999), (487, 0.4999999999999999), (711, 0.4999999999999999), (199, 0.4714045207910316), (200, 0.4472135954999579), (553, 0.4472135954999579), (671, 0.4472135954999579), (2, 0.42640143271122083), (27, 0.42640143271122083), (465, 0.42640143271122083), (594, 0.42640143271122083), (811, 0.42640143271122083), (130, 0.408248290463863), (612, 0.408248290463863), (193, 0.39223227027636803), (188, 0.2886751345948129), (203, 0.2886751345948129), (788, 0.2886751345948129), (815, 0.2886751345948129), (186, 0.26726124191242434), (195, 0.

[['What is Data Science?', 4.7, '260k', 'COURSE'],
 ['Applied Data Science', 4.6, '220k', 'SPECIALIZATION'],
 ['Data Science Methodology', 4.6, '89k', 'COURSE'],
 ['Introduction to Data Science', 4.6, '310k', 'SPECIALIZATION']]