In [None]:
import math
import random

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import scipy
import sklearn
from langdetect import DetectorFactory, LangDetectException, detect
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the crawled course metadata and preview its first five rows.
crawled_courses_information = pd.read_csv("/kaggle/input/crawled-courses-information/crawled_courses_information (1).csv")
crawled_courses_information.head(5)

In [None]:
# Load the Coursera review table and the Coursera course list.
courses_review= pd.read_csv('/kaggle/input/course-reviews-on-coursera/Coursera_reviews.csv')
all_courses = pd.read_csv('/kaggle/input/course-reviews-on-coursera/Coursera_courses.csv')

In [None]:
# Drop the course list's own 'name' column; the merged frame built later takes
# 'name' from crawled_courses_information instead.
# Reassigning (rather than inplace=True) and ignoring a missing column lets the
# cell be re-run without raising KeyError.
all_courses = all_courses.drop(columns='name', errors='ignore')
all_courses

In [None]:
# Remove the crawled popularity columns that are not used as content features.
# Reassigning (rather than inplace=True) and ignoring missing columns lets the
# cell be re-run without raising KeyError.
crawled_courses_information = crawled_courses_information.drop(
    columns=['rating', 'views', 'raters', 'enrollment'],
    errors='ignore',
)
crawled_courses_information

In [None]:
# Inner-join the course list with the crawled metadata on the course URL, then
# discard the right-hand key column 'url', which duplicates 'course_url'.
courses = all_courses.merge(
    crawled_courses_information,
    left_on='course_url',
    right_on='url',
).drop(columns='url')
courses

In [None]:
# Remove exact duplicate review rows before joining.
ratings = courses_review.drop_duplicates()
# No join keys are given, so pandas inner-joins on every column name the two
# frames have in common.
data = pd.merge(ratings, courses)

In [None]:
# Keep only the columns used downstream and give the course metadata readable
# names. The rename is chained (not inplace) so it is applied to a fresh frame
# instead of a column slice of the previous `data`, which avoids
# SettingWithCopyWarning here and in the assignments that follow.
data = data[['reviewers', 'course_id', 'name', 'category', 'description', 'instructors', 'rating']].rename(
    columns={
        'name': 'Course Name',
        'category': 'Course Category',
        'description': 'Course Description',
        'instructors': 'Course Instructors',
    }
)

In [None]:
# Ordered literal substitutions. Order matters: spaces become commas first, then
# doubled commas are collapsed, then the remaining punctuation is removed.
_NAME_REPLACEMENTS = [(' ', ','), (',,', ','), (':', '')]
_TEXT_REPLACEMENTS = [
    (' ', ','), (',,', ','), ('_', ''), (':', ''), ('(', ''), (')', ''), ('\n\n', ''),
]


def _apply_replacements(series, replacements):
    """Apply each (old, new) pair to a string Series, in order, as literal text.

    regex=False is passed explicitly because the default of Series.str.replace
    differs between pandas versions and characters such as '(' are regex
    metacharacters.
    """
    for old, new in replacements:
        series = series.str.replace(old, new, regex=False)
    return series


data['Course Name'] = _apply_replacements(data['Course Name'], _NAME_REPLACEMENTS)
for _column in ['Course Description', 'Course Category', 'Course Instructors']:
    data[_column] = _apply_replacements(data[_column], _TEXT_REPLACEMENTS)


In [None]:
# Concatenate name, category, description and instructors into one
# comma-separated string per row. A missing value in any of the four columns
# makes the whole result missing for that row.
data['tags'] = data['Course Name']+',' + data['Course Category'] +','+ data['Course Description'] + ','+data['Course Instructors']

In [None]:
from sklearn import preprocessing

# Reviewer names -> integer ids. A dedicated encoder is kept so the mapping can
# still be inverted later (reviewer_encoder.inverse_transform); the column is
# passed as a 1-D Series because LabelEncoder expects a 1-D array of labels.
reviewer_encoder = preprocessing.LabelEncoder()
data['reviewers'] = reviewer_encoder.fit_transform(data['reviewers'])

# Course names -> integer ids. `label_encoder` ends up fitted on the course
# names, exactly as before.
label_encoder = preprocessing.LabelEncoder()
data['Course Name'] = label_encoder.fit_transform(data['Course Name'])

In [None]:
# Label encoder for the crawled course names.

from sklearn import preprocessing 

# A fresh LabelEncoder instance; it replaces the one bound to `label_encoder` before.

label_encoder = preprocessing.LabelEncoder()

# NOTE(review): this encoder is fitted on crawled_courses_information['name'] only,
# independently of the encoder fitted on data['Course Name'] (which was cleaned and
# merged first). The integer codes in the two columns presumably do not identify
# the same courses -- confirm before comparing or joining on them.
crawled_courses_information['name'] = label_encoder.fit_transform(crawled_courses_information['name'])

In [None]:
# Work on an explicit copy so the assignments below never write into a slice of `data`.
new_df = (
    data[['reviewers', 'Course Name', 'tags', 'rating']]
    .copy()
    .rename(columns={'Course Name': 'course_name'})
)

# 'Course Name' has been label-encoded to integers by this point, and the .str
# accessor raises AttributeError on non-string columns, so commas are only
# replaced when the column still holds text.
if pd.api.types.is_string_dtype(new_df['course_name']):
    new_df['course_name'] = new_df['course_name'].str.replace(',', ' ', regex=False)

# Turn the comma-separated tags into lower-cased, space-separated text.
new_df['tags'] = new_df['tags'].str.replace(',', ' ', regex=False).astype(str).str.lower()

In [None]:
# langdetect is randomised by default; fixing the seed makes the detected
# language (and therefore the rows kept by the English filter) reproducible.
DetectorFactory.seed = 0


def _detect_language(text):
    """Return the langdetect language code for `text`, or 'unknown' if detection fails."""
    try:
        return detect(text)
    except LangDetectException:
        # Raised when the text offers no usable features (e.g. empty, digits or
        # punctuation only); such rows must not abort the whole apply().
        return 'unknown'


new_df['detect'] = new_df['tags'].apply(_detect_language)

In [None]:
# Keep only rows whose tags were detected as English. The copy makes the result
# an independent frame, so later column assignments on new_df do not write into
# a filtered slice (SettingWithCopyWarning).
new_df = new_df[new_df['detect'] == 'en'].copy()

In [None]:
# Preview (without storing) the third row with everything from the first '<'
# to the end of the line removed. The pattern is a raw string with regex=True:
# '\<' is an invalid escape in a normal string literal, and without regex=True
# recent pandas versions treat the pattern as literal text and remove nothing.
new_df['tags'].str.replace(r'<.*$', '', regex=True).iloc[2]

In [None]:
# Collapse to distinct (reviewer, course) pairs, then count the pairs per reviewer.
reviewer_course_pairs = new_df.groupby(['reviewers', 'course_name']).size()
users_reviews_count_df = reviewer_course_pairs.groupby('reviewers').size()
print('# users: %d' % len(users_reviews_count_df))

# Keep only the ids of reviewers who reviewed at least 20 distinct courses.
has_enough_reviews = users_reviews_count_df >= 20
users_with_enough_reviews_df = users_reviews_count_df[has_enough_reviews].reset_index()[['reviewers']]
print('# users with at least 20 interactions: %d' % len(users_with_enough_reviews_df))

In [None]:
print('# of reviews: %d' % len(new_df))
# Restrict the reviews to the reviewers selected above (one key column on both sides).
interactions_from_selected_users_df = new_df.merge(users_with_enough_reviews_df,
                                                   how='right',
                                                   on='reviewers')
# The selection threshold is 20 reviews, so the message reports 20 (it used to say 10).
print('# of rating from users with at least 20 interactions: %d' % len(interactions_from_selected_users_df))

In [None]:
# Hold out 20% of each reviewer's interactions (stratified by reviewer).
# random_state pins the shuffle so the split, and every metric computed from
# it, is the same on each run.
interactions_train_df, interactions_test_df = train_test_split(interactions_from_selected_users_df,
                                   stratify=interactions_from_selected_users_df['reviewers'],
                                   test_size=0.20,
                                   random_state=42)

print('# interactions on Train set: %d' % len(interactions_train_df))
print('# interactions on Test set: %d' % len(interactions_test_df))

In [None]:
# Index each interactions frame by reviewer so that one reviewer's rows can be
# fetched with .loc[person_id].
interactions_full_indexed_df = interactions_from_selected_users_df.set_index('reviewers')
interactions_train_indexed_df = interactions_train_df.set_index('reviewers')
interactions_test_indexed_df = interactions_test_df.set_index('reviewers')

In [None]:
def get_items_interacted(person_id, new_df):
    """Return the set of courses that `person_id` has interacted with.

    Args:
        person_id: label looked up in the index of `new_df`.
        new_df: interactions frame indexed by reviewer, with a 'course_name' column.

    Returns:
        set of the 'course_name' values found on the rows of `person_id`.

    Raises:
        KeyError: if `person_id` is not present in the index.
    """
    interacted_items = new_df.loc[person_id]['course_name']
    # .loc yields a Series when several rows match but a bare scalar when
    # exactly one row matches; normalise both cases to a set.
    if isinstance(interacted_items, pd.Series):
        return set(interacted_items)
    return {interacted_items}

In [None]:
#Top-N accuracy metrics consts
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100


class ModelEvaluator:
    """Top-N recall evaluation of a recommender against the held-out test interactions.

    For every test interaction of a user, the interacted item is mixed with a
    random sample of items the user never interacted with, and the evaluator
    checks whether the model ranks the interacted item in the top 5 / top 10 of
    that mix. Reads the notebook globals `interactions_full_indexed_df`,
    `interactions_train_indexed_df`, `interactions_test_indexed_df` and
    `crawled_courses_information`.
    """

    def get_not_interacted_items_sample(self, person_id, sample_size, seed=42):
        """Return a reproducible random set of items `person_id` never interacted with.

        At most `sample_size` items are returned; fewer when the pool of
        non-interacted items is smaller than that.
        """
        interacted_items = get_items_interacted(person_id, interactions_full_indexed_df)
        all_items = set(crawled_courses_information['name'])
        # random.sample() no longer accepts sets (TypeError since Python 3.11);
        # sorting also makes the draw independent of set iteration order.
        non_interacted_items = sorted(all_items - interacted_items)

        # A private generator keeps the draw reproducible without reseeding the
        # process-wide `random` state on every call.
        rng = random.Random(seed)
        sample_size = min(sample_size, len(non_interacted_items))
        return set(rng.sample(non_interacted_items, sample_size))

    def _verify_hit_top_n(self, item_id, recommended_items, topn):
        """Return (hit, index): index of `item_id` in `recommended_items` (-1 if
        absent) and 1/0 for whether that index falls inside the first `topn`."""
        index = next((i for i, c in enumerate(recommended_items) if c == item_id), -1)
        hit = int(0 <= index < topn)
        return hit, index

    def evaluate_model_for_user(self, model, person_id):
        """Compute hits@5/10 and recall@5/10 of `model` for one user.

        Returns:
            dict with keys 'hits@5_count', 'hits@10_count', 'interacted_count',
            'recall@5' and 'recall@10'.
        """
        #Getting the items in test set
        test_items = interactions_test_indexed_df.loc[person_id]['course_name']
        if isinstance(test_items, pd.Series):
            person_interacted_items_testset = set(test_items)
        else:
            # A single test row comes back as a scalar rather than a Series.
            person_interacted_items_testset = {int(test_items)}
        interacted_items_count_testset = len(person_interacted_items_testset)

        #Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.recommend_items(person_id,
                                               items_to_ignore=get_items_interacted(person_id,
                                                                                    interactions_train_indexed_df),
                                               topn=10000000000)

        #Random sample of items the user has not interacted with (assumed to be
        #non-relevant). The seed is fixed, so the sample is identical for every
        #test item and is drawn once instead of once per loop iteration.
        non_interacted_items_sample = self.get_not_interacted_items_sample(
            person_id,
            sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS,
            seed=42)

        hits_at_5_count = 0
        hits_at_10_count = 0
        #For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            #Combining the current interacted item with the sampled items
            items_to_filter_recs = non_interacted_items_sample | {item_id}

            #Keeping only recommendations that are either the interacted item or one of the sampled items
            is_candidate = person_recs_df['course_name'].isin(items_to_filter_recs)
            valid_recs = person_recs_df.loc[is_candidate, 'course_name'].values

            #Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, _ = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, _ = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        #Recall is the rate of the interacted items that are ranked among the Top-N recommended items,
        #when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        person_metrics = {'hits@5_count': hits_at_5_count,
                          'hits@10_count': hits_at_10_count,
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    def evaluate_model(self, model):
        """Evaluate `model` for every user in the test set.

        Returns:
            (global_metrics, detailed_results_df): a dict with 'modelName',
            'recall@5' and 'recall@10', and a per-user DataFrame sorted by
            'interacted_count' in descending order.
        """
        people_metrics = []
        for person_id in interactions_test_indexed_df.index.unique().values:
            person_metrics = self.evaluate_model_for_user(model, person_id)
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)
        # %-formatting (the old code mixed '%d' with str.format and printed a
        # literal '%d'); the count is the number of users actually evaluated.
        print('%d users processed' % len(people_metrics))

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('interacted_count', ascending=False)

        total_interacted = float(detailed_results_df['interacted_count'].sum())
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / total_interacted
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / total_interacted

        global_metrics = {'modelName': model.get_model_name(),
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}
        return global_metrics, detailed_results_df


model_evaluator = ModelEvaluator()

In [None]:
# Remove every run of digits from the tags. The pattern is a raw string with
# regex=True: '\d' is an invalid escape in a normal string literal, and without
# regex=True recent pandas versions look for the literal text '\d+' and leave
# the digits in place.
new_df['tags'] = new_df['tags'].str.replace(r'\d+', '', regex=True)


In [None]:
stopwords_list = stopwords.words('english')

# Word-level TF-IDF limited to 5000 features; terms occurring in fewer than
# 0.3% or in more than 50% of the documents are ignored.
vectorizer = TfidfVectorizer(analyzer='word',
                     min_df=0.003,
                     max_df=0.5,
                     max_features=5000,
                     stop_words=stopwords_list)

# NOTE(review): item_ids lists the crawled courses, whereas tfidf_matrix2 gets one
# row per row of new_df['tags']. Code that uses a position in item_ids as a row
# number of tfidf_matrix2 therefore presumably reads an unrelated row -- confirm,
# and consider fitting on one tags row per course instead.
item_ids = crawled_courses_information['name'].tolist()
tfidf_matrix2 = vectorizer.fit_transform(new_df['tags'])
# get_feature_names() was removed in scikit-learn 1.2; fall back to it only on
# old releases that do not yet provide get_feature_names_out().
if hasattr(vectorizer, 'get_feature_names_out'):
    tfidf_feature_names2 = list(vectorizer.get_feature_names_out())
else:
    tfidf_feature_names2 = vectorizer.get_feature_names()
tfidf_matrix2

In [None]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()


def stem(text):
    """Porter-stem every whitespace-separated token of `text` and re-join the
    stems with single spaces."""
    return " ".join(ps.stem(token) for token in text.split())


# Stem the tags column of the training interactions.
interactions_train_indexed_df['tags'] = interactions_train_indexed_df['tags'].map(stem)


In [None]:
# NOTE(review): tfidf_matrix2 was fitted on new_df['tags'] in an earlier cell, so
# stemming the column here does not change that matrix -- confirm the intended order.
new_df['tags'] = new_df['tags'].apply(stem) #applying stemming on the tags column


In [None]:
# Display the tags column after stemming.
new_df['tags']

In [None]:
# (number of rows, number of features) of the TF-IDF matrix.
tfidf_matrix2.shape

In [None]:
def get_item_profile(item_id):
    """Return the single-row slice of tfidf_matrix2 at the position of `item_id` in item_ids.

    Raises ValueError (from list.index) when `item_id` is not in item_ids.
    """
    # NOTE(review): item_ids comes from crawled_courses_information['name'] while
    # tfidf_matrix2 is fitted on new_df['tags']; confirm that position idx in
    # item_ids really is the matching matrix row.
    idx = item_ids.index(item_id)
    item_profile = tfidf_matrix2[idx:idx+1]
    return item_profile

def get_item_profiles(ids):
    """Stack the profiles of all `ids` vertically into one sparse matrix."""
    item_profiles_list = [get_item_profile(x) for x in ids]
    item_profiles = scipy.sparse.vstack(item_profiles_list)
    return item_profiles

def build_users_profile(person_id, interactions_indexed_df):
    """Build the profile array stored for `person_id`.

    Returns the row-normalised ratings column as an (n, 1) array.
    """
    interactions_person_df = interactions_indexed_df.loc[person_id]
    # NOTE(review): user_item_profiles is computed but not used in the value
    # returned below (the weighted average that consumed it is commented out).
    user_item_profiles = get_item_profiles(interactions_person_df['course_name'])
    
    # NOTE(review): the strengths are read from every row of the global new_df, not
    # from interactions_person_df, so the returned array does not depend on
    # person_id -- confirm whether the per-user ratings were intended.
    user_item_strengths = np.array(new_df['rating']).reshape(-1,1)
#     Weighted average of item profiles by the interactions strength
#     user_item_strengths_weighted_avg = np.sum(user_item_profiles.multiply(user_item_strengths), axis=0) / np.sum(user_item_strengths)
    user_profile_norm = sklearn.preprocessing.normalize(user_item_strengths)
    return user_profile_norm

def build_users_profiles(): 
    """Return a dict mapping every reviewer in the filtered interactions to their profile."""
    # NOTE(review): 'course_name' holds label-encoded values, whereas courses['name']
    # is not encoded anywhere in the visible cells; verify that this isin() filter
    # keeps any rows.
    interactions_indexed_df = interactions_from_selected_users_df[interactions_from_selected_users_df['course_name'] \
                                                   .isin(courses['name'])].set_index('reviewers')
    user_profiles = {}
    for person_id in interactions_indexed_df.index.unique():
        user_profiles[person_id] = build_users_profile(person_id, interactions_indexed_df)
    return user_profiles

In [None]:
# Build one profile per reviewer and show how many reviewers received one.
user_profiles = build_users_profiles()
len(user_profiles)

In [None]:
class ContentBasedRecommender:
    """Recommender that ranks courses by cosine similarity against TF-IDF vectors.

    Reads the notebook globals `item_ids`, `user_profiles` and `tfidf_matrix2`.
    """

    MODEL_NAME = 'Content-Based'

    def __init__(self, items_df=None):
        """Store the item id list and the optional item metadata frame
        (the frame is only needed for verbose recommendations)."""
        self.item_ids = item_ids
        self.items_df = items_df

    def get_model_name(self):
        """Return the display name of the model."""
        return self.MODEL_NAME

    def _get_similar_items_to_user_profile(self, person_id, topn=1000):
        """Return up to `topn` (item id, similarity) pairs, most similar first."""
        # NOTE(review): cosine_similarity compares the rows of its two arguments and
        # both arguments are transposed here, so the scores are indexed by the
        # columns of tfidf_matrix2 -- confirm that these indices line up with item_ids.
        cosine_similarities = cosine_similarity(user_profiles[person_id].transpose(), tfidf_matrix2.transpose())
        #Gets the top similar items
        similar_indices = cosine_similarities.argsort().flatten()[-topn:]
        #Sort the similar items by similarity
        similar_items = sorted([(item_ids[i], cosine_similarities[0, i]) for i in similar_indices],
                               key=lambda x: -x[1])
        return similar_items

    def recommend_items(self, user_id, items_to_ignore=None, topn=10, verbose=False):
        """Return a DataFrame of the `topn` strongest recommendations for `user_id`.

        Args:
            user_id: key into the global `user_profiles`.
            items_to_ignore: container of item ids to exclude (None means exclude nothing).
            topn: maximum number of rows to return.
            verbose: when True, join the recommendations with `items_df`.

        Returns:
            DataFrame with the columns 'course_name' and 'recStrength'
            ('recStrength', 'course_name' in verbose mode).

        Raises:
            Exception: if verbose is True and no `items_df` was supplied.
        """
        # None instead of a shared mutable [] default.
        ignored = items_to_ignore if items_to_ignore is not None else ()
        similar_items = self._get_similar_items_to_user_profile(user_id)
        #Ignores items the user has already interacted
        similar_items_filtered = [item for item in similar_items if item[0] not in ignored]

        recommendations_df = pd.DataFrame(similar_items_filtered, columns=['course_name', 'recStrength']) \
                                    .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            # The metadata frame may still carry the course key under its original
            # name 'name'; merging on a missing 'course_name' would raise KeyError.
            right_key = 'course_name' if 'course_name' in self.items_df.columns else 'name'
            recommendations_df = recommendations_df.merge(self.items_df, how='left',
                                                          left_on='course_name',
                                                          right_on=right_key)[['recStrength', 'course_name']]

        return recommendations_df


content_based_recommender_model = ContentBasedRecommender(crawled_courses_information)

In [None]:
# Run the Top-N evaluation for the TF-IDF content-based model, print the global
# recall figures and show the per-user results for the ten rows with the most test interactions.
print('Evaluating Content-Based Filtering model...')
cb_global_metrics, cb_detailed_results_df = model_evaluator.evaluate_model(content_based_recommender_model)
print('Global metrics:',(cb_global_metrics))
cb_detailed_results_df.head(10)

In [None]:
#Word2vec
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
from matplotlib import pyplot
from gensim.models import KeyedVectors
import gensim.downloader as api
import gensim

In [None]:
import gensim.downloader as api
# return_path=True makes api.load return the path of the downloaded dataset file
# rather than a loaded model.
# NOTE(review): `path` is not referenced again in the visible cells; the Word2Vec
# model used below is created separately.
path = api.load("word2vec-google-news-300", return_path=True)
print(path)

In [None]:
# Tokenise every tags string with gensim's simple_preprocess (one list of
# lower-cased tokens per row) and display the result.
tags_new=new_df.tags.apply(gensim.utils.simple_preprocess)
tags_new

In [None]:
# Model parameters: CBOW (sg=0), context window of 5, words seen fewer than
# twice are dropped from the vocabulary.
model = gensim.models.Word2Vec(window=5, min_count=2, workers=4, sg=0)

# Train the model on the tokenised tags. Without these two calls the model has
# an empty vocabulary, every word lookup in the embedding cell misses and no
# embeddings are produced.
model.build_vocab(tags_new, progress_per=1000)
model.train(tags_new, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
# Persist the Word2Vec model to the Kaggle working directory.
model.save("/kaggle/working/tags.model")

In [None]:
# model = Word2Vec.load('/kaggle/working/tags.model')

In [None]:
    # One averaged word vector per row of new_df['tags'] (tags text -> dense vector).
    word_embeddings = []

    for line in new_df['tags']:
        # `word in model.wv` / `model.wv[word]` work on both gensim 3 and gensim 4,
        # unlike `model.wv.vocab` and `model[word]`, which gensim 4 removed.
        known_vectors = [model.wv[word] for word in line.split() if word in model.wv]
        if known_vectors:
            word_embeddings.append(np.mean(known_vectors, axis=0))
        else:
            # Keep a zero placeholder so that row i of word_embeddings always
            # belongs to row i of new_df; skipping the row would shift every
            # later embedding onto the wrong course.
            word_embeddings.append(np.zeros(model.wv.vector_size, dtype=np.float32))

In [None]:
# Dump the text representation of every embedding, each followed by a newline.
with open('word_embeddings.txt', 'w') as f:
    f.writelines(f"{line}\n" for line in word_embeddings)

In [None]:
# Convert the list of per-row embeddings into a single NumPy array.
word2vec = np.array(word_embeddings)

In [None]:
# These four functions redefine (and shadow) the TF-IDF versions defined earlier,
# reading item vectors from `word2vec` instead of `tfidf_matrix2`.
def get_item_profile(item_id):
    """Return the single-row slice of word2vec at the position of `item_id` in item_ids.

    Raises ValueError (from list.index) when `item_id` is not in item_ids.
    """
    # NOTE(review): item_ids comes from crawled_courses_information['name'] while
    # word2vec holds embeddings computed from new_df['tags']; confirm that position
    # idx in item_ids really is the matching embedding row.
    idx = item_ids.index(item_id)
    item_profile = word2vec[idx:idx+1]
    return item_profile

def get_item_profiles(ids):
    """Stack the profiles of all `ids` vertically via scipy.sparse.vstack."""
    item_profiles_list = [get_item_profile(x) for x in ids]
    # NOTE(review): the slices of word2vec are dense arrays while vstack here is the
    # sparse variant -- presumably carried over from the TF-IDF version; confirm.
    item_profiles = scipy.sparse.vstack(item_profiles_list)
    return item_profiles

def build_users_profile(person_id, interactions_indexed_df):
    """Build the profile array stored for `person_id`.

    Returns the row-normalised ratings column as an (n, 1) array.
    """
    interactions_person_df = interactions_indexed_df.loc[person_id]
    # NOTE(review): user_item_profiles is computed but not used in the value
    # returned below (the weighted average that consumed it is commented out).
    user_item_profiles = get_item_profiles(interactions_person_df['course_name'])
    
    # NOTE(review): the strengths are read from every row of the global new_df, not
    # from interactions_person_df, so the returned array does not depend on
    # person_id -- confirm whether the per-user ratings were intended.
    user_item_strengths = np.array(new_df['rating']).reshape(-1,1)
#     Weighted average of item profiles by the interactions strength
#     user_item_strengths_weighted_avg = np.sum(user_item_profiles.multiply(user_item_strengths), axis=0) / np.sum(user_item_strengths)
    user_profile_norm = sklearn.preprocessing.normalize(user_item_strengths)
    return user_profile_norm

def build_users_profiles(): 
    """Return a dict mapping every reviewer in the filtered interactions to their profile."""
    # NOTE(review): 'course_name' holds label-encoded values, whereas courses['name']
    # is not encoded anywhere in the visible cells; verify that this isin() filter
    # keeps any rows.
    interactions_indexed_df = interactions_from_selected_users_df[interactions_from_selected_users_df['course_name'] \
                                                   .isin(courses['name'])].set_index('reviewers')
    user_profiles = {}
    for person_id in interactions_indexed_df.index.unique():
        user_profiles[person_id] = build_users_profile(person_id, interactions_indexed_df)
    return user_profiles

In [None]:
class ContentBasedRecommender:
    """Recommender that ranks courses by cosine similarity against the averaged
    word2vec vectors.

    Redefines the TF-IDF class of the same name. Reads the notebook globals
    `item_ids`, `user_profiles` and `word2vec`.
    """

    MODEL_NAME = 'Content-Based'

    def __init__(self, items_df=None):
        """Store the item id list and the optional item metadata frame."""
        self.item_ids = item_ids
        self.items_df = items_df

    def get_model_name(self):
        """Return the display name of the model."""
        return self.MODEL_NAME

    def _get_similar_items_to_user_profile(self, person_id, topn=1000):
        """Return up to `topn` (item id, similarity) pairs, most similar first."""
        # NOTE(review): cosine_similarity compares the rows of its two arguments and
        # both arguments are transposed here, so the scores are indexed by the
        # columns of word2vec -- confirm that these indices line up with item_ids.
        cosine_similarities = cosine_similarity(user_profiles[person_id].transpose(), word2vec.transpose())
        #Gets the top similar items
        similar_indices = cosine_similarities.argsort().flatten()[-topn:]
        #Sort the similar items by similarity
        similar_items = sorted([(item_ids[i], cosine_similarities[0, i]) for i in similar_indices],
                               key=lambda x: -x[1])
        return similar_items

    def recommend_items(self, user_id, items_to_ignore=None, topn=10, verbose=False):
        """Return a DataFrame of the `topn` strongest recommendations for `user_id`.

        Args:
            user_id: key into the global `user_profiles`.
            items_to_ignore: container of item ids to exclude (None means exclude nothing).
            topn: maximum number of rows to return.
            verbose: only validated here -- it requires `items_df` but does not
                add any columns to the result.

        Returns:
            DataFrame with the columns 'course_name' and 'recStrength'.

        Raises:
            Exception: if verbose is True and no `items_df` was supplied.
        """
        # None instead of a shared mutable [] default.
        ignored = items_to_ignore if items_to_ignore is not None else ()
        similar_items = self._get_similar_items_to_user_profile(user_id)
        #Ignores items the user has already interacted
        similar_items_filtered = [item for item in similar_items if item[0] not in ignored]

        recommendations_df = pd.DataFrame(similar_items_filtered, columns=['course_name', 'recStrength']) \
                                    .head(topn)

        if verbose and self.items_df is None:
            raise Exception('"items_df" is required in verbose mode')

        return recommendations_df


content_based_recommender_model = ContentBasedRecommender(crawled_courses_information)

In [None]:
# Run the Top-N evaluation for the word2vec content-based model; this overwrites
# cb_global_metrics / cb_detailed_results_df from the TF-IDF evaluation.
print('Evaluating Content-Based Filtering model...')
cb_global_metrics, cb_detailed_results_df = model_evaluator.evaluate_model(content_based_recommender_model)
print('Global metrics:',(cb_global_metrics))
cb_detailed_results_df.head(10)

In [None]:
# Rename 'name' to 'course_name' in place so the metadata frame can be merged on
# 'course_name' by inspect_interactions below.
crawled_courses_information.rename( columns = {'name':'course_name'}, inplace=True)

In [None]:
def inspect_interactions(person_id, test_set=True):
    """Return the interactions of `person_id` joined with the course metadata.

    Args:
        person_id: reviewer label looked up in the index of the interactions frame.
        test_set: read from the test interactions when True, else from the training ones.

    Returns:
        DataFrame with the columns 'rating', 'course_name' and 'category',
        sorted by rating in descending order.

    Raises:
        KeyError: if `person_id` is not present in the selected frame.
    """
    interactions_df = interactions_test_indexed_df if test_set else interactions_train_indexed_df
    # .loc[[person_id]] always yields a DataFrame; .loc[person_id] returns a
    # Series for a reviewer with a single row, and a Series has no .merge().
    person_interactions = interactions_df.loc[[person_id]]
    merged = person_interactions.merge(crawled_courses_information, how='left', on='course_name')
    return merged.sort_values('rating', ascending=False)[['rating', 'course_name', 'category']]

In [None]:
# Show up to 20 training interactions of one example reviewer.
# NOTE(review): the 'reviewers' column was label-encoded to integers earlier, so
# looking up the raw name 'By Vivek B' presumably raises KeyError -- confirm and
# pass the encoded id instead.
inspect_interactions('By Vivek B', test_set=False).head(20)

In [None]:
# Top-20 recommendations for the same example reviewer.
# NOTE(review): user_profiles is keyed by the label-encoded reviewer ids, so the
# raw name 'By Vivek B' presumably raises KeyError -- confirm and pass the encoded id.
content_based_recommender_model.recommend_items('By Vivek B', topn=20, verbose=True)