# Content Base Recommender

Initial code: \
https://www.kaggle.com/code/zeynepduvarci/course-recommendation-content-and-collaborative/notebook#Content-Base-Recommender


Datasets:\
https://www.kaggle.com/datasets/imuhammad/course-reviews-on-coursera?resource=download&select=Coursera_reviews.csv
https://www.kaggle.com/datasets/khusheekapoor/coursera-courses-dataset-2021

In [33]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
data=pd.read_csv('data/Coursera.csv')
df=data.copy()

In [35]:
df.head()

Unnamed: 0,Course Name,University,Difficulty Level,Course Rating,Course URL,Course Description,Skills
0,Write A Feature Length Screenplay For Film Or ...,Michigan State University,Beginner,4.8,https://www.coursera.org/learn/write-a-feature...,Write a Full Length Feature Film Script In th...,Drama Comedy peering screenwriting film D...
1,Business Strategy: Business Model Canvas Analy...,Coursera Project Network,Beginner,4.8,https://www.coursera.org/learn/canvas-analysis...,"By the end of this guided project, you will be...",Finance business plan persona (user experien...
2,Silicon Thin Film Solar Cells,�cole Polytechnique,Advanced,4.1,https://www.coursera.org/learn/silicon-thin-fi...,This course consists of a general presentation...,chemistry physics Solar Energy film lambda...
3,Finance for Managers,IESE Business School,Intermediate,4.8,https://www.coursera.org/learn/operational-fin...,"When it comes to numbers, there is always more...",accounts receivable dupont analysis analysis...
4,Retrieve Data using Single-Table SQL Queries,Coursera Project Network,Beginner,4.6,https://www.coursera.org/learn/single-table-sq...,In this course you�ll learn how to effectively...,Data Analysis select (sql) database manageme...


In [36]:
# null values count
df.isnull().sum()

Course Name           0
University            0
Difficulty Level      0
Course Rating         0
Course URL            0
Course Description    0
Skills                0
dtype: int64

In [37]:
df=df[['Course Name','Difficulty Level','Course Description','Skills']]

In [72]:
print(df['Skills'].unique())

['Drama Comedy peering screenwriting film Document Review dialogue creative writing Writing unix shells arts and humanities music and art'
 'Finance business plan persona user experience business model canvas Planning Business project Product Development presentation Strategy business business strategy'
 'chemistry physics Solar Energy film lambda calculus Electrical Engineering electronics energy silicon thinning physical science and engineering electrical engineering'
 ...
 'analytics tableau software Business Intelligence Statistical Dispersion software Demand Forecasting General Statistics Data Analysis measurement analysis business business essentials'
 'Angular Mechanical Design fluid mechanics Framing 3d rig molecular dynamics classical mechanics energy lecture robotics physical science and engineering mechanical engineering'
 'bigtable bigquery SQL Google Cloud Platform role based access control Cloud Storage Kubernetes identity management Cloud Computing Cloud Platforms comput

## preprocessing

In [38]:
def remove_punctuation(txt):
  txt_nopunt= "".join([c for c in txt if c not in string.punctuation])
  return txt_nopunt

In [39]:
nltk.download('stopwords')
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marti\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [40]:
df['Course Description'] = df['Course Description'].str.replace('http\S+|www.\S+', '', case=False)

  df['Course Description'] = df['Course Description'].str.replace('http\S+|www.\S+', '', case=False)


In [41]:
df['Course Description']=df['Course Description'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
df['Course Name']=df['Course Name'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
df['Skills']=df['Skills'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

df['Course Name']=df['Course Name'].replace(","," ",regex=True)
df['Course Description']=df['Course Description'].replace(","," ",regex=True)
df['Skills']=df['Skills'].replace(","," ",regex=True)
df['Skills']=df['Skills'].replace("-"," ",regex=True)

df['Course Name']=df['Course Name'].apply(lambda x: remove_punctuation(x))
df['Course Description']=df['Course Description'].apply(lambda x: remove_punctuation(x))
df['Skills']=df['Skills'].apply(lambda x: remove_punctuation(x))

df['Course Description']=df['Course Description'].replace("�","'",regex=True)

df['tags']=df['Course Name']+' '+ df['Difficulty Level']+' '+ df['Course Description']+' '+df['Skills']
new_df=df[['Course Name','tags']]

In [42]:
new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())


In [43]:
new_df['tags']=new_df['tags'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))


In [44]:
new_df['tags'][0]


'write feature length screenplay film television beginner write full length feature film script course write complete featurelength screenplay film television serious drama romantic comedy anything learn break creative process components discover structured process allows produce polished pitchready script end course completing project increase confidence ideas abilities feel prepared pitch first script get started next course designed tap creativity based active learning actual learning takes place within activities writing learn link trailer course view trailer please copy paste link browser learner review love approach professor wheeler takes towards course point easy follow informative would definitely recommend anyone interested taking screenplay writing course course curriculum simple adopt professional writers room process write post work peer review share feedback peers revise work feedback receive peers thats real world feel professional writers room yet prior experience write

### stemming

In [45]:
ps=PorterStemmer()

In [46]:
def stem(text):
  y=[]
  
  for i in text.split():
    y.append(ps.stem(i))
  
  return " ".join(y)

In [47]:
new_df['tags']=new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(stem)


In [48]:
new_df['tags'][0]

'write featur length screenplay film televis beginn write full length featur film script cours write complet featurelength screenplay film televis seriou drama romant comedi anyth learn break creativ process compon discov structur process allow produc polish pitchreadi script end cours complet project increas confid idea abil feel prepar pitch first script get start next cours design tap creativ base activ learn actual learn take place within activ write learn link trailer cours view trailer pleas copi past link browser learner review love approach professor wheeler take toward cours point easi follow inform would definit recommend anyon interest take screenplay write cours cours curriculum simpl adopt profession writer room process write post work peer review share feedback peer revis work feedback receiv peer that real world feel profession writer room yet prior experi writer requir im propon experienti learn activ learn lectur short sometim two minut long point design stepbystep pro

## text vectorization

In [49]:
cv=CountVectorizer()

In [50]:
vectors=cv.fit_transform(new_df['tags']).toarray()

## similarity measure

In [51]:
similarity=cosine_similarity(vectors)

In [52]:
display(new_df)

Unnamed: 0,Course Name,tags
0,Write A Feature Length Screenplay For Film Or ...,write featur length screenplay film televis be...
1,Business Strategy Business Model Canvas Analys...,busi strategi busi model canva analysi miro be...
2,Silicon Thin Film Solar Cells,silicon thin film solar cell advanc cours cons...
3,Finance Managers,financ manag intermedi come number alway meet ...
4,Retrieve Data using SingleTable SQL Queries,retriev data use singlet sql queri beginn cour...
...,...,...
3517,Capstone Retrieving Processing Visualizing D...,capston retriev process visual data python beg...
3518,Patrick Henry Forgotten Founder,patrick henri forgotten founder intermedi 'giv...
3519,Business intelligence data analytics Generate ...,busi intellig data analyt gener insight advanc...
3520,Rigid Body Dynamics,rigid bodi dynam beginn cours teach dynam one ...


In [53]:
def recommend(course):
  course_index=new_df[new_df['Course Name']==course].first_valid_index()
  distances=similarity[course_index]
  dict={}
  for i in range(0,len(new_df['tags'])):
    dict[i]=distances[i]
  dict=sorted(dict.items(),key=lambda x:x[1], reverse=True)
  for i in range(1,7):
    index=dict[i][0]
    print(new_df['Course Name'][index])

In [54]:
recommend('Retrieve Data using SingleTable SQL Queries')

Creating Database Tables SQL
Advanced SQL Retrieval Queries SQLiteStudio
Create Relational Database Tables Using SQLiteStudio
Manipulating Data SQL
Databases SQL Data Science
Databases SQL Data Science


## tf-idf

In [55]:
tfidf=TfidfVectorizer()
tfidf_matrix=tfidf.fit_transform(df['tags'])

In [56]:
similarity=cosine_similarity(tfidf_matrix)

In [57]:
recommend('Retrieve Data using SingleTable SQL Queries')

Creating Database Tables SQL
Manipulating Data SQL
Advanced Relational Database SQL
Create Relational Database Tables Using SQLiteStudio
Intermediate Relational Database SQL
Advanced SQL Retrieval Queries SQLiteStudio


# Collaborative Based recommender

In [58]:
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate

from collections import defaultdict

In [59]:
data=pd.read_csv('data/Coursera_reviews.csv')

In [60]:
data.head()

Unnamed: 0,reviews,reviewers,date_reviews,rating,course_id
0,"Pretty dry, but I was able to pass with just t...",By Robert S,"Feb 12, 2020",4,google-cbrs-cpi-training
1,would be a better experience if the video and ...,By Gabriel E R,"Sep 28, 2020",4,google-cbrs-cpi-training
2,Information was perfect! The program itself wa...,By Jacob D,"Apr 08, 2020",4,google-cbrs-cpi-training
3,A few grammatical mistakes on test made me do ...,By Dale B,"Feb 24, 2020",4,google-cbrs-cpi-training
4,Excellent course and the training provided was...,By Sean G,"Jun 18, 2020",4,google-cbrs-cpi-training


In [61]:
data=data[['reviewers','course_id','rating']]

In [62]:
reviewers=data['reviewers'].unique()

In [63]:
reviewers=reviewers[:270000]

In [64]:
data=data[~data['reviewers'].isin(reviewers)]

In [65]:
data=data.pivot_table(index=['reviewers'],columns=['course_id'],values='rating').fillna(0)

In [66]:
data=data.reset_index().melt(id_vars=['reviewers'])

In [67]:
reader=Reader(rating_scale=(0,5))
data=Dataset.load_from_df(data,reader)
type(data)

surprise.dataset.DatasetAutoFolds

In [68]:
trainset, testset= train_test_split(data, test_size=.25,shuffle=True)

### co-clustering

In [69]:
svd_model=SVD()
svd_model.fit(trainset)
cross_validate(svd_model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.6815  0.6843  0.6794  0.6817  0.6823  0.6819  0.0016  
MAE (testset)     0.2045  0.2073  0.2066  0.2054  0.2051  0.2058  0.0010  
Fit time          6.12    5.65    5.49    5.54    6.92    5.94    0.54    
Test time         2.00    0.91    0.89    0.92    0.94    1.13    0.43    


{'test_rmse': array([0.68151934, 0.68434403, 0.67939542, 0.68174655, 0.68226891]),
 'test_mae': array([0.20453839, 0.20732556, 0.20659199, 0.20536439, 0.20506296]),
 'fit_time': (6.119314670562744,
  5.646772861480713,
  5.491268157958984,
  5.543216228485107,
  6.919079780578613),
 'test_time': (1.9999663829803467,
  0.907700777053833,
  0.8942122459411621,
  0.9192681312561035,
  0.9410512447357178)}

In [70]:
predictions=svd_model.test(testset)

### recommendation

In [71]:
top_n = defaultdict(list)
n=3
for uid, iid, true_r, est, _ in predictions:
  top_n[uid].append((iid, est))

for uid, user_ratings in top_n.items():
  user_ratings.sort(key=lambda x: x[1], reverse=True)
  top_n[uid] = user_ratings[:n]

for uid, user_ratings in top_n.items():
  print(uid, user_ratings)

By Demyd D [('embedded-software-hardware', 0), ('uva-darden-agile-team-management', 0), ('pca-machine-learning', 0)]
By Tamara  [('object-oriented-java', 0.413712367105202), ('theropods-birds', 0.06315760003017568), ('magic-middle-ages', 0.05570904728291222)]
By MUNISH K  [('machine-learning', 2.720785905339749), ('indigenous-canada', 0.3915394391301738), ('aulaconstructivista', 0.06258620540506134)]
By MAYUR B [('machine-learning', 4.750153983398765), ('object-oriented-java', 0.1290393743257239), ('competencias-lab', 0.02603451481013485)]
By Katarzyna H  [('international-law-in-action', 0.04698460543673433), ('spectroscopy', 0.039886310203317024), ('memoir-reader-relationship', 0.034483422891915506)]
By Wong Y M [('machine-learning', 4.5557919786484025), ('actualizacion-manejo-diabetes-tipo-2', 0.06611249298192594), ('mathematics-and-python', 0.05736691423796489)]
By alekhya [('object-oriented-java', 0.7005950286168294), ('tricky-american-english-pronunciation', 0.29555181008898457), 