In [1]:
import os
import pandas as pd
import numpy as np
import ast


In [2]:
# Move from the `notebooks/` folder to its sibling `data/` folder.
# Only the final path component is swapped: a plain str.replace on the whole
# path would also rewrite any parent folder whose name contains "notebooks".
# When the kernel is already in the data folder (cell re-run) this is a no-op.
cwd = os.getcwd()
if os.path.basename(cwd) == 'notebooks':
    os.chdir(os.path.join(os.path.dirname(cwd), 'data'))

# Relative path: resolved against the working directory selected above.
courses = pd.read_csv('clean.csv')


In [3]:
# Preview the first five rows of the loaded frame.
courses.head(n=5)


Unnamed: 0,course_title,tags
0,Ultimate Investment Banking Course,"['PaidCourse', 'PriceVeryHigh', 'AllLevels', '..."
1,Complete GST Course & Certification - Grow You...,"['PaidCourse', 'PriceLow', 'AllLevels', 'Durat..."
2,Financial Modeling for Business Analysts and C...,"['PaidCourse', 'PriceVeryLow', 'IntermediateLe..."
3,Beginner to Pro - Financial Analysis in Excel ...,"['PaidCourse', 'PriceLow', 'AllLevels', 'Durat..."
4,How To Maximize Your Profits Trading Options,"['PaidCourse', 'PriceVeryHigh', 'IntermediateL..."


In [4]:
# Raw tag column: one string per course, each the textual form of a Python list.
corups = courses.loc[:, 'tags'].values


In [5]:
# Number of tag strings, i.e. one entry per course row.
np.shape(corups)


(3677,)

In [6]:
from nltk.stem import PorterStemmer

# Single shared Porter stemmer instance; convert_tags() reads it as a global.
stemmer = PorterStemmer()


def convert_tags(tags):
    """Turn the string form of a tag list into one space-separated document.

    Parameters
    ----------
    tags : str
        Python-literal text of an iterable of tag strings,
        e.g. ``"['PaidCourse', 'AllLevels']"``.

    Returns
    -------
    str
        The stemmed tags joined by single spaces and lower-cased.

    Notes
    -----
    Relies on the module-level ``stemmer`` object. ``ast.literal_eval``
    raises if ``tags`` is not a valid Python literal.
    """
    parsed_tags = ast.literal_eval(tags)
    stemmed_tags = map(stemmer.stem, parsed_tags)
    document = " ".join(stemmed_tags)
    return document.lower()


In [7]:
# Sanity-check the conversion on the first course. The raw string is read
# from the DataFrame rather than from `corups`, so this cell still works
# after `corups` has been overwritten with already-converted documents.
convert_tags(courses['tags'].iloc[0])


'paidcours priceveryhigh alllevel durationverylow businessfin ultim invest bank cours'

In [8]:
# Build the corpus from the untouched DataFrame column instead of from
# `corups` itself: the cell is then idempotent, and a re-run no longer feeds
# already-converted text back into ast.literal_eval.
corups = [convert_tags(tags) for tags in courses['tags']]


In [9]:
# Show the first four converted documents.
corups[0:4]


['paidcours priceveryhigh alllevel durationverylow businessfin ultim invest bank cours',
 'paidcours pricelow alllevel durationmedium businessfin complet gst cours & certif - grow your ca practic',
 'paidcours priceverylow intermediatelevel durationverylow businessfin financi model for busi analyst and consult',
 'paidcours pricelow alllevel durationverylow businessfin beginn to pro - financi analysi in excel 2017']

In [10]:
from sklearn.feature_extraction.text import CountVectorizer


In [11]:
# Bag-of-words encoder: English stop words are removed and the vocabulary
# is capped at 8000 terms.
vectorizer = CountVectorizer(
    stop_words='english',
    max_features=8000,
)


In [12]:
# Learn the vocabulary from the corpus and encode each document as a sparse
# row of token counts.
vectors = vectorizer.fit_transform(raw_documents=corups)


In [13]:
# Inspect the encoded first document (a 1 x n_features sparse row).
vectors[0:1]


<1x3312 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [14]:
# Peek at the first 100 entries of the learned vocabulary.
feature_names = vectorizer.get_feature_names_out()
feature_names[:100]


array(['00005', '001', '01', '02', '06z', '07', '08', '09', '10', '100',
       '101', '102', '10th', '10分でkawaiiカメレオンを描こう', '11', '1105', '115',
       '12', '123d', '13', '14', '1403', '15', '150', '16', '163',
       '16t19', '16z', '17', '175', '18', '183', '1872', '188', '19',
       '1942', '1year', '20', '200', '201', '2012', '2013', '2014',
       '2015', '2016', '2017', '2018', '2020', '21', '23', '23t00', '24',
       '24hr', '25', '263432', '27t20', '28', '29t00', '2d', '30',
       '30t15', '31', '33z', '34', '35', '365', '38', '398746', '39z',
       '3d', '40', '42038', '43', '45', '45z', '461', '462', '48', '4d',
       '50', '500', '51', '56', '58', '59', '5k', '5th', '60', '60min',
       '61', '63', '650804', '66', '70', '72', '80', '800', '874284',
       '88', '90'], dtype=object)

In [15]:
from sklearn.metrics.pairwise import cosine_similarity


In [16]:
# Cosine similarity of the course in row 1 against every course row.
courses_similarities = cosine_similarity(X=vectors[1], Y=vectors)
courses_similarities


array([[0.38490018, 1.        , 0.18257419, ..., 0.19245009, 0.17407766,
        0.08333333]])

In [17]:
def recomend(course, k = 30):
    """Return the titles of the ``k`` courses most similar to ``course``.

    Similarity is the cosine similarity between the count vector of the
    requested course and every row of the module-level ``vectors`` matrix.

    Parameters
    ----------
    course : str
        Exact ``course_title`` to look up in the module-level ``courses``
        frame. If several rows share the title, the first one is used.
    k : int, default 30
        Maximum number of recommendations. Values below 1 give an empty list.

    Returns
    -------
    list of str
        Course titles ordered from most to least similar. The queried row
        itself is never part of the result.

    Raises
    ------
    IndexError
        If no row of ``courses`` carries the given title.
    """
    # Work with positional row numbers: rows of `vectors` correspond to rows
    # of `courses` by position, which is not necessarily the index label.
    matches = np.flatnonzero((courses['course_title'] == course).to_numpy())
    if matches.size == 0:
        raise IndexError(f"course title not found: {course!r}")
    index = int(matches[0])

    course_similarities = cosine_similarity(vectors[index], vectors)[0]

    # Stable descending sort: tied scores stay in row order, the same order
    # sorted(..., reverse=True) produces.
    ranked = np.argsort(-course_similarities, kind='stable')

    # Remove the query row explicitly rather than assuming it is ranked first;
    # with tied (or all-zero) similarities another row can take the top spot.
    ranked = ranked[ranked != index][:max(k, 0)]

    return courses['course_title'].iloc[ranked].tolist()


In [18]:
# Spot-check ten titles together with how often each occurs. The fixed seed
# keeps the displayed sample identical across Restart & Run All.
courses['course_title'].value_counts().sample(10, random_state=42)


course_title
Working Capital Management for CA / CFA / CPA Exams            1
How to Build Profitable FOREX Automated Trading Strategies!    1
Quote Images for Pinterest, Facebook, & Instagram              1
Stock Markets: How I Became Rich By Changing How I Trade       1
Instant Harmonica - play Amazing Grace + Ode to Joy now!       1
Projects in PHP and MySQL                                      1
Create Startup Landing Page with Viral Marketing Strategies    1
Learn Corel x7 Like a Pro                                      1
Ferramentas Visuais                                            1
Learn Facebook Flux Architecture for Web Applications          1
Name: count, dtype: int64

In [19]:
# Example query: recommendations for one piano course (default k).
recomend(course='Introduction to Piano - By PGN Piano!')


['How to Play Piano - Your First Lesson!',
 'Aprendiendo Piano: Ejercicios de calentamiento',
 'What you can learn before Piano Lessons',
 'Piano With Willie: Piano Chords Vol. 1',
 'Piano Music Theory (Back To Basics) by JFilt',
 "World's Fastest Piano Method - The Piano Revolution!",
 'Piano From Zero To Pro - Beginner Essentials To The Piano',
 'Learn all Piano Chords in all Piano Scales with logic',
 'Your First Course on Piano',
 'Beginner Piano',
 'Painless Piano!',
 'Piano Lessons For Beginners: Learn Piano Quickly And Easily',
 'Piano Lessons for Beginners: Play the Piano AND Read Music!',
 'Easy Piano for Kids - Complete Beginner Piano Course',
 'PI-101 Top 10 Classical Piano Pieces for Beginners',
 'Piano Building Blocks: Learn Chord Additions & Variations',
 'Play Piano To-day! - Beginner Piano Lessons For Busy People!',
 'Just chords Piano: Learn to Play Piano Quickly - No Music',
 'Learn to Play Piano Like a Pro - Easy Piano Course 1',
 'Piano Lessons For Beginners',
 'Pla

In [20]:
import joblib


In [22]:
# Persist the sparse count-vector matrix. The file name announces vectors,
# so write `vectors` here rather than the `courses` DataFrame.
joblib.dump(vectors, 'courses_vectors.joblib')


['courses_vectors.joblib']

In [23]:
# Persist the course titles as an array, in the same row order as `courses`.
joblib.dump(
    value=courses['course_title'].values,
    filename='courses_title.joblib',
)


['courses_title.joblib']