Project: 3 Course Recommender

Problem Statement:
Recommend learning courses based on past course activity and interests.

In [3]:
import os
import ast
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



In [4]:
os.chdir(os.getcwd().replace('notebooks', 'data'))

courses = pd.read_csv('courses.csv')


In [5]:
courses.columns = courses.columns.str.replace(' ', '_')


In [6]:
courses.columns


Index(['course_id', 'course_title', 'url', 'is_paid', 'price',
       'num_subscribers', 'num_reviews', 'num_lectures', 'level',
       'content_duration', 'published_timestamp', 'subject'],
      dtype='object')

In [9]:
selected_columns = [
    'course_title',
    'is_paid',
    'price',
    'level',
    'content_duration',
    'subject'    
]


In [11]:
df = courses[selected_columns]


In [12]:
df.sample(5)


Unnamed: 0,course_title,is_paid,price,level,content_duration,subject
846,Value Investing and Stock Market Fundamentals,True,195,Beginner Level,3.0,Business Finance
2164,Learn the Violin - Scales and Fundamentals,True,80,Beginner Level,3.0,Musical Instruments
831,Forex Trading For Beginners,False,0,Beginner Level,2.0,Business Finance
353,How to Build a Massive Stock Portfolio from Zero!,True,200,All Levels,11.0,Business Finance
833,The Advanced Forex Course for Smart Traders,False,0,All Levels,5.0,Business Finance


In [14]:
df['is_paid'] = df['is_paid'].apply(lambda x: ['PaidCourse'] if x == True else ['FreeCourse'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['is_paid'] = df['is_paid'].apply(lambda x: ['PaidCourse'] if x == True else ['FreeCourse'])


In [15]:
df['price'].describe()


count    3678.000000
mean       66.049483
std        61.005755
min         0.000000
25%        20.000000
50%        45.000000
75%        95.000000
max       200.000000
Name: price, dtype: float64

In [17]:
def convert_price(price):
    if price == 0 :
        return ['']
    
    if price >= 180:
        return ['PriceVeryHigh']
    elif price >= 150:
        return ['PriceHigh']
    elif price >= 120:
        return ['PriceMedium']
    elif price >= 90:
        return ['PriceLow']
    elif price >= 60:
        return ['PriceLow']
    else:
        return ['PriceVeryLow']


In [18]:
df['price'] = df['price'].apply(convert_price)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price'] = df['price'].apply(convert_price)


In [19]:
df['level'].value_counts()


level
All Levels            1929
Beginner Level        1270
Intermediate Level     421
Expert Level            58
Name: count, dtype: int64

In [20]:
df['level'] = df['level'].apply(lambda x: [x.replace(' ', '')])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['level'] = df['level'].apply(lambda x: [x.replace(' ', '')])


In [21]:
df['content_duration'].unique()


array([ 1.5       , 39.        ,  2.5       ,  3.        ,  2.        ,
        1.        ,  5.        ,  7.        ,  4.        ,  0.58333333,
        4.5       ,  6.5       , 10.        ,  5.5       ,  7.5       ,
       11.5       , 16.        ,  3.5       ,  9.        ,  8.5       ,
       11.        ,  0.7       ,  9.5       ,  0.55      , 62.        ,
        0.13333333,  0.68333333, 31.5       , 20.        ,  6.        ,
       46.5       ,  0.71666667, 17.5       , 12.        ,  0.53333333,
       24.        , 19.5       , 20.5       ,  0.5       , 10.5       ,
        0.51666667,  0.6       ,  8.        , 29.        , 26.        ,
       24.5       , 13.        ,  0.73333333,  0.56666667, 12.5       ,
        0.31666667,  0.61666667,  0.66666667, 18.        , 33.        ,
       71.5       , 47.        , 14.5       ,  0.65      , 18.5       ,
       43.5       ,  0.63333333, 14.        ,  0.46666667, 45.        ,
       70.        ,  0.        , 29.5       , 13.5       ,  0.45

In [22]:
df['content_duration'].describe()


count    3678.000000
mean        4.094517
std         6.053840
min         0.000000
25%         1.000000
50%         2.000000
75%         4.500000
max        78.500000
Name: content_duration, dtype: float64

In [25]:
df = df[df['content_duration'] > 0]


In [26]:
df['content_duration'].describe()


count    3677.000000
mean        4.095630
std         6.054287
min         0.133333
25%         1.000000
50%         2.000000
75%         4.500000
max        78.500000
Name: content_duration, dtype: float64

In [27]:
def convert_duration(duration):
    if duration >= 70:
        return ['DurationVeryHigh']
    elif duration >= 50:
        return ['DurationHigh']
    elif duration >= 30:
        return ['DurationMedium']
    elif duration >= 20:
        return ['DurationLow']
    elif duration >= 10:
        return ['DurationLow']
    else:
        return ['DurationVeryLow']


In [28]:
df['content_duration'] = df['content_duration'].apply(convert_duration)


In [29]:
df['subject'] = df['subject'].apply(lambda x: [x.replace(' ', '')])


In [32]:
df.sample(10)


Unnamed: 0,course_title,is_paid,price,level,content_duration,subject
3524,Ultimate JavaScript Strings,[FreeCourse],[],[BeginnerLevel],[DurationVeryLow],[WebDevelopment]
3051,The Ultimate Guide to the Best WordPress Plugi...,[PaidCourse],[PriceVeryLow],[AllLevels],[DurationLow],[WebDevelopment]
624,Forex Trading : Power Of Moving Averages,[PaidCourse],[PriceVeryHigh],[AllLevels],[DurationVeryLow],[BusinessFinance]
414,How to trade options,[PaidCourse],[PriceVeryLow],[IntermediateLevel],[DurationVeryLow],[BusinessFinance]
3291,Learn PHP Fundamentals From Scratch,[FreeCourse],[],[BeginnerLevel],[DurationVeryLow],[WebDevelopment]
1470,Crea personajes fantásticos con Photoshop,[PaidCourse],[PriceVeryLow],[BeginnerLevel],[DurationVeryLow],[GraphicDesign]
720,"IAS 8-Accounting Policies ,Changes in Accounti...",[FreeCourse],[],[AllLevels],[DurationVeryLow],[BusinessFinance]
1010,Forex:Trade Management & Psychology,[PaidCourse],[PriceHigh],[IntermediateLevel],[DurationVeryLow],[BusinessFinance]
401,Quantitative Trading Analysis with R,[PaidCourse],[PriceVeryLow],[AllLevels],[DurationVeryLow],[BusinessFinance]
1996,How To Play Guitar - For Absolute Beginners,[PaidCourse],[PriceVeryLow],[BeginnerLevel],[DurationVeryLow],[MusicalInstruments]


In [33]:
df['keywords'] = df['course_title'].apply(lambda x: x.split(' '))


In [34]:
df.sample(10)


Unnamed: 0,course_title,is_paid,price,level,content_duration,subject,keywords
673,Matemática Financeira de um jeito fácil,[PaidCourse],[PriceLow],[AllLevels],[DurationVeryLow],[BusinessFinance],"[Matemática, Financeira, de, um, jeito, fácil]"
277,Mit Finanzwissen zu besserem Unternehmertum,[PaidCourse],[PriceVeryLow],[BeginnerLevel],[DurationVeryLow],[BusinessFinance],"[Mit, Finanzwissen, zu, besserem, Unternehmertum]"
2948,Learn Plugin Development in WordPress By Build...,[PaidCourse],[PriceLow],[AllLevels],[DurationLow],[WebDevelopment],"[Learn, Plugin, Development, in, WordPress, By..."
3664,XML DTD - Crash Course for Beginners,[PaidCourse],[PriceVeryLow],[AllLevels],[DurationVeryLow],[WebDevelopment],"[XML, DTD, -, Crash, Course, for, Beginners]"
3448,WordPress Plugin Tutorial,[PaidCourse],[PriceVeryLow],[BeginnerLevel],[DurationVeryLow],[WebDevelopment],"[WordPress, Plugin, Tutorial]"
2102,Advanced Trombone Studies - Learn to Master th...,[PaidCourse],[PriceVeryLow],[AllLevels],[DurationVeryLow],[MusicalInstruments],"[Advanced, Trombone, Studies, -, Learn, to, Ma..."
2620,Learn to use JSON,[PaidCourse],[PriceHigh],[BeginnerLevel],[DurationVeryLow],[WebDevelopment],"[Learn, to, use, JSON]"
7,"Trading Stock Chart Patterns For Immediate, Ex...",[PaidCourse],[PriceLow],[AllLevels],[DurationVeryLow],[BusinessFinance],"[Trading, Stock, Chart, Patterns, For, Immedia..."
622,Tradeonomics - Four Steps to Mastering Economi...,[PaidCourse],[PriceVeryLow],[AllLevels],[DurationVeryLow],[BusinessFinance],"[Tradeonomics, -, Four, Steps, to, Mastering, ..."
1224,How to Create Kindle & Ebook Covers with Canva,[PaidCourse],[PriceVeryLow],[BeginnerLevel],[DurationVeryLow],[GraphicDesign],"[How, to, Create, Kindle, &, Ebook, Covers, wi..."


In [35]:
df['tags'] = df['is_paid'] + df['price'] + df['level'] + df['content_duration'] + df['subject'] + df['keywords']
 

In [36]:
df['tags']


0       [PaidCourse, PriceVeryHigh, AllLevels, Duratio...
1       [PaidCourse, PriceLow, AllLevels, DurationMedi...
2       [PaidCourse, PriceVeryLow, IntermediateLevel, ...
3       [PaidCourse, PriceLow, AllLevels, DurationVery...
4       [PaidCourse, PriceVeryHigh, IntermediateLevel,...
                              ...                        
3673    [PaidCourse, PriceLow, AllLevels, DurationVery...
3674    [PaidCourse, PriceVeryLow, BeginnerLevel, Dura...
3675    [PaidCourse, PriceVeryLow, AllLevels, Duration...
3676    [PaidCourse, PriceVeryLow, AllLevels, Duration...
3677    [PaidCourse, PriceVeryLow, BeginnerLevel, Dura...
Name: tags, Length: 3677, dtype: object

In [38]:
df = df[['course_title', 'tags']]


In [39]:
df.sample(10)


Unnamed: 0,course_title,tags
441,"Build, Grow, and Protect Your Assets: A Step-b...","[PaidCourse, PriceVeryHigh, AllLevels, Duratio..."
1576,Character Concept Design for Beginners,"[PaidCourse, PriceHigh, AllLevels, DurationLow..."
3289,Practical PHP: Master the Basics and Code Dyna...,"[FreeCourse, , AllLevels, DurationVeryLow, Web..."
525,Cost Accounting Operating Costing(Professional...,"[PaidCourse, PriceVeryLow, AllLevels, Duration..."
2860,Java Spring Security,"[PaidCourse, PriceVeryLow, IntermediateLevel, ..."
1670,调色中级课程,"[PaidCourse, PriceVeryHigh, BeginnerLevel, Dur..."
3606,Object Orientation in PHP,"[PaidCourse, PriceVeryLow, BeginnerLevel, Dura..."
916,Socorro! Preciso Organizar as Minhas Finanças!,"[PaidCourse, PriceVeryLow, AllLevels, Duration..."
3420,SAP Crystal Reports with ASP.Net (Step by step),"[FreeCourse, , BeginnerLevel, DurationVeryLow,..."
609,Five steps to becoming a winner in stock inves...,"[PaidCourse, PriceVeryLow, AllLevels, Duration..."


In [40]:
df.to_csv('clean.csv', index=False)
