In [49]:
import base64
import json
import os
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [50]:
# Load the job/tag table exported by the StackOverflowJobs project and preview it.
# NOTE(review): presumably one row per job posting and one column per tag,
# with NaN where the tag is absent -- confirm against the producer notebook.
df_tags = pd.read_csv('../StackOverflowJobs/data/jobs_tags.csv')
df_tags.head()

Unnamed: 0,.net,.net-core,.net-framework,2d,3d,ab-initio,actionscript,actionscript-2,admin,adobe-xd,...,wordpress-rest-api,wpf,xamarin,xctest,xilinx,xml,xtend,yocto,zend-framework,zipline
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [51]:
# Remove the 'id' column so that only tag columns remain.
# errors='ignore' keeps this cell idempotent: it overwrites its own input, so
# without it a second run would raise KeyError because 'id' is already gone.
df_tags = df_tags.drop(columns=['id'], errors='ignore')

In [52]:
# Count the non-null entries of every column and rank the columns from the
# most to the least frequently filled.
contagem_por_tag = df_tags.count()
tags = contagem_por_tag.sort_values(ascending=False)
# Optional filter, currently disabled: tags = tags[tags > 1]

In [53]:
# API information: https://www.udemy.com/developers/instructor/

def fazer_requisicao(url, params=None, headers=None, filename_cache=None, timeout=30):
    """Fetch ``url`` with an HTTP GET, caching the response body on disk.

    Parameters
    ----------
    url : str
        Address to request.
    params : dict, optional
        Query-string parameters forwarded to ``requests.get``.
    headers : dict, optional
        HTTP headers forwarded to ``requests.get``.
    filename_cache : str, optional
        Path of the cache file. When omitted it is derived from ``url`` by
        removing every ``/`` and lower-casing, under the ``cache/`` directory.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    tuple of (str, bool)
        The response text and a flag that is ``True`` when the text was read
        from the cache file instead of the network.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (``raise_for_status``).
    """
    if not filename_cache:
        filename_cache = 'cache/%s'%(url.replace('/', '').lower())

    # Cache hit: return the stored body without touching the network.
    # The encoding is pinned to UTF-8 so reads and writes agree regardless of
    # the platform default. NOTE: cache files written by an older version of
    # this function on a non-UTF-8 platform may need to be regenerated.
    if os.path.exists(filename_cache):
        with open(filename_cache, 'r', encoding='utf-8') as f:
            return f.read(), True

    res = requests.get(url, params=params, headers=headers, timeout=timeout)
    res.raise_for_status()

    # Create the cache directory on first use; previously a missing 'cache/'
    # folder made the write below fail after the request had already been made.
    cache_dir = os.path.dirname(filename_cache)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

    with open(filename_cache, 'w', encoding='utf-8') as f:
        f.write(res.text)

    return res.text, False

def obter_cursos(search, n_cursos):
    """Return the raw response text of a Udemy course search.

    ``search`` is the search term and ``n_cursos`` is forwarded as the page
    size of the request. The response is cached on disk under ``cache/`` in a
    file named after ``search`` (slashes removed, lower-cased); the cache key
    does not include ``n_cursos``, so an existing cache file is reused
    whatever page size is requested.
    """
    caminho_cache = 'cache/%s'%(search.replace('/', '').lower())
    corpo, _ = fazer_requisicao(
        'https://www.udemy.com/api-2.0/courses',
        obter_params_api(search, n_cursos),
        obter_headers_api(),
        caminho_cache,
    )
    return corpo


def obter_params_api(search, page_size):
    """Build the query parameters for a course search request.

    The request always asks for page 1 with ``page_size`` results matching
    ``search``; the remaining filters (category, language, ordering, ratings)
    are fixed values.
    """
    params = dict(page=1, page_size=page_size, search=search)
    params.update(
        category="Development",
        language="en",
        ordering="most-reviewed",
        ratings=3.0,
    )
    return params


def obter_headers_api():
    """Build the HTTP headers used for Udemy API requests.

    The Basic-auth credentials are read from the ``UDEMY_CLIENT_ID`` and
    ``UDEMY_CLIENT_SECRET`` environment variables instead of being committed
    in the notebook.

    NOTE(review): the token that used to be hard-coded here has been exposed
    in the file history and should be revoked and regenerated.

    Returns
    -------
    dict
        The ``Accept``, ``Authorization`` and ``Content-Type`` headers.

    Raises
    ------
    RuntimeError
        If either environment variable is missing or empty.
    """
    client_id = os.environ.get('UDEMY_CLIENT_ID')
    client_secret = os.environ.get('UDEMY_CLIENT_SECRET')
    if not client_id or not client_secret:
        raise RuntimeError(
            'Set the UDEMY_CLIENT_ID and UDEMY_CLIENT_SECRET environment '
            'variables before calling the Udemy API.'
        )

    # HTTP Basic authentication: base64("<client_id>:<client_secret>").
    credenciais = '{}:{}'.format(client_id, client_secret).encode('utf-8')
    token = base64.b64encode(credenciais).decode('ascii')

    return {
        "Accept": "application/json, text/plain, */*",
        "Authorization": "Basic " + token,
        "Content-Type": "application/json;charset=utf-8"
    }
    

### Obtém os cursos pela API da Udemy

In [54]:
# Collect the course records returned by the API for the 20 highest-ranked tags.
lista_cursos = []
for tag in tags.index[:20]:
    # Tags used fewer than 20 times only get 10 courses; the others get 30.
    page_size = 10 if tags.loc[tag] < 20 else 30

    # Fix: pass the computed page_size; it used to be ignored in favour of a
    # hard-coded 30. obter_cursos caches by tag only, so a response that is
    # already cached is reused as-is.
    cursos_text = obter_cursos(tag, page_size)
    cursos = json.loads(cursos_text)['results']
    lista_cursos.extend(cursos)

### Cria um dataframe com as informações a serem utilizadas

In [75]:
# Build the course table from the accumulated API records, keeping only the
# identifier, the three thumbnail fields, the title and the URL.
colunas_usadas = ['id', 'image_125_H', 'image_240x135', 'image_480x270', 'title', 'url']
df_cursos = pd.DataFrame(lista_cursos)[colunas_usadas]

In [76]:
# Preview the first rows of the selected course columns.
df_cursos.head()

Unnamed: 0,id,image_125_H,image_240x135,image_480x270,title,url
0,947570,https://udemy-images.udemy.com/course/125_H/94...,https://udemy-images.udemy.com/course/240x135/...,https://udemy-images.udemy.com/course/480x270/...,Java Applications: Building Apps with Java,/building-apps-with-java/
1,1189214,https://udemy-images.udemy.com/course/125_H/11...,https://udemy-images.udemy.com/course/240x135/...,https://udemy-images.udemy.com/course/480x270/...,Introduction to Java EE,/java-ee-fundamentals/
2,1132484,https://udemy-images.udemy.com/course/125_H/11...,https://udemy-images.udemy.com/course/240x135/...,https://udemy-images.udemy.com/course/480x270/...,Java for Beginners,/java-8-for-complete-beginners/
3,991254,https://udemy-images.udemy.com/course/125_H/99...,https://udemy-images.udemy.com/course/240x135/...,https://udemy-images.udemy.com/course/480x270/...,Programming in JAVA,/java-by-sagar/
4,645792,https://udemy-images.udemy.com/course/125_H/64...,https://udemy-images.udemy.com/course/240x135/...,https://udemy-images.udemy.com/course/480x270/...,Java Programming for Beginners: Become a Java ...,/introduction-to-java-programming-b/


In [77]:
# Turn the site-relative course paths into absolute URLs.
# Values that already start with the host are left untouched, so re-running
# this cell does not prepend 'https://www.udemy.com' a second time.
df_cursos['url'] = df_cursos['url'].apply(
    lambda x: x if str(x).startswith('https://www.udemy.com')
    else 'https://www.udemy.com{url}'.format(url=x)
)

### Obtém a descrição do curso através do Beautiful Soup

In [85]:
# Browser-like User-Agent sent when downloading course pages.
h_client = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
def obter_descricao_curso(row, espera=6):
    """Return the description text of the course in ``row``.

    Parameters
    ----------
    row : pandas.Series
        A course record with at least a ``url`` entry. If it already carries
        a non-empty string under ``description``, that value is returned and
        no page is downloaded.
    espera : float, optional
        Seconds to sleep after a real (non-cached) download, to throttle
        requests to the site. Defaults to 6.

    Returns
    -------
    str
        The texts of the ``what-you-get__text`` elements followed by the
        paragraphs of the ``collapse-description-text`` block, joined by
        single spaces. Empty when the page has neither.
    """
    # Only a real, non-empty string counts as an existing description; a NaN
    # placeholder is truthy and used to be returned as if it were text.
    descricao = row['description'] if 'description' in row.index else None
    if isinstance(descricao, str) and descricao:
        return descricao

    text, from_cache = fazer_requisicao(row['url'], headers=h_client)

    # Throttle only when the page actually came from the network.
    if not from_cache:
        sleep(espera)

    soup = BeautifulSoup(text, 'lxml')

    # Match by attribute only (no tag name) instead of passing an empty string.
    descricoes = [e.text for e in soup.find_all(class_='what-you-get__text')]

    # The description block may be missing from the page; find() then returns
    # None, which used to raise AttributeError and abort the whole apply().
    bloco = soup.find(attrs={'data-purpose': 'collapse-description-text'})
    if bloco is not None:
        descricoes += [p.text for p in bloco.find_all('p')]

    return ' '.join(descricoes)

In [90]:
# Fill the 'description' column row by row. Each row without a cached page
# triggers an HTTP request followed by a pause (see obter_descricao_curso),
# so this cell can take a long time on a cold cache.
df_cursos['description'] = df_cursos.apply(obter_descricao_curso, axis=1)

In [65]:
# Dimensions (rows, columns) of the course table.
df_cursos.shape

(571, 7)

In [67]:
# Persist the course table. The output directory is created first so that
# the write does not fail when 'data/' does not exist yet.
os.makedirs('data', exist_ok=True)
df_cursos.to_csv('data/cursos_udemy.csv', index=False)