## Using Scattertext to Analyze PyData Talks
Let's pull titles abstracts and descriptions of PyData talks to see how novice-level talks differed from intermediate and advanced talks.

Please check out Scattertext on Github: https://github.com/JasonKessler/scattertext

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re, time
import pygal
import scattertext as st
from IPython.display import IFrame
from IPython.core.display import display, HTML
import seaborn as sns
display(HTML("<style>.container { width:98% !important; }</style>"))
import spacy
import scattertext as st
%matplotlib inline

## First, let's scrape pydata.org

In [2]:
def parse_talk(url):
    d = {}
    try:
        soup = BeautifulSoup(requests.get(url).text, 'lxml')
        content = soup.find_all('div', class_='container')[1]
        d['author'] = content.find_all('a')[0].contents[0]
        d['title'] = content.find_all('h2')[0].contents[0]
        d['level'] = content.find_all('dd')[0].contents[0] 
        d['description'] = soup.find_all('div', class_='description')[0].get_text()
        d['abstract'] = soup.find_all('div', class_='abstract')[0].get_text()
    except:
        print('bad', url)
        return None
    
    return d

def pull_pydata_schedule(loc, year):
    url = 'https://pydata.org/'+loc+str(year)+'/schedule/'    
    soup = BeautifulSoup(requests.get(url).text, 'lxml')
    content = soup.find_all('div', class_='container')[1]
    talks = []
    for slot in content.find_all('td', class_='slot'):
        for link in slot.find_all('a'):   
            d = parse_talk('https://pydata.org'+link.attrs['href'])
            if d is not None:
                d['location'] = loc
                d['year'] = str(year)
                talks.append(d)
    time.sleep(5) # for politeness
    print(loc, year)
    return pd.DataFrame(talks)

In [3]:
sched = pd.concat([pull_pydata_schedule('seattle', 2017),
                   pull_pydata_schedule('london', 2017),
                   pull_pydata_schedule('barcelona', 2017),                   
                   pull_pydata_schedule('berlin', 2017),        
                   pull_pydata_schedule('dc', 2016),
                   pull_pydata_schedule('carolinas', 2016),
                   pull_pydata_schedule('chicago', 2016),                   
                   pull_pydata_schedule('sfo', 2016),                       
                   pull_pydata_schedule('paris', 2016),                                                         
                   pull_pydata_schedule('berlin', 2016),                   
                   pull_pydata_schedule('london', 2016)])

seattle 2017
london 2017
bad https://pydata.orghttps://pydata.org/barcelona2017/schedule/presentation/42/
bad https://pydata.orghttps://pydata.org/barcelona2017/schedule/presentation/33/
bad https://pydata.orghttps://pydata.org/barcelona2017/schedule/presentation/34/
bad https://pydata.orghttps://pydata.org/barcelona2017/schedule/presentation/52/
barcelona 2017
bad https://pydata.org/berlin2017/program/breakfast-and-lunch-program/
bad https://pydata.org/berlin2017/keynote-speakers/
bad https://pydata.org/berlin2017/program/breakfast-and-lunch-program/
bad https://pydata.org/berlin2017/keynote-speakers#veronica-valeros
bad https://pydata.org/berlin2017/program/breakfast-and-lunch-program/
bad https://pydata.org/berlin2017/keynote-speakers#toby-walsh
bad https://pydata.org/berlin2017/program/breakfast-and-lunch-program/
bad https://pydata.org/berlin2017/keynote-speakers/#ethical-machine-learning-panel
berlin 2017
dc 2016
carolinas 2016
chicago 2016
sfo 2016
paris 2016
berlin 2016
london 

In [23]:
sched.to_csv('pydata_talks.csv', index=False)

In [3]:
sched = pd.read_csv('pydata_talks.csv')

In [4]:
nlp = spacy.load('en')

In [5]:
sched = sched[~sched['title'].isin(['BoF', 'Unconference Presentation'])]

In [6]:
sched['is_novice'] = (sched.level == 'Novice').apply(lambda x: 'Novice' if x else 'Not Novice')

In [7]:
sched['parse'] = (sched['title'] + '\n \n' + sched['abstract'].fillna('') + '\n \n' + sched['description'].fillna('')).apply(nlp)

In [8]:
sched = sched.loc[sched['title'].drop_duplicates().index]

## Let's see how descriptions of novice-directed talks sound compared to directed at more seasoned audiences

In [12]:
html = st.produce_scattertext_explorer(st.CorpusFromParsedDocuments(sched, category_col = 'is_novice', parsed_col = 'parse').build(),
                                       category='Novice',
                                       category_name='Novice',
                                       not_category_name='Intermediate or Advanced',
                                       minimum_term_frequency=8,
                                       pmi_threshold_coefficient=10,
                                       width_in_pixels=1000,
                                       term_ranker=st.OncePerDocFrequencyRanker,
                                       use_full_doc=True,
                                       metadata=sched['author'] + ' ('+sched['location'] + ', '+ sched['level'] + ')',)
file_name = 'output/PydataNoviceVsNotNovice.html'
open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width = 1200, height=700)

## Let's see how the experiened talk descriptions sound

In [23]:
sched['is_advanced'] = (sched.level == 'Experienced').apply(lambda x: 'Experienced' if x else 'Not Experienced')
html = st.produce_scattertext_explorer(st.CorpusFromParsedDocuments(sched, category_col = 'is_advanced', parsed_col = 'parse').build(),
                                       category='Experienced',
                                       category_name='Experienced',
                                       not_category_name='Not Experienced',
                                       minimum_term_frequency=8,
                                       pmi_filter_thresold=8,                                       
                                       width_in_pixels=1000,
                                       term_ranker=st.OncePerDocFrequencyRanker,
                                       use_full_doc=True,
                                       metadata=sched['author'] + ' ('+sched['location'] + ', '+ sched['level'] + ')',)
file_name = 'output/PydataAdvancedVsRest.html'
open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width = 1200, height=700)