In [5]:
from bs4 import BeautifulSoup
from requests import get
import pandas as pd, matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline

In [6]:
# HREF SCRAPER
def get_urls(n_pages, section):
  urls = []
  for page in tqdm(range(n_pages+1)):
    try:
      soup = BeautifulSoup(get(f'https://www.dailytexanonline.com/section/{section}?page={page}').text,'lxml')
      for i in soup.findAll('h2', {'class':'story-title'}):
        urls.append(i.find('a')['href'])
    except AttributeError:
      print(page)
  return urls

In [7]:
# ARTICLE SCRAPER
def get_articles(urls):
  articles = {}
  for h in tqdm(urls):
    try:
      soup = BeautifulSoup(get('https://www.dailytexanonline.com'+h).text,'lxml')
      auth = soup.find('div',{'id':'author-name'})
      articles[h] = dict(
        text = soup.find('div',{'id':'article-body'}).text,
        author = auth.text,
        author_link = auth.find('a')['href'],
        date = soup.find('time',{'id':'article-published'}).text,
        title = soup.find('h1',{'id':'page-title'}).text,
      )
    except:
      print(h)
  articles = pd.DataFrame.from_dict(articles, orient='index')
  articles.date = pd.to_datetime(articles.date.str.lstrip('Published on '), format='%B %d, %Y at %I:%M %p')
  articles.text = articles.text.str.replace('\\xa0',' ').str.replace('\\n',' ')
  articles.author = articles.author.str.lstrip('By ')
  articles.title = articles.title.str.strip()
  return articles

In [8]:
get_articles(get_urls(395,'opinion')).to_pickle('data/opinion.p')

100%|██████████| 396/396 [04:46<00:00,  1.38it/s]
  2%|▏         | 79/3960 [00:37<30:57,  2.09it/s]

/2018/02/05/you-ask-we-answer-how-to-talk-to-guys


 28%|██▊       | 1112/3960 [09:32<24:26,  1.94it/s]

/2016/08/01/fifty-years-later-1966-firing-lines


 37%|███▋      | 1446/3960 [12:24<21:34,  1.94it/s]

/2016/01/25/interfaith-dialogue-necessary-to-make-campus-a-safe-space


 40%|████      | 1589/3960 [13:39<20:22,  1.94it/s]

/2015/10/18/hispanic-heritage-month-building-a-brighter-future


 41%|████      | 1622/3960 [13:55<20:04,  1.94it/s]

/2015/10/07/campus-carry-will-not-protect-women


 41%|████      | 1623/3960 [13:56<20:04,  1.94it/s]

/2015/10/07/campus-carry-poses-risks-to-safety-educational-freedom


 41%|████      | 1624/3960 [13:56<20:03,  1.94it/s]

/2015/10/07/no-reason-to-fear-campus-carry-law-abiding-gunowners


 42%|████▏     | 1645/3960 [14:08<19:54,  1.94it/s]

/2015/09/30/forum-fostering-sense-of-home-for-refugees


 42%|████▏     | 1646/3960 [14:09<19:54,  1.94it/s]

/2015/09/30/forum-stop-calling-it-a-refugee-crisis


 42%|████▏     | 1647/3960 [14:09<19:53,  1.94it/s]

/2015/09/30/forum-easing-learning-tensions-through-language-tutoring-programs


 42%|████▏     | 1660/3960 [14:16<19:46,  1.94it/s]

/2015/09/27/speedway-of-golden-bricks


 42%|████▏     | 1667/3960 [14:19<19:42,  1.94it/s]

/2015/09/23/forum-sure-walk-taking-steps-forward


 42%|████▏     | 1668/3960 [14:20<19:42,  1.94it/s]

/2015/09/23/forum-campus-safety-is-a-community-responsibility


 42%|████▏     | 1670/3960 [14:21<19:41,  1.94it/s]

/2015/09/23/forum-protecting-those-who-will-change-the-world


 43%|████▎     | 1688/3960 [14:30<19:32,  1.94it/s]

/2015/09/17/forum-protect-student-voices-in-theatre-and-the-arts


 43%|████▎     | 1690/3960 [14:31<19:31,  1.94it/s]

/2015/09/17/forum-the-value-of-performance-work-reaches-far


 43%|████▎     | 1692/3960 [14:32<19:29,  1.94it/s]

/2015/09/17/forum-student-theatre-fosters-community-collaboration-and-growth


 43%|████▎     | 1693/3960 [14:33<19:29,  1.94it/s]

/2015/09/17/forum-every-ounce-of-effort-is-worth-it


 43%|████▎     | 1707/3960 [14:40<19:21,  1.94it/s]

/2015/09/11/students-must-engage-with-alcohol-education-programs-while-institutions-must-constantly


 43%|████▎     | 1711/3960 [14:42<19:20,  1.94it/s]

/2015/09/11/conscious-spending-protects-fair-labor-and-ethics


 43%|████▎     | 1714/3960 [14:44<19:18,  1.94it/s]

/2015/09/10/forum-those-who-came-before-me-the-legacies-within-me


 43%|████▎     | 1715/3960 [14:44<19:18,  1.94it/s]

/2015/09/10/forum-connect-with-campus-resources


 43%|████▎     | 1717/3960 [14:46<19:17,  1.94it/s]

/2015/09/09/forum-be-mindful-of-issues-find-ways-to-be-supportive


 43%|████▎     | 1718/3960 [14:46<19:16,  1.94it/s]

/2015/09/09/forum-change-is-happening


 43%|████▎     | 1720/3960 [14:47<19:15,  1.94it/s]

/2015/09/09/the-ugly-side-of-the-beautiful-game


 43%|████▎     | 1721/3960 [14:47<19:15,  1.94it/s]

/2015/09/09/the-perks-of-being-an-angry-black-woman


 44%|████▎     | 1724/3960 [14:50<19:14,  1.94it/s]

/2015/09/09/google-self-drive-technology-is-the-future-for-ut-commuters


 44%|████▎     | 1727/3960 [14:51<19:12,  1.94it/s]

/2015/09/07/austin-is-not-silicon-valley-%E2%80%94-but-with-time-it-could-be


 44%|████▍     | 1739/3960 [14:58<19:08,  1.93it/s]

/2015/09/02/students-should-find-ways-to-help-the-homeless-community-while-exercising-caution


 45%|████▍     | 1769/3960 [15:15<18:53,  1.93it/s]

/2015/08/07/letter-to-the-editor-statues-must-be-taken-into-historical-context-task-force-must


 45%|████▌     | 1798/3960 [15:31<18:39,  1.93it/s]

/2015/07/10/letter-to-the-editor-confederate-statues-an-affront-to-inclusivity-tolerance-and


 46%|████▌     | 1802/3960 [15:34<18:39,  1.93it/s]

/2015/07/08/other-states-research-demonstrate-safety-of-campus-carry-despite-detractors-doubts


 46%|████▌     | 1815/3960 [15:41<18:32,  1.93it/s]

/2015/06/26/a-need-for-cultural-education


 47%|████▋     | 1871/3960 [16:10<18:03,  1.93it/s]

/2015/05/03/texan-article-gives-attention-to-volunteers-of-austin-state-supported-living-center


 48%|████▊     | 1890/3960 [16:19<17:52,  1.93it/s]

/2015/04/26/dont-steal-uberlyft-rides-at-night


 48%|████▊     | 1894/3960 [16:21<17:50,  1.93it/s]

/2015/04/23/congratulations-to-texas-quidditch


 48%|████▊     | 1895/3960 [16:21<17:50,  1.93it/s]

/2015/04/22/spiderhouse-mural-concerns-reader


 48%|████▊     | 1902/3960 [16:25<17:46,  1.93it/s]

/2015/04/20/sg-student-body-should-debate-boycotts


 48%|████▊     | 1907/3960 [16:28<17:44,  1.93it/s]

/2015/04/16/student-leader-avoids-issue-of-palestinian-human-rights


 48%|████▊     | 1914/3960 [16:31<17:40,  1.93it/s]

/2015/04/15/bds-legislation-would-isolate-uts-jewish-community


 49%|████▊     | 1926/3960 [16:37<17:33,  1.93it/s]

/2015/04/09/affirmative-action-uts-admissions-system-are-righting-past-wrongs


 49%|████▊     | 1929/3960 [16:39<17:32,  1.93it/s]

/2015/04/08/university-ranking-is-oversimplified


 49%|████▉     | 1933/3960 [16:41<17:30,  1.93it/s]

/2015/04/07/recent-attack-on-affirmative-action-top-10-percent-rule-misguided


 49%|████▉     | 1940/3960 [16:45<17:26,  1.93it/s]

/2015/04/02/texas-tennis-does-its-job-university-responsible-for-lack-of-courts


 49%|████▉     | 1941/3960 [16:45<17:25,  1.93it/s]

/2015/04/02/black-and-latino-studies-students-left-out-of-building-name-change


 49%|████▉     | 1947/3960 [16:48<17:22,  1.93it/s]

/2015/03/31/texas-tennis-fan-got-it-right


 49%|████▉     | 1950/3960 [16:50<17:21,  1.93it/s]

/2015/03/30/texas-tennis-deserves-proper-home


 50%|█████     | 1991/3960 [17:10<16:59,  1.93it/s]

/2015/03/05/fiji-party-exemplifies-limited-views-of-oppression


 50%|█████     | 1994/3960 [17:12<16:57,  1.93it/s]

/2015/03/03/kroll-report-means-prospective-ut-students-arent-getting-a-fair-chance


 52%|█████▏    | 2043/3960 [17:37<16:31,  1.93it/s]

/2015/02/11/reader-warns-chick-fil-a-customers-you-could-be-overpaying-for-bagels


 56%|█████▌    | 2201/3960 [18:57<15:09,  1.93it/s]

/2014/10/24/despite-social-media-concerns-platforms-can-still-enrich-lives


 56%|█████▌    | 2210/3960 [19:02<15:04,  1.93it/s]

/2014/10/22/district-9-endorsement-overlooks-kathie-tovo


 56%|█████▋    | 2229/3960 [19:12<14:55,  1.93it/s]

/2014/10/13/world-mental-health-day-helps-bring-more-attention-to-mental-illness


 56%|█████▋    | 2230/3960 [19:13<14:54,  1.93it/s]

/2014/10/13/dont-mischaracterize-cola


 57%|█████▋    | 2239/3960 [19:17<14:49,  1.93it/s]

/2014/10/08/austin-rail-in-context-of-city%E2%80%99s-growth


 57%|█████▋    | 2243/3960 [19:19<14:47,  1.93it/s]

/2014/10/07/mens-athletics-director-says-athletic-ticket-sales-increased-from-last-year


 57%|█████▋    | 2253/3960 [19:24<14:42,  1.93it/s]

/2014/10/02/article-fails-to-mention-that-tower-sniper-was-also-a-victim


 57%|█████▋    | 2257/3960 [19:26<14:40,  1.93it/s]

/2014/10/01/column-on-economy-lacks-logic-research


 57%|█████▋    | 2266/3960 [19:31<14:35,  1.93it/s]

/2014/09/29/article-on-ferguson-to-palestine-panel-omits-crucial-viewpoints


 57%|█████▋    | 2268/3960 [19:32<14:34,  1.93it/s]

/2014/09/26/reader-commends-family-for-donation-to-stuttering-institute


 60%|██████    | 2385/3960 [20:32<13:33,  1.94it/s]

/2014/06/29/top-online-comments


 62%|██████▏   | 2436/3960 [21:00<13:08,  1.93it/s]

/opinion/2014/04/27/firing-line-scrutiny-of-powers-undeserved


 62%|██████▏   | 2437/3960 [21:01<13:08,  1.93it/s]

/opinion/2014/04/24/we-asked-why-did-you-choose-to-come-to-ut


 62%|██████▏   | 2439/3960 [21:02<13:07,  1.93it/s]

/opinion/2014/04/24/friday-firing-lines-five-star-recruit-myles-turner


 62%|██████▏   | 2452/3960 [21:08<13:00,  1.93it/s]

/opinion/2014/04/17/quotes-to-note-patrick-castro-face-off-hall-drama


 62%|██████▏   | 2453/3960 [21:09<12:59,  1.93it/s]

/opinion/2014/04/17/friday-firing-lines-tasers-regent-wallace-hall-cns


 62%|██████▏   | 2460/3960 [21:12<12:56,  1.93it/s]

/opinion/2014/04/15/in-fight-over-ut-regent-wallace-hall-students-were-forgotten


 62%|██████▏   | 2468/3960 [21:16<12:51,  1.93it/s]

/opinion/2014/04/10/quotes-to-note-from-civil-rights-summit-speakers


 62%|██████▏   | 2469/3960 [21:17<12:51,  1.93it/s]

/opinion/2014/04/10/friday-firing-lines-information-technology-yik-yak-equal-rights


 62%|██████▏   | 2473/3960 [21:19<12:49,  1.93it/s]

/opinion/2014/04/08/firing-line-for-april-9-charles-murray-and-civil-rights-summit


 62%|██████▏   | 2474/3960 [21:20<12:48,  1.93it/s]

/opinion/2014/04/08/horns-up-for-april-9-civil-rights-summit-adding-disability-panel


 63%|██████▎   | 2483/3960 [21:24<12:44,  1.93it/s]

/opinion/2014/04/03/friday-firing-lines-abbott-hall-student-fruit-choices-the-tower


 63%|██████▎   | 2501/3960 [21:33<12:34,  1.93it/s]

/opinion/2014/03/27/friday-firing-lines-for-week-of-march-24-tuition-hikes-and-shared-services


 64%|██████▍   | 2530/3960 [21:48<12:19,  1.93it/s]

/opinion/2014/03/06/friday-firing-lines-for-week-of-mar-1-amnesty-economics-of-sex-judge-orlando


 64%|██████▍   | 2545/3960 [21:55<12:11,  1.93it/s]

/opinion/2014/02/27/friday-firing-lines-for-week-of-feb-24-jay-walking-and-austin-police-department


 65%|██████▍   | 2558/3960 [22:02<12:05,  1.93it/s]

/opinion/2014/02/20/friday-firing-lines-for-week-of-feb-17-memorial-museum-funding-and-toilet-paper


 66%|██████▌   | 2605/3960 [22:26<11:40,  1.93it/s]

/opinion/2014/01/30/firing-lines-sherlock-spoilers-pension-plans-and-budget-cuts


 68%|██████▊   | 2697/3960 [23:13<10:52,  1.93it/s]

/opinion/2013/11/08/friday-firing-lines-for-nov8-rape-prevention-and-abortion-availability


 69%|██████▉   | 2748/3960 [23:41<10:27,  1.93it/s]

/opinion/2013/10/17/quotes-to-note-for-the-week-of-oct14


 70%|███████   | 2772/3960 [23:53<10:14,  1.93it/s]

/opinion/2013/10/08/intro-chemistry-class-shows-benefits-of-peer-mentoring-online-content


 72%|███████▏  | 2832/3960 [24:23<09:43,  1.93it/s]

/opinion/2013/09/16/entrepreneurship-is-good-but-responsible-entrepreneurship-is-better


 72%|███████▏  | 2853/3960 [24:34<09:32,  1.94it/s]

/opinion/2013/09/08/horns-up-horns-down-for-sept-9-gerrymandering-football-foibles-and-running-from


 72%|███████▏  | 2854/3960 [24:34<09:31,  1.93it/s]

/opinion/2013/09/05/the-friday-firing-lines-balloons-water-and-syria


 72%|███████▏  | 2856/3960 [24:35<09:30,  1.93it/s]

/opinion/2013/09/05/capital-metro-should-not-eliminate-ut-shuttle-routes


 72%|███████▏  | 2859/3960 [24:37<09:28,  1.94it/s]

/opinion/2013/09/05/horns-up-horns-down-for-sept-5-same-sex-benefits-concealed-carry-the-rain-and-ted


 72%|███████▏  | 2860/3960 [24:37<09:28,  1.94it/s]

/opinion/2013/09/05/in-higher-ed-debate-pitts-recommendation-letter-not-worth-consideration


 72%|███████▏  | 2861/3960 [24:38<09:27,  1.94it/s]

/opinion/2013/09/04/why-you-should-write-for-the-daily-texan-opinion-department


 74%|███████▍  | 2925/3960 [25:11<08:54,  1.94it/s]

/opinion/2013/05/01/what-sacrifice-means


 74%|███████▍  | 2930/3960 [25:13<08:52,  1.94it/s]

/opinion/2013/04/29/learn-from-lehmberg


 75%|███████▌  | 2970/3960 [25:33<08:31,  1.94it/s]

/opinion/2013/04/11/we-asked-oil-money


 75%|███████▌  | 2984/3960 [25:40<08:23,  1.94it/s]

/opinion/2013/04/05/gun-bans-dont-ban-guns


 75%|███████▌  | 2985/3960 [25:40<08:23,  1.94it/s]

/opinion/2013/04/05/we-asked-does-drought-matter


 77%|███████▋  | 3038/3960 [26:09<07:56,  1.94it/s]

/2013/03/04/the-daily-texan-should-not-be-a-platform-for-hate


 77%|███████▋  | 3051/3960 [26:15<07:49,  1.94it/s]

/opinion/2013/02/26/gov-perry-dont-condone-texting-while-driving


 78%|███████▊  | 3077/3960 [26:28<07:35,  1.94it/s]

/opinion/2013/02/14/childish-adoration


 79%|███████▊  | 3109/3960 [26:44<07:19,  1.94it/s]

/opinion/2013/02/03/rewriting-history


 79%|███████▊  | 3111/3960 [26:45<07:18,  1.94it/s]

/opinion/2013/01/31/powers%E2%80%99-contradictory-vision


 79%|███████▊  | 3112/3960 [26:46<07:17,  1.94it/s]

/opinion/2013/01/31/ut-tracks-veterans


 79%|███████▊  | 3118/3960 [26:49<07:14,  1.94it/s]

/opinion/2013/01/29/smarter-business-practices


 79%|███████▉  | 3119/3960 [26:49<07:13,  1.94it/s]

/opinion/2013/01/29/pointless-attention-to-the-nas


 80%|███████▉  | 3160/3960 [27:10<06:52,  1.94it/s]

/opinion/2012/12/03/distinguish-mormons-from-polygamists


 80%|████████  | 3171/3960 [27:16<06:47,  1.94it/s]

/opinion/2012/11/29/greed-stupidity-or-a-clever-marketing-ploy


 80%|████████  | 3172/3960 [27:16<06:46,  1.94it/s]

/opinion/2012/11/29/reject-regnerus


 82%|████████▏ | 3231/3960 [27:45<06:15,  1.94it/s]

/opinion/2012/10/31/stop-pushing-prop-1


 82%|████████▏ | 3254/3960 [27:56<06:03,  1.94it/s]

/opinion/2012/10/21/what-to-watch-october-22-26


 88%|████████▊ | 3478/3960 [29:51<04:08,  1.94it/s]

/opinion/2012/04/01/quotes-to-note-tejano-monument-soap-ban-and-more


 88%|████████▊ | 3491/3960 [29:57<04:01,  1.94it/s]

/opinion/2012/03/28/continued-discussion-of-tuesdays-editorial-cartoon


 88%|████████▊ | 3502/3960 [30:03<03:55,  1.94it/s]

/opinion/2012/03/22/the-firing-line-election-season


 92%|█████████▏| 3642/3960 [31:15<02:43,  1.94it/s]

/opinion/2011/12/01/take-preventative-measures-against-hiv


 92%|█████████▏| 3643/3960 [31:15<02:43,  1.94it/s]

/opinion/2011/12/01/time-to-start-over


 92%|█████████▏| 3647/3960 [31:17<02:41,  1.94it/s]

/opinion/2011/11/29/may-we-meet-again-soon-ut


 92%|█████████▏| 3652/3960 [31:20<02:38,  1.94it/s]

/2011/11/28/occupy-ut-exists


 93%|█████████▎| 3664/3960 [31:26<02:32,  1.94it/s]

/opinion/2011/11/21/top-10-bathroom-story-overlooks-larger-issue


 93%|█████████▎| 3665/3960 [31:27<02:31,  1.94it/s]

/opinion/2011/11/21/thank-you-longhorns


 93%|█████████▎| 3668/3960 [31:28<02:30,  1.94it/s]

/opinion/2011/11/20/quotes-to-note-addressing-higher-education-governance


 93%|█████████▎| 3671/3960 [31:30<02:28,  1.94it/s]

/opinion/2011/11/17/the-firing-line-valuing-the-arts


 93%|█████████▎| 3695/3960 [31:42<02:16,  1.94it/s]

/2011/11/08/quotes-to-note-registration-woes-inspiration-myedu


 93%|█████████▎| 3696/3960 [31:42<02:15,  1.94it/s]

/2011/11/08/discrimination-column-treads-too-much-on-political-grounds


 93%|█████████▎| 3700/3960 [31:44<02:13,  1.94it/s]

/opinion/2011/11/06/failing-to-see-the-big-picture-in-the-myedu-debate


 94%|█████████▎| 3708/3960 [31:49<02:09,  1.94it/s]

/opinion/2011/11/02/show-your-longhorn-pride


 94%|█████████▎| 3709/3960 [31:49<02:09,  1.94it/s]

/2011/11/02/study-abroad-column-drew-inaccurate-conclusions


 94%|█████████▍| 3732/3960 [32:03<01:57,  1.94it/s]

/opinion/2011/10/24/leaving-no-stone-unturned


 94%|█████████▍| 3740/3960 [32:07<01:53,  1.94it/s]

/opinion/2011/10/19/front-page-gem


 94%|█████████▍| 3742/3960 [32:08<01:52,  1.94it/s]

/opinion/2011/10/19/defending-the-occupy-message


 95%|█████████▌| 3763/3960 [32:19<01:41,  1.94it/s]

/2011/10/12/a-specious-argument


 95%|█████████▌| 3771/3960 [32:23<01:37,  1.94it/s]

/opinion/2011/10/09/single-gender-schools-prove-beneficial


 95%|█████████▌| 3772/3960 [32:23<01:36,  1.94it/s]

/opinion/2011/10/09/the-role-of-a-newspaper


 95%|█████████▌| 3779/3960 [32:27<01:33,  1.94it/s]

/2011/10/03/simply-embarrassing


 95%|█████████▌| 3781/3960 [32:28<01:32,  1.94it/s]

/opinion/2011/10/03/standardized-curiosity


 96%|█████████▌| 3786/3960 [32:31<01:29,  1.94it/s]

/opinion/2011/09/29/gilberts-jersey-article-causes-ire-in-readers


 96%|█████████▌| 3804/3960 [32:39<01:20,  1.94it/s]

/opinion/2011/09/23/from-texas-to-georgia-stop-executions


 96%|█████████▌| 3805/3960 [32:40<01:19,  1.94it/s]

/opinion/2011/09/23/greek-studies-still-offered


 96%|█████████▌| 3806/3960 [32:40<01:19,  1.94it/s]

/opinion/2011/09/22/inaccuracies-of-ad-regarding-israel-palestine-conflict


 97%|█████████▋| 3823/3960 [32:48<01:10,  1.94it/s]

/opinion/2011/09/13/longhorns-welcome-byu-fans


 97%|█████████▋| 3824/3960 [32:49<01:10,  1.94it/s]

/opinion/2011/09/13/still-embarassed


 97%|█████████▋| 3825/3960 [32:49<01:09,  1.94it/s]

/opinion/2011/09/13/a-shared-ut-experience


 97%|█████████▋| 3829/3960 [32:51<01:07,  1.94it/s]

/opinion/2011/09/13/more-than-a-game


 97%|█████████▋| 3839/3960 [32:58<01:02,  1.94it/s]

/opinion/2011/09/06/the-most-polite-burglar


 97%|█████████▋| 3842/3960 [32:59<01:00,  1.94it/s]

/opinion/2011/09/05/following-the-lead-of-fools


 97%|█████████▋| 3854/3960 [33:05<00:54,  1.94it/s]

/opinion/2011/08/28/a-big-win-for-students


 98%|█████████▊| 3865/3960 [33:11<00:48,  1.94it/s]

/opinion/2011/08/04/psychics-fail-to-adequately-demonstrate-ability


 98%|█████████▊| 3871/3960 [33:14<00:45,  1.94it/s]

/opinion/2011/07/27/where-is-the-research


 98%|█████████▊| 3872/3960 [33:14<00:45,  1.94it/s]

/opinion/2011/07/27/the-cost-of-free-parking


 98%|█████████▊| 3885/3960 [33:21<00:38,  1.94it/s]

/opinion/2011/07/13/a-faculty%C2%92s-obligation


 98%|█████████▊| 3887/3960 [33:22<00:37,  1.94it/s]

/opinion/2011/07/12/our-beauty-your-charm


 98%|█████████▊| 3888/3960 [33:23<00:37,  1.94it/s]

/opinion/2011/07/11/tenure-proves-beneficial-to-higher-education


 98%|█████████▊| 3889/3960 [33:23<00:36,  1.94it/s]

/opinion/2011/07/11/replace-tenure-with-multiyear-renewable-contracts


 98%|█████████▊| 3890/3960 [33:23<00:36,  1.94it/s]

/opinion/2011/07/07/confederate-plates-create-mixed-emotions


 98%|█████████▊| 3893/3960 [33:25<00:34,  1.94it/s]

/opinion/2011/07/05/disgusted-and-disappointed


 98%|█████████▊| 3899/3960 [33:28<00:31,  1.94it/s]

/opinion/2011/06/26/governor-rick-perry-only-representing-himself


 99%|█████████▊| 3905/3960 [33:31<00:28,  1.94it/s]

/opinion/2011/06/20/champagne-on-a-beer-budget


100%|██████████| 3960/3960 [33:59<00:00,  1.94it/s]


In [3]:
data = pd.read_pickle('data/opinion.p')
data.text = data.text.str.strip()
data.author = data.author.str.lower()
# data = data[data.author.isin(data.author.value_counts().head(20).index)]

In [5]:
from nltk.sentiment import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

In [8]:
data['sent'] = data.text.map(lambda i: sid.polarity_scores(i)['compound'])

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
v = TfidfVectorizer(stop_words='english', strip_accents='unicode').fit(data.text)
w = v.transform(data.text)

In [50]:
from sklearn.naive_bayes import MultinomialNB
y = LabelEncoder().fit_transform(data.author.str.lower())
m = MultinomialNB(alpha=.1).fit(w, y)

In [51]:
m.score(w, y)

0.97190293742017875

In [54]:
from sklearn.cluster import KMeans
k = KMeans(n_jobs=-1, verbose=1, random_state=42).fit(w)

Initialization complete
Initialization complete
Initialization complete
Initialization complete
Initialization complete
Initialization complete
Initialization complete
Initialization complete
Iteration  0, inertia 4730.641
Iteration  0, inertia 4743.456
Iteration  0, inertia 4683.911
Iteration  0, inertia 4730.312
Iteration  0, inertia 4740.505
Iteration  0, inertia 4724.488
Iteration  0, inertia 4734.731
Iteration  0, inertia 4729.638
Iteration  1, inertia 2449.850
Iteration  1, inertia 2447.293
Iteration  1, inertia 2456.910
Iteration  1, inertia 2450.733
Iteration  1, inertia 2450.318
Iteration  1, inertia 2452.134
Iteration  1, inertia 2457.944
Iteration  1, inertia 2456.176
Iteration  2, inertia 2439.075
Iteration  2, inertia 2438.401
Iteration  2, inertia 2438.472
Iteration  2, inertia 2438.705
Iteration  2, inertia 2445.336
Iteration  2, inertia 2440.616
Iteration  2, inertia 2445.380
Iteration  2, inertia 2444.665
Iteration  3, inertia 2435.751
Iteration  3, inertia 2434.428
It

In [58]:
cc = pd.DataFrame(data=k.cluster_centers_, columns=v.vocabulary_).T
cc[0].sort_values(ascending=False)

spurning          0.109158
harmfully         0.077428
nepotistic        0.076215
ingloriously      0.066014
inefficiency      0.061617
constituency      0.052348
brashly           0.048730
1462              0.046882
barrel            0.045194
demonstrates      0.041288
devoting          0.040403
surcharge         0.039286
gnarly            0.036088
wooing            0.032325
erg               0.031470
sweetheart        0.030562
356               0.030233
foodstuffs        0.030133
surrendering      0.029976
cassell           0.028429
modulators        0.027308
ny                0.026300
crudely           0.024046
boss              0.023613
120               0.021996
discomfort        0.021676
courtrooms        0.021377
template          0.021295
campuswide        0.021219
eliciting         0.020514
                    ...   
devon             0.000000
dysfunctional     0.000000
sanger            0.000000
contextualizes    0.000000
circumvented      0.000000
borrows           0.000000
e

In [60]:
data['cluster'] = k.predict(w)

In [70]:
(data[data.cluster==0].author.value_counts()/data.author.value_counts()).dropna().sort_values(ascending=False)

matt offill                           1.000000
mitchell hughes ralph                 1.000000
ezra siegel                           1.000000
evan berkowitz                        1.000000
juan carlos delafuente                1.000000
ou daily                              1.000000
marc nestenius                        1.000000
david leffler                         1.000000
garrett callahan                      1.000000
curry shoff                           1.000000
christian corona                      1.000000
the oklahoma daily editorial board    1.000000
thomas hunt, anne mueller             1.000000
chad markey                           1.000000
nick castillo                         1.000000
travis adams                          0.666667
jasmine c. johnson                    0.666667
jori epstein                          0.615385
daley epstein                         0.500000
matthew gil                           0.500000
patrick st. pierre                    0.500000
zachary adams