In [12]:
from bs4 import BeautifulSoup
from requests import get
import pandas as pd, matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline

In [13]:
# HREF SCRAPER
def get_urls(n_pages, section):
  urls = []
  for page in tqdm(range(n_pages+1)):
    try:
      soup = BeautifulSoup(get(f'https://www.dailytexanonline.com/section/{section}?page={page}').text,'lxml')
      for i in soup.findAll('h2', {'class':'story-title'}):
        urls.append(i.find('a')['href'])
    except AttributeError:
      print(page)
  return urls

In [14]:
# ARTICLE SCRAPER
def get_articles(urls):
  articles = {}
  for h in tqdm(urls):
    try:
      soup = BeautifulSoup(get('https://www.dailytexanonline.com'+h).text,'lxml')
      auth = soup.find('div',{'id':'author-name'})
      articles[h] = dict(
        text = soup.find('div',{'id':'article-body'}).text,
        author = auth.text,
        author_link = auth.find('a')['href'],
        date = soup.find('time',{'id':'article-published'}).text,
        title = soup.find('h1',{'id':'page-title'}).text,
      )
    except:
      print(h)
  articles = pd.DataFrame.from_dict(articles, orient='index')
  articles.date = pd.to_datetime(articles.date.str.lstrip('Published on '), format='%B %d, %Y at %I:%M %p')
  articles.text = articles.text.str.replace('\\xa0',' ').str.replace('\\n',' ')
  articles.author = articles.author.str.lstrip('By ')
  articles.title = articles.title.str.strip()
  return articles

In [15]:
section = 'university'
get_articles(get_urls(485,section)).to_pickle(f'data/{section}.p')

100%|██████████| 486/486 [08:00<00:00,  1.01it/s]
  3%|▎         | 140/4852 [01:07<37:59,  2.07it/s]

/2017/05/24/ut-stabbing-suspect-scheduled-for-another-hearing


  3%|▎         | 141/4852 [01:08<38:05,  2.06it/s]

/2017/05/07/former-university-president-peter-flawn-passes-away


 12%|█▏        | 564/4852 [04:50<36:46,  1.94it/s]

/2015/09/24/ut-system-to-launch-sexual-assualt-survey-in-october


 44%|████▍     | 2123/4852 [19:15<24:45,  1.84it/s]

/news/2013/09/26/conflicts-of-interest-arise-in-ut-methane-emissions-study


 45%|████▍     | 2162/4852 [19:41<24:29,  1.83it/s]

/news/2013/08/28/gone-to-texas-welcomes-class-of-2017


 46%|████▌     | 2209/4852 [20:08<24:05,  1.83it/s]

/news/2013/06/28/listen-daily-texan-podcast-discusses-fisher-v-texas-decision


 46%|████▌     | 2228/4852 [20:22<23:59,  1.82it/s]

/news/2013/06/14/listen-the-first-summer-podcast


 47%|████▋     | 2294/4852 [21:03<23:28,  1.82it/s]

/news/2013/04/26/university-external-foundations-profiles


 48%|████▊     | 2353/4852 [21:37<22:57,  1.81it/s]

/news/2013/04/15/explosion-at-boston-marathon-stories-and-coverage


 49%|████▊     | 2365/4852 [21:43<22:50,  1.81it/s]

/news/2013/04/12/here-comes-the-boom


 49%|████▉     | 2391/4852 [21:58<22:37,  1.81it/s]

/news/2013/04/09/pop-culture-sports-business-and-school-present-cases-of-infidelity-scandal-and


 62%|██████▏   | 3023/4852 [28:01<16:57,  1.80it/s]

/news/2012/10/01/graduate-career-services-expand-online-with-new-program


 70%|██████▉   | 3375/4852 [31:03<13:35,  1.81it/s]

/news/2012/04/12/myedu-offers-enhanced-features-for-easier-registration


 72%|███████▏  | 3509/4852 [32:14<12:20,  1.81it/s]

/news/2012/03/06/ut-entrepreneurship-week-helps%C2%A0students-build-connections-and-receive-networking


 76%|███████▌  | 3664/4852 [33:33<10:52,  1.82it/s]

/2012/01/30/french-soldiers-attitudes-influence-past-current-culture


 80%|████████  | 3891/4852 [35:29<08:46,  1.83it/s]

/2011/10/20/natural-gas-conference-features-industry-experts-texas-gas-use-and-outlook


 83%|████████▎ | 4010/4852 [36:30<07:39,  1.83it/s]

/2011/09/22/education-professor-receives-prestigious-award-for-developing-future-educators


 91%|█████████ | 4400/4852 [39:42<04:04,  1.85it/s]

/news/2011/03/29/ut-festival-showcases-student-plays


 91%|█████████ | 4411/4852 [39:48<03:58,  1.85it/s]

/news/2011/03/25/whole-foods-ceo-lectures-about-business-philosophy


 92%|█████████▏| 4460/4852 [40:11<03:31,  1.85it/s]

/news/2011/03/08/state-senator-traces-success-to-years-spent-on-forty-acres


 92%|█████████▏| 4488/4852 [40:27<03:16,  1.85it/s]

/2011/03/01/ut-senate-kicks-off-event-to-honor-thank-faculty


 93%|█████████▎| 4495/4852 [40:31<03:13,  1.85it/s]

/2011/03/01/ut-senate-kicks-off-event-to-honor-thank-faculty


 94%|█████████▍| 4577/4852 [41:12<02:28,  1.85it/s]

/news/2011/02/11/ransom-center-exhibit-reveals-playwright%C2%92s-creative-mentality


 96%|█████████▌| 4650/4852 [41:47<01:48,  1.85it/s]

/news/2011/01/26/forum-discusses-reductions-to-liberal-arts-ethnic-centers


100%|█████████▉| 4846/4852 [43:23<00:03,  1.86it/s]

/news/2010/08/26/ut-expands-sustainability-efforts


100%|██████████| 4852/4852 [43:26<00:00,  1.86it/s]


In [3]:
data = pd.read_pickle('data/opinion.p')
data.text = data.text.str.strip()
data.author = data.author.str.lower()
# data = data[data.author.isin(data.author.value_counts().head(20).index)]

In [5]:
from nltk.sentiment import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

In [8]:
data['sent'] = data.text.map(lambda i: sid.polarity_scores(i)['compound'])

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
v = TfidfVectorizer(stop_words='english', strip_accents='unicode').fit(data.text)
w = v.transform(data.text)

In [50]:
from sklearn.naive_bayes import MultinomialNB
y = LabelEncoder().fit_transform(data.author.str.lower())
m = MultinomialNB(alpha=.1).fit(w, y)

In [51]:
m.score(w, y)

0.97190293742017875

In [54]:
from sklearn.cluster import KMeans
k = KMeans(n_jobs=-1, verbose=1, random_state=42).fit(w)

Initialization complete
Initialization complete
Initialization complete
Initialization complete
Initialization complete
Initialization complete
Initialization complete
Initialization complete
Iteration  0, inertia 4730.641
Iteration  0, inertia 4743.456
Iteration  0, inertia 4683.911
Iteration  0, inertia 4730.312
Iteration  0, inertia 4740.505
Iteration  0, inertia 4724.488
Iteration  0, inertia 4734.731
Iteration  0, inertia 4729.638
Iteration  1, inertia 2449.850
Iteration  1, inertia 2447.293
Iteration  1, inertia 2456.910
Iteration  1, inertia 2450.733
Iteration  1, inertia 2450.318
Iteration  1, inertia 2452.134
Iteration  1, inertia 2457.944
Iteration  1, inertia 2456.176
Iteration  2, inertia 2439.075
Iteration  2, inertia 2438.401
Iteration  2, inertia 2438.472
Iteration  2, inertia 2438.705
Iteration  2, inertia 2445.336
Iteration  2, inertia 2440.616
Iteration  2, inertia 2445.380
Iteration  2, inertia 2444.665
Iteration  3, inertia 2435.751
Iteration  3, inertia 2434.428
It

In [58]:
cc = pd.DataFrame(data=k.cluster_centers_, columns=v.vocabulary_).T
cc[0].sort_values(ascending=False)

spurning          0.109158
harmfully         0.077428
nepotistic        0.076215
ingloriously      0.066014
inefficiency      0.061617
constituency      0.052348
brashly           0.048730
1462              0.046882
barrel            0.045194
demonstrates      0.041288
devoting          0.040403
surcharge         0.039286
gnarly            0.036088
wooing            0.032325
erg               0.031470
sweetheart        0.030562
356               0.030233
foodstuffs        0.030133
surrendering      0.029976
cassell           0.028429
modulators        0.027308
ny                0.026300
crudely           0.024046
boss              0.023613
120               0.021996
discomfort        0.021676
courtrooms        0.021377
template          0.021295
campuswide        0.021219
eliciting         0.020514
                    ...   
devon             0.000000
dysfunctional     0.000000
sanger            0.000000
contextualizes    0.000000
circumvented      0.000000
borrows           0.000000
e

In [60]:
data['cluster'] = k.predict(w)

In [70]:
(data[data.cluster==0].author.value_counts()/data.author.value_counts()).dropna().sort_values(ascending=False)

matt offill                           1.000000
mitchell hughes ralph                 1.000000
ezra siegel                           1.000000
evan berkowitz                        1.000000
juan carlos delafuente                1.000000
ou daily                              1.000000
marc nestenius                        1.000000
david leffler                         1.000000
garrett callahan                      1.000000
curry shoff                           1.000000
christian corona                      1.000000
the oklahoma daily editorial board    1.000000
thomas hunt, anne mueller             1.000000
chad markey                           1.000000
nick castillo                         1.000000
travis adams                          0.666667
jasmine c. johnson                    0.666667
jori epstein                          0.615385
daley epstein                         0.500000
matthew gil                           0.500000
patrick st. pierre                    0.500000
zachary adams