In [1]:
import requests
from bs4 import BeautifulSoup, SoupStrainer
import pandas as pd
import numpy as np
import re

# ESPN

In [2]:
page = requests.get('https://www.espn.com/nba/playbyplay/_/gameId/401344140')
soup = BeautifulSoup(page.content, 'html.parser')

raw_timestamps = soup.find_all('td', class_='time-stamp')
raw_team = soup.find_all('img', class_='team-logo')
raw_details = soup.find_all('td', class_='game-details')
raw_score = soup.find_all('td', class_='combined-score')

In [3]:
# timestamps are directly scraped
timestamp = list()
for t in raw_timestamps:
  timestamp.append(t.string)

# raw_details as well
details = list()
for d in raw_details:
  details.append(d.string)

In [4]:
# teams must be cleaned of html code (since there are only two teams it can be done with a simple if)
team = list()
for t in raw_team:
  t = str(t)
  if t == '<img class="team-logo" src="https://a.espncdn.com/combiner/i?img=/i/teamlogos/nba/500/mil.png&amp;h=100&amp;w=100"/>':
    team.append('mil')
  else:
    team.append('atl')

In [5]:
score = list()
for s in raw_score:
  s = str(s)
  s = s.replace('<td class="combined-score">', '').replace('</td>', '').replace(' - ', '-')
  score.append(s)

In [6]:
df_espn = pd.DataFrame(list(zip(timestamp, details, team, score)), columns=['gametime', 'details', 'team', 'score'])
# type conversion and quarters insertion
minutes = df_espn['gametime'].str.contains(':')
# time of quarter for records that are expressed in seconds
df_espn.loc[~minutes, 'gametime'] = pd.to_datetime(np.floor(pd.to_numeric(df_espn.loc[~minutes, 'gametime'])), unit='s').dt.time.astype(str)
# time of quarter for records that are expressed in minutes
df_espn.loc[minutes, 'gametime'] = pd.to_datetime(df_espn.loc[minutes, 'gametime'], format='%M:%S').dt.time.astype(str)

In [7]:
df_espn

Unnamed: 0,gametime,details,team,score
0,00:12:00,Deandre Ayton vs. Brook Lopez (Devin Booker ga...,atl,0-0
1,00:11:40,Chris Paul misses 20-foot pullup jump shot,mil,0-0
2,00:11:39,Giannis Antetokounmpo defensive rebound,mil,0-0
3,00:11:21,Jrue Holiday misses 19-foot pullup jump shot,atl,0-0
4,00:11:20,Jae Crowder defensive rebound,atl,0-0
...,...,...,...,...
464,00:00:09,Suns Full timeout,mil,98-105
465,00:00:07,Devin Booker misses 29-foot three point jumper,mil,98-105
466,00:00:06,P.J. Tucker defensive rebound,atl,98-105
467,00:00:00,End of the 4th Quarter,atl,98-105


# GUARDIAN

In [8]:
pages = list()
pages.append(requests.get('https://www.theguardian.com/sport/live/2021/jul/20/nba-finals-2021-game-6-phoenix-suns-v-milwaukee-bucks-live'))
pages.append(requests.get('https://www.theguardian.com/sport/live/2021/jul/20/nba-finals-2021-game-6-phoenix-suns-v-milwaukee-bucks-live?page=with:block-60f789438f0814e7a316dc6d#liveblog-navigation'))
pages.append(requests.get('https://www.theguardian.com/sport/live/2021/jul/20/nba-finals-2021-game-6-phoenix-suns-v-milwaukee-bucks-live?page=with:block-60f77a7d8f085dcd6b59424f#liveblog-navigation'))

# only parse the live blog, otherwise the search includes tags that are outside of it
raw_times = list()
raw_titles = list()
liveblog = SoupStrainer('div', class_='js-liveblog-body u-cf from-content-api js-blog-blocks blocks ')
for page in pages:
  soup = BeautifulSoup(page.content, 'html.parser', parse_only=liveblog)
  for block in soup.find_all('div', class_='block block--content'):
    # we only need blocks with titles
    if block.find('strong'):
      raw_times.append(block.find('span', class_='block-time__absolute'))
      raw_titles.append(block.find('strong'))

# times are directly scraped
times = list()
for t in raw_times:
  times.append(t.string)

# titles cannot be scraped directly since they contain a tag
titles = list()
for t in raw_titles:
  t = str(t)
  t = t.replace('<sup>', '').replace('</sup>', '')
  t = t.replace('<strong>', '').replace('</strong>', '')
  titles.append(t)

In [9]:
score_mil = list()
score_atl = list()
scores = list()
quarter = list()
time = list()

for t, i in zip(raw_titles, times):
  t = str(t)
  t = t.replace('<sup>', '').replace('</sup>', '')
  t = t.replace('<strong>', '').replace('</strong>', '')
  t = t.split(', ')
  # a few titles aren't in the shape needed to scrape data, we're gonna skip them
  try:
    score = t[0].replace('Suns ', '').replace(' Bucks', '')
    score_full = score
    score = score.split('-')
    # score[1] is filled only for the titles in the right shape, the others will throw an exception and get skipped
    score_mil.append(score[1])
    score_atl.append(score[0])
    scores.append(score_full)
    time.append(i)
    quarter.append(t[2][0])
  except IndexError:
    pass

# one row in first quarter doesn't have the usual shape, since it's a simple fix it will be done manually
quarter.append('1')

In [10]:
df_guardian = pd.DataFrame(list(zip(time, quarter, scores)), columns=['realtime', 'quarter', 'scores'])
df_guardian['realtime'] = pd.to_datetime(df_guardian['realtime'], format='%H:%M').dt.time
df_guardian

Unnamed: 0,realtime,quarter,scores
0,04:40:00,4,98-104
1,04:37:00,4,96-104
2,04:34:00,4,96-102
3,04:33:00,4,96-100
4,04:31:00,4,94-100
...,...,...,...
57,02:20:00,1,5-6
58,02:18:00,1,2-6
59,02:17:00,1,2-4
60,02:15:00,1,2-2


# FINAL DATAFRAME

In [11]:
# final merge
live_comm = df_espn.merge(df_guardian, how='inner', left_on='score', right_on='scores').iloc[:, :-1]
#live_comm.tail()

In [12]:
# download_csv
#live_comm.to_csv('live-commentary.csv')

# INTEGRATE TWEETS

In [13]:
# integrate with tweets
tweets = pd.read_csv('suns-bucks-G6.csv')

#tweets['timestamp'] = pd.to_datetime(tweets['timestamp'], infer_datetime_format=True)
#tweets['timestamp'] = tweets.timestamp.values.astype(np.int64) // 10 ** 9
#tweets['timestamp'] = tweets['timestamp'].to_timestamp()

# the default round() gives errors, manual construction of the necessary column
tweets['hour'] = pd.to_datetime(tweets['timestamp'], infer_datetime_format=True).dt.hour
tweets['minute'] = pd.to_datetime(tweets['timestamp'], infer_datetime_format=True).dt.minute
tweets['hour'] = tweets['hour'].astype('string')
tweets['minute'] = tweets['minute'].astype('string')
tweets['time'] = tweets['hour'].str.cat(tweets['minute'], sep=":")
# convert back into time
tweets['time'] = pd.to_datetime(tweets['time'], format='%H:%M').dt.time
tweets.head()

Unnamed: 0.1,Unnamed: 0,_id,id,user_location,language,timestamp,text,retweet_count,user_followers,geo,...,in_reply_to_status_id_str,in_reply_to_user_id_str,retweeted_status_id,retweeted_status__user_id,retweeted_status_user_name,id_str,screen_name,hour,minute,time
0,0,60f74336b712538547cbf0fc,1417600743033360384,AZ,en,2021-07-20 23:42:09,RT @Suns: 𝐍𝐁𝐀 𝐅𝐈𝐍𝐀𝐋𝐒. 𝐆𝐀𝐌𝐄 𝟔.\n\n📺 @ABCNetwork...,0,1537,,...,,,1.417543e+18,18481110.0,Phoenix Suns,1146370740,boi_guapo,23,42,23:42:00
1,1,60f74338b712538547cbf0fd,1417600749182365700,"Stockholm, Sverige",en,2021-07-20 23:42:11,RT @davidalangrier: BREAKING NEWS: NBA Ref Sco...,0,1036,,...,,,1.417601e+18,18144090.0,David Alan Grier,1061255281341054977,Ahmedberhan1,23,42,23:42:00
2,2,60f7433ab712538547cbf0fe,1417600757872959492,By the Rich People,en,2021-07-20 23:42:13,RT @wildcardsprts: 4K+ streams‼️\n\nFor our al...,0,2485,,...,,,1.417596e+18,1.142281e+18,WildCard Sports,239012690,Pastor_KevyKev,23,42,23:42:00
3,3,60f7433bb712538547cbf0ff,1417600761500999681,"Charlotte, NC",en,2021-07-20 23:42:13,SO MANY POST-IT NOTES,0,174837,,...,,,,,,21362008,tomhaberstroh,23,42,23:42:00
4,4,60f7433bb712538547cbf100,1417600763707154435,,en,2021-07-20 23:42:14,RT @JohnnyPropz: NBA 🏀 July 20th:\n\nD. Ayton ...,0,664,,...,,,1.417601e+18,1.356723e+18,Johnny Props,1281401105398796288,JohnnyLockz,23,42,23:42:00


In [18]:
# final dataframe
final = tweets.merge(live_comm, how='left', left_on='time', right_on='realtime')
final['on'] = final['realtime'].notna()

In [19]:
# csv download
final.to_csv('final.csv')