In [5]:
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import concurrent.futures
import pandas as pd

## Scraping and cleaning Depth-chart and Roster

In [82]:
# Index page that links to every team's depth chart.
url = 'https://www.ourlads.com/ncaa-football-depth-charts'

# Send a randomly chosen User-Agent string with the request.
ua = UserAgent()
userAgent = ua.random
headers = {'User-Agent': userAgent}

# The timeout stops a stalled connection from hanging the cell forever, and
# raise_for_status() surfaces an HTTP error here instead of letting an error
# page be parsed as if it were the index.
page = requests.get(url, headers=headers, timeout=30)
page.raise_for_status()

soup = BeautifulSoup(page.content, "html.parser")
# Every <div class="nfl-dc-mm-team-links"> found on the page.
team_link = soup.find_all('div', class_="nfl-dc-mm-team-links")

In [83]:
# Re-select the team-link <div>s and turn each one's first anchor href into
# an absolute URL under the depth-chart section of the site.
team_link = soup.find_all('div', class_="nfl-dc-mm-team-links")
team_links = [
    'https://www.ourlads.com/ncaa-football-depth-charts/' + link_div.a['href']
    for link_div in team_link
]

# The depth-chart list is every collected link except the first one.
team_links_depth_chart = team_links[1:]
team_links_depth_chart

['https://www.ourlads.com/ncaa-football-depth-charts/depth-chart.aspx?s=army&id=90038',
 'https://www.ourlads.com/ncaa-football-depth-charts/depth-chart.aspx?s=charlotte&id=92936',
 'https://www.ourlads.com/ncaa-football-depth-charts/depth-chart.aspx?s=east-carolina&id=90452',
 'https://www.ourlads.com/ncaa-football-depth-charts/depth-chart.aspx?s=florida-atlantic&id=90521',
 'https://www.ourlads.com/ncaa-football-depth-charts/depth-chart.aspx?s=memphis&id=91050',
 'https://www.ourlads.com/ncaa-football-depth-charts/depth-chart.aspx?s=navy&id=91257',
 'https://www.ourlads.com/ncaa-football-depth-charts/depth-chart.aspx?s=north-texas&id=92660',
 'https://www.ourlads.com/ncaa-football-depth-charts/depth-chart.aspx?s=rice&id=91740',
 'https://www.ourlads.com/ncaa-football-depth-charts/depth-chart.aspx?s=south-florida&id=91855',
 'https://www.ourlads.com/ncaa-football-depth-charts/depth-chart.aspx?s=temple&id=91970',
 'https://www.ourlads.com/ncaa-football-depth-charts/depth-chart.aspx?s=t

In [84]:
# Roster URLs are the collected links (minus the first one) with the page
# name swapped from depth-chart.aspx to roster.aspx.
team_links_roster = [
    link.replace('depth-chart.aspx', 'roster.aspx')
    for link in team_links[1:]
]
team_links_roster

['https://www.ourlads.com/ncaa-football-depth-charts/roster.aspx?s=army&id=90038',
 'https://www.ourlads.com/ncaa-football-depth-charts/roster.aspx?s=charlotte&id=92936',
 'https://www.ourlads.com/ncaa-football-depth-charts/roster.aspx?s=east-carolina&id=90452',
 'https://www.ourlads.com/ncaa-football-depth-charts/roster.aspx?s=florida-atlantic&id=90521',
 'https://www.ourlads.com/ncaa-football-depth-charts/roster.aspx?s=memphis&id=91050',
 'https://www.ourlads.com/ncaa-football-depth-charts/roster.aspx?s=navy&id=91257',
 'https://www.ourlads.com/ncaa-football-depth-charts/roster.aspx?s=north-texas&id=92660',
 'https://www.ourlads.com/ncaa-football-depth-charts/roster.aspx?s=rice&id=91740',
 'https://www.ourlads.com/ncaa-football-depth-charts/roster.aspx?s=south-florida&id=91855',
 'https://www.ourlads.com/ncaa-football-depth-charts/roster.aspx?s=temple&id=91970',
 'https://www.ourlads.com/ncaa-football-depth-charts/roster.aspx?s=tulane&id=92131',
 'https://www.ourlads.com/ncaa-footbal

### Depth chart

In [None]:
# Accumulates one tidy depth-chart frame per scraped team (filled by get_all_csv).
master_df = []


def _fetch_depth_chart_table(url):
    """Download one depth-chart page and return ``(team_name, headers, rows)``.

    ``headers`` is the stripped text of every <th> in the chart table and
    ``rows`` holds the stripped <td> texts of each <tr> in its <tbody>.

    Raises:
        requests.HTTPError: the server answered with an error status.
        ValueError: the page has no team-name <div> or no chart table.
    """
    head = {'User-Agent': UserAgent().random}
    # Without a timeout one stalled connection would block the whole scrape.
    page = requests.get(url, headers=head, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")

    team_div = soup.find('div', {'class': 'pt-team'})
    table = soup.find('table', class_='table table-bordered')
    body = table.find('tbody') if table is not None else None
    if team_div is None or body is None:
        # Fail with the offending URL rather than an AttributeError on None.
        raise ValueError(f'No depth-chart table found at {url}')

    headers = [th.text.strip() for th in table.find_all('th')]
    rows = [[td.text.strip() for td in tr.find_all('td')]
            for tr in body.find_all('tr')]
    return team_div.text.strip(), headers, rows


def _tidy_depth_chart(df, team_name):
    """Reshape a raw depth-chart table into one row per (position, string)."""
    section_labels = ['Offense', 'Defense', 'Special Teams']
    string_columns = ['1st String', '2nd String', '3rd String',
                      '4th String', '5th String']

    df['team_name'] = team_name
    # Section-divider rows carry the unit name in 'Pos'; forward-fill it so
    # every position row below a divider is tagged with its unit.
    df['field_pos'] = df['Pos'].where(df['Pos'].isin(section_labels))
    df['field_pos'] = df['field_pos'].ffill()

    # Drop the divider rows. The copy avoids writing into a slice of the
    # original frame (SettingWithCopyWarning) in the steps below.
    df = df[~df['Pos'].isin(section_labels + ['OFF', 'ST', 'DEF'])].copy()
    df = df.rename(columns={'Player 1': '1st String', 'Player 2': '2nd String',
                            'Player 3': '3rd String', 'Player 4': '4th String',
                            'Player 5': '5th String', 'Pos': 'Position',
                            'No.': 'No1'})

    # The remaining jersey-number columns all share the header 'No'; number
    # them No2, No3, ... in order of appearance so each one is addressable.
    no_counter = 2
    new_columns = []
    for col in df.columns:
        if col == 'No':
            new_columns.append(f'No{no_counter}')
            no_counter += 1
        else:
            new_columns.append(col)
    df.columns = new_columns

    # Glue each jersey number onto its player cell so the pair survives the
    # melt as a single value, then discard the separate number columns.
    for rank, column in enumerate(string_columns, start=1):
        df[column] = df[f'No{rank}'].astype(str) + ' ' + df[column]
    df = df.drop(columns=[f'No{rank}' for rank in range(1, len(string_columns) + 1)])

    df = df.melt(id_vars=['Position', 'field_pos', 'team_name'],
                 value_vars=string_columns,
                 var_name='String Order',
                 value_name='Player Name')

    # Step 1: pull the leading jersey number back out (expand=False yields a
    # Series, which is what a single-column assignment expects).
    df['Player Number'] = df['Player Name'].str.extract(r'^(\d+)', expand=False)

    # Step 2: strip that number, leaving only the name text.
    df['Player'] = df['Player Name'].str.replace(r'^\d+\s+', '', regex=True)

    # Step 3: first token versus the rest of the name.
    # NOTE(review): the comma removed from the first token below suggests the
    # site prints names as "Last, First"; if so, 'First Name' actually holds
    # the surname and 'Last Name' the given name - confirm against real data.
    df[['First Name', 'Other Names']] = df['Player'].str.split(' ', n=1, expand=True)

    # Step 4: the combined columns are no longer needed.
    df = df.drop(columns=['Player Name', 'Player'])

    df['First Name'] = df['First Name'].str.replace(',', '')
    df[['Last Name', 'School year']] = df['Other Names'].str.split(' ', n=1, expand=True)
    df = df.drop(columns=['Other Names'])
    return df


def get_all_csv(url):
    """Scrape one team's depth-chart page into a tidy long-format frame.

    The frame is appended to the module-level ``master_df`` list (as before)
    and is now also returned so callers can use it directly.

    Args:
        url: address of a team's depth-chart page.

    Returns:
        DataFrame with columns Position, field_pos, team_name, String Order,
        Player Number, First Name, Last Name, School year.

    Raises:
        requests.HTTPError: the page request returned an error status.
        ValueError: the page lacks the team name or the chart table.
    """
    team_name, headers, rows = _fetch_depth_chart_table(url)
    df = pd.DataFrame(rows, columns=headers)
    print(team_name)
    df = _tidy_depth_chart(df, team_name)
    master_df.append(df)
    return df



# Scrape each team's depth chart. Iterate team_links_depth_chart (the list
# built earlier that leaves out the first collected link) rather than the raw
# team_links, so this loop visits the same set of teams as the roster loop,
# which walks the equally trimmed team_links_roster.
for depth_chart_url in team_links_depth_chart:
    get_all_csv(depth_chart_url)

# Stack the per-team frames collected in master_df and write them out.
df_res = pd.concat(master_df)
df_res.to_csv('/workspace/sales/sports/college_football_depth_chart.csv', index=False)

### Roster

In [85]:
import numpy as np
# Accumulates one roster frame per scraped team (filled by get_roster).
all_df = []


def _format_roster_height(raw_height):
    """Turn a numeric HT value into a ``feet'inches`` string.

    The last digit is discarded (divide by ten, truncate); of what remains the
    first digit is read as feet and the rest as inches, e.g. 6020 -> "6'2" and
    5100 -> "5'10". Zero inches renders as just ``6'``. A missing value yields
    None instead of crashing on the integer cast.
    NOTE(review): this assumes HT is encoded as feet, two-digit inches and one
    trailing digit - confirm against the site.
    """
    if pd.isna(raw_height):
        return None
    digits = str(int(raw_height / 10))
    feet, inches = digits[:1], digits[1:]
    # Parse the inches as a number instead of stripping '0' characters:
    # stripping turned "10" into "1", so 5'10" collided with 5'1".
    inch_value = int(inches) if inches else 0
    return f"{feet}'{inch_value}" if inch_value else f"{feet}'"


def get_roster(url):
    """Scrape one team's roster page into a DataFrame.

    The frame is appended to the module-level ``all_df`` list (as before) and
    is now also returned.

    Args:
        url: address of a team's roster page.

    Returns:
        The roster table with '#', 'Pos.', 'HT', 'WT' renamed to
        'Player Number', 'Position', 'height', 'Weight'; 'Player' replaced by
        'First Name' / 'Other Names'; and a 'team_name' column added.

    Raises:
        requests.HTTPError: the page request returned an error status.
        ValueError: the page lacks the team name or the roster table, or a
            height cell is not numeric.
    """
    head = {'User-Agent': UserAgent().random}
    # Without a timeout one stalled connection would block the whole scrape.
    page = requests.get(url, headers=head, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")

    team_div = soup.find('div', {'class': 'pt-team'})
    table = soup.find('table', class_='table table-bordered')
    body = table.find('tbody') if table is not None else None
    if team_div is None or body is None:
        # Fail with the offending URL rather than an AttributeError on None.
        raise ValueError(f'No roster table found at {url}')
    team_name = team_div.text.strip()

    headers = [th.text.strip() for th in table.find_all('th')]
    rows = [[td.text.strip() for td in tr.find_all('td')]
            for tr in body.find_all('tr')]

    df = pd.DataFrame(rows, columns=headers)
    # Drop the 'Active Players' divider rows; copy so the edits below do not
    # write into a slice of the original frame.
    df = df[~df['Player'].isin(['Active Players'])].copy()
    df = df.rename(columns={'#': 'Player Number', 'Pos.': 'Position',
                            'HT': 'height', 'WT': 'Weight'})

    # NOTE(review): the comma removed from the first token suggests names are
    # printed "Last, First"; if so 'First Name' holds the surname - confirm.
    df[['First Name', 'Other Names']] = df['Player'].str.split(' ', n=1, expand=True)
    df['First Name'] = df['First Name'].str.replace(',', '')

    df['height'] = pd.to_numeric(df['height']).map(_format_roster_height)

    df = df.drop(columns=['Player'])
    df['team_name'] = team_name
    print(team_name)
    all_df.append(df)
    return df

# Scrape every roster page; get_roster() appends each team's frame to all_df.
for roster_url in team_links_roster:
    get_roster(roster_url)

# Stack the per-team frames and write the combined roster (index omitted).
df_roster = pd.concat(all_df)
df_roster.to_csv(
    '/workspace/sales/sports/college_football_roster.csv',
    index=False,
)

Army Black Knights
Charlotte 49ers
East Carolina Pirates
Florida Atlantic Owls
Memphis Tigers
Navy Midshipmen
North Texas Mean Green
Rice Owls
South Florida Bulls
Temple Owls
Tulane Green Wave
Tulsa Golden Hurricane
UAB Blazers
UTSA Roadrunners
Boston College Eagles
California Golden Bears
Clemson Tigers
Duke Blue Devils
Florida State Seminoles
Georgia Tech Yellow Jackets
Louisville Cardinals
Miami Hurricanes
North Carolina Tar Heels
North Carolina State Wolfpack
Pittsburgh Panthers
SMU Mustangs
Stanford Cardinal
Syracuse Orange
Virginia Cavaliers
Virginia Tech Hokies
Wake Forest Demon Deacons
Illinois Fighting Illini
Indiana Hoosiers
Iowa Hawkeyes
Maryland Terrapins
Michigan Wolverines
Michigan State Spartans
Minnesota Golden Gophers
Nebraska Cornhuskers
Northwestern Wildcats
Ohio State Buckeyes
Oregon Ducks
Penn State Nittany Lions
Purdue Boilermakers
Rutgers Scarlet Knights
UCLA Bruins
USC Trojans
Washington Huskies
Wisconsin Badgers
Arizona Wildcats
Arizona State Sun Devils
Baylor 

### Streaming


In [55]:
import requests
import pandas as pd
from pandas import json_normalize

def fetch_live_scores():
    """Request the ESPN college-football scoreboard and return the decoded JSON.

    The request asks for up to 100 entries via the ``limit`` query parameter.

    Returns:
        The parsed JSON body of the response.

    Raises:
        requests.HTTPError: the endpoint answered with an error status.
        requests.Timeout: no response arrived within the timeout.
    """
    url = "https://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard"
    payload = {'limit': '100'}
    # The timeout stops a stalled connection from hanging the cell, and
    # raise_for_status() reports an HTTP error here rather than as a
    # confusing JSON or KeyError failure further downstream.
    response = requests.get(url, params=payload, timeout=30)
    response.raise_for_status()
    return response.json()

def _pick_competitor(competitions, side, fallback_index):
    """Return the competitor of the first competition playing on ``side``.

    Prefers an explicit ``homeAway`` marker on the competitor entries and
    falls back to positional order (the original behaviour) when no entry
    carries a matching marker.
    """
    competitors = competitions[0]['competitors']
    for competitor in competitors:
        if competitor.get('homeAway') == side:
            return competitor
    return competitors[fallback_index]


def _first_odds(competitions):
    """Return the first odds entry of the first competition, or ``{}``."""
    # 'odds' may be absent or an empty list; both mean "no odds".
    odds_entries = competitions[0].get('odds') or [{}]
    return odds_entries[0] or {}


def create_pandas_dataframe():
    """Build a flat scoreboard table from ``fetch_live_scores()``.

    Returns:
        DataFrame with one row per event and the columns game_id, game_date,
        status, home_team, away_team, home_score, away_score, odds,
        over_under, home_team_odds, away_team_odds. Missing odds values are
        reported as the string 'N/A'. An empty event list yields an empty
        frame with those columns.

    Raises:
        KeyError: the payload has no 'events' key.
    """
    columns_of_interest = ['game_id', 'game_date', 'status', 'home_team', 'away_team',
                           'home_score', 'away_score', 'odds', 'over_under',
                           'home_team_odds', 'away_team_odds']

    data = fetch_live_scores()
    events = data['events']
    if not events:
        # json_normalize([]) has no 'id'/'date' columns to copy from.
        return pd.DataFrame(columns=columns_of_interest)

    df = json_normalize(events)

    # Resolve the nested pieces once per event instead of once per column.
    home = [_pick_competitor(c, 'home', 0) for c in df['competitions']]
    away = [_pick_competitor(c, 'away', 1) for c in df['competitions']]
    odds = [_first_odds(c) for c in df['competitions']]

    df['game_id'] = df['id']
    df['game_date'] = df['date']
    df['status'] = df['status.type.name']
    df['home_team'] = [c['team']['displayName'] for c in home]
    df['away_team'] = [c['team']['displayName'] for c in away]
    df['home_score'] = [c['score'] for c in home]
    df['away_score'] = [c['score'] for c in away]

    # Adding odds information
    df['odds'] = [o.get('details', 'N/A') for o in odds]
    df['over_under'] = [o.get('overUnder', 'N/A') for o in odds]

    # Adding home and away team odds (spread)
    df['home_team_odds'] = [(c.get('odds') or {}).get('spreadOdds', 'N/A') for c in home]
    df['away_team_odds'] = [(c.get('odds') or {}).get('spreadOdds', 'N/A') for c in away]

    return df[columns_of_interest]

# Build the scoreboard table via create_pandas_dataframe() and keep it in `df`.
df = create_pandas_dataframe()

# Bare last expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,game_id,game_date,status,home_team,away_team,home_score,away_score,odds,over_under,home_team_odds,away_team_odds
0,401628347,2024-09-07T16:00Z,STATUS_SCHEDULED,Michigan Wolverines,Texas Longhorns,0,0,TEX -7.5,42.5,,
1,401628470,2024-09-07T16:00Z,STATUS_SCHEDULED,Penn State Nittany Lions,Bowling Green Falcons,0,0,PSU -34.5,48.5,,
2,401628336,2024-09-07T16:00Z,STATUS_SCHEDULED,Oklahoma State Cowboys,Arkansas Razorbacks,0,0,OKST -7.5,62.5,,
3,401636616,2024-09-07T16:00Z,STATUS_SCHEDULED,Tulane Green Wave,Kansas State Wildcats,0,0,KSU -10,48.5,,
4,401635534,2024-09-07T16:00Z,STATUS_SCHEDULED,Syracuse Orange,Georgia Tech Yellow Jackets,0,0,GT -3,62.5,,
5,401628339,2024-09-07T18:00Z,STATUS_SCHEDULED,Georgia Bulldogs,Tennessee Tech Golden Eagles,0,0,,,,
6,401628977,2024-09-07T19:30Z,STATUS_SCHEDULED,Notre Dame Fighting Irish,Northern Illinois Huskies,0,0,ND -28.5,44.5,,
7,401636618,2024-09-07T19:30Z,STATUS_SCHEDULED,Utah Utes,Baylor Bears,0,0,UTAH -14,54.5,,
8,401628464,2024-09-07T19:30Z,STATUS_SCHEDULED,Iowa Hawkeyes,Iowa State Cyclones,0,0,IOWA -2.5,35.5,,
9,401635535,2024-09-07T19:30Z,STATUS_SCHEDULED,Louisville Cardinals,Jacksonville State Gamecocks,0,0,LOU -28,57.5,,


In [21]:
import os

from newsapi import NewsApiClient

# Init
# The API key is read from the environment so it is never stored in the
# notebook or its version history. The key that used to be hardcoded here
# has been exposed and should be revoked.
news_api_key = os.environ.get('NEWSAPI_KEY')
if not news_api_key:
    raise RuntimeError('Set the NEWSAPI_KEY environment variable to your NewsAPI key before running this cell.')

newsapi = NewsApiClient(api_key=news_api_key)

# /v2/top-headlines
top_headlines = newsapi.get_top_headlines(
    category='entertainment',
    language='en',
    page_size=90,
    page=1,
)

# Fall back to an empty list when the response has no 'articles' entry.
articles = top_headlines.get('articles', [])

# Keep only the listed article fields as columns.
init_df = pd.DataFrame(articles, columns=['source', 'title', 'publishedAt', 'author', 'url'])

# Each 'source' cell is expected to be a mapping; keep its 'name' entry and
# use None for anything that is not a mapping or has no 'name'.
init_df['source'] = init_df['source'].apply(
    lambda x: x['name'] if isinstance(x, dict) and 'name' in x else None
)

init_df['publishedAt'] = pd.to_datetime(init_df['publishedAt'])
init_df

Unnamed: 0,source,title,publishedAt,author,url
0,Google News,Artem Chigvintsev's wife Nikki Bella breaks si...,2024-08-30 17:30:00+00:00,Fox News,https://news.google.com/rss/articles/CBMiwAFBV...
1,Google News,"Afraid Review: Artificial Intelligence, Dumb M...",2024-08-30 16:33:36+00:00,IGN,https://news.google.com/rss/articles/CBMibkFVX...
2,Google News,Newly single Leah Remini warned J.Lo about Ben...,2024-08-30 15:50:00+00:00,New York Post,https://news.google.com/rss/articles/CBMirAFBV...
3,Google News,"ABBA calls out Trump for ""unauthorized use"" of...",2024-08-30 15:34:16+00:00,CBS News,https://news.google.com/rss/articles/CBMid0FVX...
4,Google News,Angelina Jolie bursts into tears at ‘Maria’ pr...,2024-08-30 15:10:00+00:00,New York Post,https://news.google.com/rss/articles/CBMivwFBV...
5,Google News,‘The Rings of Power’ Creators Break Down That ...,2024-08-30 15:05:26+00:00,Hollywood Reporter,https://news.google.com/rss/articles/CBMirwFBV...
6,Google News,Don Johnson shares stunning photo of former mo...,2024-08-30 14:20:00+00:00,Fox News,https://news.google.com/rss/articles/CBMirAFBV...
7,Google News,How RZA Translated His 'Hip-Hop Mind' into His...,2024-08-30 14:00:00+00:00,PEOPLE,https://news.google.com/rss/articles/CBMijgFBV...
8,Google News,Nicole Kidman Says Making Halina Reijn’s Eroti...,2024-08-30 13:05:00+00:00,Deadline,https://news.google.com/rss/articles/CBMiuAFBV...
9,Google News,‘Fascists’: Jack White threatens to sue Trump ...,2024-08-30 12:56:00+00:00,The Guardian US,https://news.google.com/rss/articles/CBMiigFBV...
