In [33]:
def clean_dataframe(df):
    """Drop the first column and the first two rows, then renumber rows from 1.

    The input frame is not modified; a new DataFrame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. NOTE(review): for the scraped tables the first column
        and first two rows are presumably a rank column and non-data rows --
        confirm against the source page.

    Returns
    -------
    pandas.DataFrame
        ``df`` without its first column and first two rows, with the index
        running 1, 2, 3, ...
    """
    # Slice by position rather than dropping by label: a label-based drop
    # removes *every* column sharing the first column's name, and indexing
    # ``df.columns[0]`` fails on a frame that has no columns at all.
    cleaned = df.iloc[2:, 1:].reset_index(drop=True)

    # reset_index gives 0-based numbering; shift it so the first row is 1.
    cleaned.index += 1

    return cleaned

In [38]:
import time

def extract_table_data(base_urls, num_pages, delay=2, timeout=30):
    """Scrape paginated HTML tables and merge them side by side.

    For each URL in ``base_urls``, pages ``1..num_pages`` are requested as
    ``f"{base_url}&page={page}_50"``. The ``<table class="rgMasterTable">``
    element of every response is turned into a DataFrame and the pages are
    stacked vertically. The per-URL frames are then concatenated column-wise,
    and any column whose label has already appeared is dropped (the first
    occurrence is kept).

    Parameters
    ----------
    base_urls : iterable of str
        Base URLs to which the ``&page=...`` query parameter is appended.
    num_pages : int
        Number of pages requested for every base URL.
    delay : float, default 2
        Seconds to sleep after each request.
    timeout : float, default 30
        Seconds to wait for the server before the request is aborted.

    Returns
    -------
    pandas.DataFrame
        Merged table whose cells hold the raw cell text. An empty DataFrame
        is returned when nothing was requested (no URLs or ``num_pages < 1``).

    Raises
    ------
    requests.HTTPError
        If a page responds with a 4xx/5xx status.
    ValueError
        If a page does not contain the expected table.
    """
    per_url_frames = []

    for base_url in base_urls:
        page_frames = []

        for page in range(1, num_pages + 1):
            url = f"{base_url}&page={page}_50"

            # A timeout keeps one stalled connection from hanging the whole run.
            response = requests.get(url, timeout=timeout)
            # Fail loudly on an error status instead of parsing an error page.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            table = soup.find('table', {'class': 'rgMasterTable'})
            if table is None:
                raise ValueError(f"No 'rgMasterTable' table found at {url}")

            headers = [header.text for header in table.find_all('th')]
            # The first <tr> is skipped, as in the original scrape logic.
            rows = [
                [cell.text for cell in row.find_all('td')]
                for row in table.find_all('tr')[1:]
            ]
            page_frames.append(pd.DataFrame(rows, columns=headers))

            # Add a delay between requests
            time.sleep(delay)

        # pd.concat raises on an empty list, so skip URLs that yielded no pages.
        if page_frames:
            per_url_frames.append(pd.concat(page_frames, ignore_index=True))

    if not per_url_frames:
        return pd.DataFrame()

    merged_df = pd.concat(per_url_frames, axis=1)
    # Remove duplicate columns (keep the first occurrence of each label)
    return merged_df.loc[:, ~merged_df.columns.duplicated()]

# Every leaderboard request uses the same query string; only the `stats`
# group ("bat" / "pit") and the `type` value differ between URLs.
LEADERBOARD_URL_TEMPLATE = (
    "https://www.fangraphs.com/leaders.aspx?pos=all&stats={stats}&lg=all&qual=0"
    "&type={stat_type}&season=2022&month=0&season1=2022&ind=0&team=0&rost=0&age=0"
    "&filter=&players=0&startdate=2022-01-01&enddate=2022-12-31"
)

# `type` values requested for each stats group, in the order their tables are
# merged left to right.
LEADERBOARD_STAT_TYPES = (8, 1, 2, 4)

batter_data_base_urls = [
    LEADERBOARD_URL_TEMPLATE.format(stats="bat", stat_type=stat_type)
    for stat_type in LEADERBOARD_STAT_TYPES
]

pitcher_data_base_urls = [
    LEADERBOARD_URL_TEMPLATE.format(stats="pit", stat_type=stat_type)
    for stat_type in LEADERBOARD_STAT_TYPES
]

# Number of result pages requested for every URL above.
num_pages = 30

# Scrape and clean the batter tables, then do the same for the pitcher tables.
batter_data = extract_table_data(batter_data_base_urls, num_pages)
batter_data_cleaned = clean_dataframe(batter_data)

pitcher_data = extract_table_data(pitcher_data_base_urls, num_pages)
pitcher_data_cleaned = clean_dataframe(pitcher_data)




In [45]:
# List the column labels of the cleaned batter table.
column_names = batter_data_cleaned.columns
print(column_names)

Index(['Name', 'Team', 'G', 'PA', 'HR', 'R', 'RBI', 'SB', 'BB%', 'K%', 'ISO',
       'BABIP', 'AVG', 'OBP', 'SLG', 'wOBA', 'xwOBA', 'wRC+', 'BsR', 'Off',
       'Def', 'WAR', 'BB/K', 'OPS', 'Spd', 'UBR', 'wGDP', 'wSB', 'wRC', 'wRAA',
       'GB/FB', 'LD%', 'GB%', 'FB%', 'IFFB%', 'HR/FB', 'IFH', 'IFH%', 'BUH',
       'BUH%', 'Pull%', 'Cent%', 'Oppo%', 'Soft%', 'Med%', 'Hard%', 'FBv',
       'SL%', 'SLv', 'CT%', 'CTv', 'CB%', 'CBv', 'CH%', 'CHv', 'SF%', 'SFv',
       'KN%', 'KNv', 'XX%'],
      dtype='object')


In [40]:
# Display the cleaned pitcher table as the cell's output.
pitcher_data_cleaned

Unnamed: 0,Name,Team,W,L,SV,G,GS,IP,K/9,BB/9,...,CTv,CB%,CBv,CH%,CHv,SF%,SFv,KN%,KNv,XX%
1,Aaron Nola,PHI,11,13,0,32,32,205.0,10.32,1.27,...,,31.2%,87.8,,,16.3%,96.5,,,1.0%
2,Carlos Rodon,SFG,14,8,0,31,31,178.0,11.98,2.63,...,,,,,,,,,,0.8%
3,Justin Verlander,HOU,18,4,0,28,28,175.0,9.51,1.49,...,95.4,,,,,,,,,
4,Sandy Alcantara,MIA,14,9,0,32,32,228.2,8.15,1.97,...,95.6,,,,,,,,,0.6%
5,Kevin Gausman,TOR,12,10,0,31,31,174.2,10.56,1.44,...,89.2,10.4%,80.8,0.6%,89.7,,,,,0.4%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1177,Joe Smith,MIN,1,1,0,34,0,27.1,5.60,2.96,...,,,,3.8%,79.0,,,42.3%,46.8,
1178,Ian Kennedy,ARI,4,7,10,57,0,50.1,7.87,3.93,...,,,,3.6%,81.0,,,96.4%,56.0,
1179,Elieser Hernandez,MIA,3,6,0,20,10,62.1,8.66,3.18,...,93.1,26.5%,84.8,,,,,,,0.4%
1180,Elvin Rodriguez,DET,0,4,0,7,5,29.2,7.58,4.55,...,87.6,14.2%,73.5,,,,,,,0.4%
