In [1]:
import requests
import json
import re
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

In [2]:
# Common URL prefix shared by every feed file requested below
BASE_URL = (
    'https://ipl-stats-sports-mechanic.s3.ap-south-1.amazonaws.com/ipl/feeds/'
)

In [3]:
# Match IDs whose squad feeds should be downloaded (one entry per match)
MATCH_IDS = [
    1799,
]

In [4]:
def _squad_feed_url(match_id):
    """Return the squad feed URL for a single match ID."""
    return f'{BASE_URL}{match_id}-squad.js'


# One squad feed URL per configured match ID, in the same order as MATCH_IDS
URLS = [_squad_feed_url(match_id) for match_id in MATCH_IDS]
URLS

['https://ipl-stats-sports-mechanic.s3.ap-south-1.amazonaws.com/ipl/feeds/1799-squad.js']

In [5]:
# Browser-like User-Agent string sent with every request
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
)

# HTTP headers passed to requests.get
HEADERS = {"User-Agent": _USER_AGENT}

In [6]:
# Feed columns that are removed from the combined squad table before saving
DROP_COLUMNS = [
    "PlayerShortName",
    "ClientPlayerID",
    "PlayingOrder",
    "Squad",
]

In [7]:
def fetch_and_process_data(url, season=2025, timeout=30):
    """Download one squad feed, combine both squads and save them as a CSV.

    The response body is expected to wrap a JSON object in a function call
    (``name({...})``); the text between the first ``(`` and the last ``)`` is
    decoded. If no such wrapper is present the whole body is decoded instead.
    Records under the ``squadA`` / ``squadB`` keys (whichever are present) are
    stacked into one DataFrame, columns listed in ``DROP_COLUMNS`` are
    removed, a ``Season`` column is added, and the result is written to
    ``<match_id>-Squad.csv`` in the current working directory.

    Failures are reported with ``print`` instead of being raised, so one bad
    URL does not abort a batch run.

    Args:
        url: Feed URL. Must contain ``<digits>-squad.js``; the digits become
            the match ID used in the output file name.
        season: Value written to the ``Season`` column. Defaults to 2025.
        timeout: Seconds to wait for the server before giving up. Defaults
            to 30.

    Returns:
        None.
    """
    try:
        # Resolve the match ID before touching the network so a malformed URL
        # is reported without issuing a request. The dot is escaped so that it
        # only matches a literal ".js" suffix.
        id_match = re.search(r'(\d+)-squad\.js', url)
        if id_match is None:
            raise ValueError("URL does not contain '<match_id>-squad.js'")
        match_id = id_match.group(1)

        # A timeout keeps a stalled connection from blocking the worker forever.
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()

        # Strip the surrounding function-call wrapper only when both
        # parentheses are really there; str.find returns -1 on a miss, which
        # would otherwise silently slice off the last character of the body.
        body = response.text
        open_paren = body.find('(')
        close_paren = body.rfind(')')
        if open_paren != -1 and close_paren > open_paren:
            json_data = body[open_paren + 1:close_paren]
        else:
            json_data = body
        data = json.loads(json_data)

        # One DataFrame per squad that is present in the payload.
        dfs = [
            pd.DataFrame(data[squad_key])
            for squad_key in ("squadA", "squadB")
            if squad_key in data
        ]
        if not dfs:
            # Nothing to save; say so instead of returning silently.
            print(f"No squad data found for {url}")
            return

        # Combine both squads
        combined_df = pd.concat(dfs, ignore_index=True)

        # Remove unwanted columns; errors="ignore" skips any that are absent.
        combined_df = combined_df.drop(columns=DROP_COLUMNS, errors="ignore")

        # Tag every row with the season it belongs to.
        combined_df["Season"] = season

        # Save the cleaned data
        combined_file = f"{match_id}-Squad.csv"
        combined_df.to_csv(combined_file, index=False)
        print(f"Saved Combined Squad File: {combined_file}")

    except requests.exceptions.RequestException as e:
        print(f"Request failed for {url}: {e}")
    except json.JSONDecodeError as e:
        print(f"Failed to parse JSON for {url}: {e}")
    except KeyError as e:
        print(f"KeyError: Missing expected key in the JSON data for {url}: {e}")
    except Exception as e:
        # Deliberate catch-all: keep the batch going and report the failure.
        print(f"An error occurred for {url}: {e}")

In [8]:
# Use ThreadPoolExecutor for parallel processing
with ThreadPoolExecutor(max_workers=10) as executor:
    # Drain the iterator returned by map(): an exception raised inside a
    # worker is only re-raised when its result is retrieved, so leaving the
    # iterator unconsumed would hide such failures.
    list(executor.map(fetch_and_process_data, URLS))

Saved Combined Squad File: 1799-Squad.csv


In [10]:
# Load every per-match CSV written by fetch_and_process_data, one per entry in
# MATCH_IDS, rather than a single hard-coded file name that can drift out of
# sync with the configured match list.
new_df = pd.concat(
    [pd.read_csv(f"{match_id}-Squad.csv") for match_id in MATCH_IDS],
    ignore_index=True,
)
new_df.shape

(32, 15)


In [11]:
# Read the existing squad table stored under Old_data/
old_squad_path = "Old_data/Squad.csv"
old_df = pd.read_csv(old_squad_path)
old_df.shape

(660, 15)

In [12]:
# Stack the new rows beneath the existing table and renumber the index
frames = [old_df, new_df]
df = pd.concat(frames, ignore_index=True)
df.shape

(692, 15)

In [13]:
# Write the merged squad table to disk without the index column
output_path = 'Output/Squad.csv'
df.to_csv(output_path, index=False)