In [0]:
import pandas as pd
import os

ipl_data = [
    [1, "2021-04-09", "Chennai", 1, 0.1, "Mumbai Indians", "RCB", "Rohit Sharma", "Washington Sundar", 0, 0, 0, "", ""],
    [1, "2021-04-09", "Chennai", 1, 0.2, "Mumbai Indians", "RCB", "Rohit Sharma", "Washington Sundar", 1, 0, 1, "", ""],
    [1, "2021-04-09", "Chennai", 1, 0.3, "Mumbai Indians", "RCB", "Chris Lynn", "Washington Sundar", 4, 0, 4, "", ""],
    [1, "2021-04-09", "Chennai", 1, 0.4, "Mumbai Indians", "RCB", "Chris Lynn", "Washington Sundar", 0, 1, 1, "run out", "Chris Lynn"],
    [1, "2021-04-09", "Chennai", 2, 0.1, "RCB", "Mumbai Indians", "Virat Kohli", "Trent Boult", 1, 0, 1, "", ""],
    [1, "2021-04-09", "Chennai", 2, 0.2, "RCB", "Mumbai Indians", "Virat Kohli", "Trent Boult", 0, 0, 0, "", ""]
]

columns = [
    'match_id', 'date', 'venue', 'innings', 'ball',
    'batting_team', 'bowling_team', 'batsman', 'bowler',
    'runs_batsman', 'runs_extras', 'runs_total',
    'dismissal_kind', 'player_dismissed'
]

df_sample = pd.DataFrame(ipl_data, columns=columns)

csv_path = "ipl_data.csv"
df_sample.to_csv(csv_path, index=False)
print(f" Sample IPL dataset saved to {csv_path}")

df_raw = pd.read_csv(csv_path)

expected_schema = {
    'match_id': int,
    'date': 'datetime64[ns]',
    'venue': str,
    'innings': int,
    'ball': float,
    'batting_team': str,
    'bowling_team': str,
    'batsman': str,
    'bowler': str,
    'runs_batsman': int,
    'runs_extras': int,
    'runs_total': int,
    'dismissal_kind': str,
    'player_dismissed': str
}

for col, dtype in expected_schema.items():
    if dtype == 'datetime64[ns]':
        df_raw[col] = pd.to_datetime(df_raw[col], errors='coerce')
    elif dtype == int:
        df_raw[col] = pd.to_numeric(df_raw[col], errors='coerce').astype('Int64')
    elif dtype == float:
        df_raw[col] = pd.to_numeric(df_raw[col], errors='coerce').astype('float64')
    else:
        df_raw[col] = df_raw[col].astype(str).str.strip()

required_columns = ['match_id', 'date', 'batsman', 'bowler', 'runs_total']
df_clean = df_raw.dropna(subset=required_columns)

df_clean = df_clean[
    (df_clean['innings'].between(1, 2)) &
    (df_clean['runs_total'] >= 0)
]

print("\n Cleaned IPL Data:")
print(df_clean.head())


 Sample IPL dataset saved to ipl_data.csv

 Cleaned IPL Data:
   match_id       date    venue  ...  runs_total  dismissal_kind player_dismissed
0         1 2021-04-09  Chennai  ...           0             nan              nan
1         1 2021-04-09  Chennai  ...           1             nan              nan
2         1 2021-04-09  Chennai  ...           4             nan              nan
3         1 2021-04-09  Chennai  ...           1         run out       Chris Lynn
4         1 2021-04-09  Chennai  ...           1             nan              nan

[5 rows x 14 columns]
