Data Preprocessing

In [2]:
from pathlib import Path
import pandas as pd
import os

# Absolute locations of the project's data folders.
raw_data_dir = Path("E:/Sri-Lanka-Cricket/data/raw")
processed_dir = Path("E:/Sri-Lanka-Cricket/data/processed")
extra_dir = Path("E:/Sri-Lanka-Cricket/data/extra")

# Load the raw ODI match-results table.
df = pd.read_csv(raw_data_dir / "sl_match_results_odi.csv")

# Report the size of the raw table.
print('Number of rows: ', df.shape[0])
print('Number of columns: ', df.shape[1])

# Jupyter only renders the value of a cell's *final* expression, so the
# preview must come last; placed mid-cell its result is silently discarded.
df.head()

Number of rows:  910
Number of columns:  9


In [7]:
# Set aside the matches without an outcome: abandoned ("aban") and
# no-result ("n/r") games are copied into their own CSV file.
df = pd.read_csv(raw_data_dir / "sl_match_results_odi.csv")

# Boolean mask: True for every row whose Result is "aban" or "n/r".
no_outcome_mask = df["Result"].isin(["aban", "n/r"])
matches = df[no_outcome_mask]

# Create the destination folder if it is missing, then write the subset.
extra_dir.mkdir(parents=True, exist_ok=True)
matches.to_csv(extra_dir / "abandoned.csv", index=False)

# Report how many rows/columns were set aside.
n_rows, n_cols = matches.shape
print('Number of rows: ', n_rows)
print('Number of columns: ', n_cols)

Number of rows:  58
Number of columns:  9


In [12]:
# Removing the abandoned and n/r (no result) matches

from pathlib import Path
import pandas as pd

# Absolute locations of the project's data folders.
raw_data_dir = Path("E:/Sri-Lanka-Cricket/data/raw")
processed_dir = Path("E:/Sri-Lanka-Cricket/data/processed")
extra_dir = Path("E:/Sri-Lanka-Cricket/data/extra")

# Reload the raw results so this cell does not depend on whatever `df`
# currently holds in the kernel.
df = pd.read_csv(raw_data_dir / "sl_match_results_odi.csv")

# Keep every row whose Result is neither "aban" nor "n/r".
df1 = df[~df["Result"].isin(["aban", "n/r"])].copy()

# Renumber the rows 0..n-1 so the removed rows leave no gaps in the index.
df2 = df1.reset_index(drop=True)

# The removed rows are read back from abandoned.csv, so the cell that writes
# that file must have been run before this one.
df3 = pd.read_csv(extra_dir / "abandoned.csv")

# Kept + removed rows must add up to the raw total; a mismatch means
# abandoned.csv is stale relative to the raw file.
assert len(df2) + len(df3) == len(df), (
    f"abandoned.csv is out of sync with the raw data: "
    f"{len(df2)} kept + {len(df3)} removed != {len(df)} total"
)

# to_csv does not create missing parent folders, so make sure the target
# folder exists before writing.
processed_dir.mkdir(parents=True, exist_ok=True)

# NOTE: despite its name, `output_dir` is the path of the output *file*.
output_dir = processed_dir / "preprocessed_1.csv"
df2.to_csv(output_dir, index = False)

print(f"Rows before removing aban/nr: {len(df)}")
print(f"Rows containing aban/nr: {len(df3)}")
print(f"Rows after removing nr/aban: {len(df2)}")
print()
print(f"Cleaned dataset saved to {output_dir}")

Rows before removing aban/nr: 910
Rows containing aban/nr: 58
Rows after removing nr/aban: 852

Cleaned dataset saved to E:\Sri-Lanka-Cricket\data\processed\preprocessed_1.csv


In [23]:
from pathlib import Path
import pandas as pd

# --- Paths ---
raw_data_dir  = Path("E:/Sri-Lanka-Cricket/data/raw")
processed_dir = Path("E:/Sri-Lanka-Cricket/data/processed")

# --- Load the preprocessed (cleaned) dataset ---
# This file already has aban/n-r matches removed
preprocessed_path = processed_dir / "preprocessed_1.csv"
df = pd.read_csv(preprocessed_path)

# Take the file name from the path so the message always names the file
# that was actually read.
print(f"Total matches in {preprocessed_path.name}: {len(df)}")

# --- Convert 'Start Date' to datetime ---
# The column is parsed as day-abbreviated month-two digit year ('%d-%b-%y').
# NOTE(review): two-digit years are resolved by the usual strptime pivot
# (69-99 -> 19xx, 00-68 -> 20xx) -- confirm the data never predates 1969.
df['Start Date'] = pd.to_datetime(df['Start Date'], format='%d-%b-%y')

# --- Define the split date ---
# An explicit format avoids relying on pandas' date-format inference.
split_date = pd.to_datetime('18-Mar-2015', format='%d-%b-%Y')

# --- Split the data ---
# 18-Mar-2015 = last match of Sangakkara & Mahela in ODIs → belongs to "before" era
df_before = df[df['Start Date'] <= split_date].copy()
df_after  = df[df['Start Date'] > split_date].copy()

# Reset index for clean row numbers
df_before = df_before.reset_index(drop=True)
df_after  = df_after.reset_index(drop=True)

# A row with a missing date satisfies neither comparison and would silently
# drop out of both halves; fail loudly instead.
assert len(df_before) + len(df_after) == len(df), (
    "Some rows fell into neither era -- check 'Start Date' for missing values"
)

# --- Save the two eras ---
# NOTE: 'Start Date' is a datetime column at this point, so it is written in
# ISO form (e.g. 2015-03-18) rather than in the original day-month-year text.
df_before.to_csv(processed_dir / "before_Sanga_rtr.csv", index=False)
df_after.to_csv(processed_dir / "after_Sanga_rtr.csv", index=False)

# --- Summary ---
print("\nSplit complete!")
print(f"→ Before & including 18-Mar-2015 (Sangakkara era) : {len(df_before)} matches")
print(f"→ After 18-Mar-2015 (post-retirement era)        : {len(df_after)} matches")
print(f"Total completed matches                          : {len(df_before) + len(df_after)}")

# --- Show the transition matches ---
print("\nLast match of Sangakkara era (18-Mar-2015):")
# Reuse split_date rather than repeating the date as a second literal.
last_match = df_before[df_before['Start Date'] == split_date]
print(last_match[['Start Date', 'Opposition', 'Ground', 'Result', 'Margin']].to_string(index=False))

print("\nFirst match of the new era (post-Sangakkara):")
if len(df_after) > 0:
    # Select by earliest date instead of by position, so the answer does not
    # depend on the CSV being stored in chronological order.
    first_new = df_after.loc[df_after['Start Date'].idxmin()]
    print(first_new[['Start Date', 'Opposition', 'Ground', 'Result', 'Margin']])
else:
    print("No matches found after 18-Mar-2015")

Total matches in preprocessed1.csv: 852

Split complete!
→ Before & including 18-Mar-2015 (Sangakkara era) : 730 matches
→ After 18-Mar-2015 (post-retirement era)        : 122 matches
Total completed matches                          : 852

Last match of Sangakkara era (18-Mar-2015):
Start Date     Opposition Ground Result    Margin
2015-03-18 v South Africa Sydney   lost 9 wickets

First match of the new era (post-Sangakkara):
Start Date    2015-07-11 00:00:00
Opposition             v Pakistan
Ground                   Dambulla
Result                       lost
Margin                  6 wickets
Name: 0, dtype: object


Before Sanga

In [None]:


# Data folders used by this section, all derived from one shared root.
data_root = Path("E:/Sri-Lanka-Cricket/data")
raw_data_dir, processed_dir, extra_dir = (
    data_root / folder for folder in ("raw", "processed", "extra")
)

