# Data Preprocessing
# Data Cleaning

In [6]:
import pandas as pd

matches_data = pd.read_csv("ipl_matches.csv")
batting_data = pd.read_csv("ipl_batting_data.csv")
bowling_data = pd.read_csv("ipl_bowling_data.csv")

# 1. **Handle Missing Data in Batting Data**
# Fill missing Runs, Balls, 4s, 6s, SR with 0 where applicable
batting_data['Runs'] = batting_data['Runs'].fillna(0)
batting_data['Balls'] = batting_data['Balls'].fillna(0)
batting_data['4s'] = batting_data['4s'].fillna(0)
batting_data['6s'] = batting_data['6s'].fillna(0)
batting_data['SR'] = batting_data['SR'].fillna(0)

# Fill missing Wicket_Taker with "Not applicable" for cases where player was not dismissed
batting_data['Wicket_Taker'] = batting_data['Wicket_Taker'].fillna('Not applicable')

# 2. **Handle Missing Data in Match Data**
# Fill missing Season values with a default season (e.g., 2008 for historical consistency)
matches_data['Season'] = matches_data['Season'].fillna(2008)

# Fill missing Winner and Margin values by either inferring or setting them as "Unknown"
matches_data['Winner'] = matches_data['Winner'].fillna('Unknown')
matches_data['Margin'] = matches_data['Margin'].fillna('Unknown')

# 3. **Convert Data Types**
# Convert Date columns to datetime type
batting_data['Date'] = pd.to_datetime(batting_data['Date'])
bowling_data['Date'] = pd.to_datetime(bowling_data['Date'])
matches_data['Date'] = pd.to_datetime(matches_data['Date'])

# Convert relevant columns to correct types
batting_data['MatchID'] = batting_data['MatchID'].astype(int)
bowling_data['MatchID'] = bowling_data['MatchID'].astype(int)
matches_data['ID'] = matches_data['ID'].astype(int)

# Verify the changes
print("Batting Data Info After Cleaning:")
print(batting_data.info())
print("\nMatch Data Info After Cleaning:")
print(matches_data.info())
print("\nBowling Data Info After Cleaning:")
print(bowling_data.info())

batting_data.to_csv('cleaned_batting_data.csv', index=False)
bowling_data.to_csv('cleaned_bowling_data.csv', index=False)
matches_data.to_csv('cleaned_match_data.csv', index=False)

Batting Data Info After Cleaning:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17130 entries, 0 to 17129
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Player        17130 non-null  object        
 1   Runs          17130 non-null  object        
 2   Balls         17130 non-null  object        
 3   4s            17130 non-null  object        
 4   6s            17130 non-null  object        
 5   SR            17130 non-null  object        
 6   Dismissal     17130 non-null  object        
 7   Wicket_Taker  17130 non-null  object        
 8   MatchID       17130 non-null  int32         
 9   Team          17130 non-null  object        
 10  Match         17130 non-null  object        
 11  Date          17130 non-null  datetime64[ns]
 12  Teams         17130 non-null  object        
dtypes: datetime64[ns](1), int32(1), object(11)
memory usage: 1.6+ MB
None

Match Data Info After Cleaning:

# Merging Data