In [None]:
import pandas as pd 
import numpy as np

In [None]:
# Read the IPL records from the local CSV file into a DataFrame.

df = pd.read_csv(filepath_or_buffer="ipl.csv")

## Exploring the dataset

In [None]:
# Show the column labels of the freshly loaded frame.
df.columns

In [None]:
# Preview the first rows (DataFrame.head shows five by default).
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Raise pandas' row display limit to 500 (a process-wide option that also
# affects every later cell), then show the frame as this cell's output.
pd.set_option("display.max_rows", 500)
df

## Data Cleaning

In [None]:
## Removing unwanted columns

# Columns that the visible part of this notebook never uses after this point.
columns_to_remove = ['mid', 'venue', 'batsman', 'bowler', 'striker', 'non-striker']

print("Before removing unwanted columns: {}".format(df.shape))
# Rebind `df` to the result instead of dropping in place, and ignore labels
# that are already gone: re-running this cell is then a no-op rather than a
# KeyError, and the object displayed by earlier cells is not mutated.
df = df.drop(columns=columns_to_remove, errors='ignore')
print("After removing unwanted columns: {}".format(df.shape))


In [None]:
# Column labels that remain after the column drop.
df.columns

In [None]:
# Preview the first rows of the reduced frame.
df.head()

In [None]:
# Inspect the row index of df.
df.index

In [None]:
# Removing unwanted teams
# This cell only lists the distinct batting teams present in the data;
# the actual filtering happens in the cells that follow.

df['bat_team'].unique()

In [None]:
# Team names to keep; rows whose batting or bowling team is not listed here
# are filtered out further down.
teams_we_want = [
    'Kolkata Knight Riders',
    'Chennai Super Kings',
    'Rajasthan Royals',
    'Mumbai Indians',
    'Kings XI Punjab',
    'Royal Challengers Bangalore',
    'Delhi Daredevils',
    'Sunrisers Hyderabad',
]

In [None]:
print("Before removing teams : {}".format(df.shape))
# A row survives only when BOTH its batting and its bowling team are listed
# in teams_we_want.
batting_ok = df['bat_team'].isin(teams_we_want)
bowling_ok = df['bowl_team'].isin(teams_we_want)
df = df[batting_ok & bowling_ok]
print("After removing teams : {}".format(df.shape))


In [None]:
# Re-check the distinct batting teams after the team filter.
df['bat_team'].unique()

In [None]:
# Removing first 5 overs data in the matches:
# keep only the rows whose 'overs' value is 5.0 or more.

print("Before removing first five over data : {}".format(df.shape))
from_fifth_over = df['overs'].ge(5.0)
df = df.loc[from_fifth_over]
print("After removing first five over data : {}".format(df.shape))



In [None]:

# Converting the column 'date' from string into datetime object.
# pd.to_datetime parses the whole column at once and passes an already
# converted column through unchanged, so this cell can be re-run safely
# (datetime.strptime via .apply raised TypeError on a second run).
from datetime import datetime  # no longer used here; retained in case later cells rely on the name
# Look the column up by label rather than by position 0, so the printout
# stays correct even if the column order changes.
print("Before converting 'date' column from string to datetime object: {}".format(type(df['date'].iloc[0])))
# assign() returns a new frame, so nothing is written into the filtered slice
# produced by the earlier row filters (avoids SettingWithCopyWarning).
df = df.assign(date=pd.to_datetime(df['date'], format='%d-%m-%Y'))
print("After converting 'date' column from string to datetime object: {}".format(type(df['date'].iloc[0])))

In [None]:
%matplotlib inline

In [None]:
import matplotlib.pyplot as plt
import seaborn as sb 

# Correlation heatmap of the remaining columns.
# The team names and the date are dropped before calling corr().
df1 = df.drop(['bat_team','bowl_team','date'],axis=1)

corr_matrix = df1.corr()
top_corr_features = corr_matrix.index

plt.figure(figsize=(13,10))
# Plot the matrix computed above; the original re-selected the same columns
# from df and called corr() a second time, which gives the same values.
g = sb.heatmap(data=corr_matrix, annot=True, cmap='RdYlGn')
g.set_title("Correlation between numeric features");


## Data Preprocessing

In [None]:
# One-hot encode the two team columns; all other columns are carried over.
# The resulting labels are shown as the cell output.
encoded_df = pd.get_dummies(df, columns=['bat_team', 'bowl_team'])
encoded_df.columns

In [None]:
# Preview the first rows of the one-hot encoded frame.
encoded_df.head()

In [None]:
# Rearranging the columns into a fixed order:
# date, batting-team dummies, bowling-team dummies, the numeric match-state
# columns, and finally 'total'.

team_names = [
    'Chennai Super Kings', 'Delhi Daredevils', 'Kings XI Punjab',
    'Kolkata Knight Riders', 'Mumbai Indians', 'Rajasthan Royals',
    'Royal Challengers Bangalore', 'Sunrisers Hyderabad',
]
column_order = (
    ['date']
    + ['bat_team_' + team for team in team_names]
    + ['bowl_team_' + team for team in team_names]
    + ['overs', 'runs', 'wickets', 'runs_last_5', 'wickets_last_5', 'total']
)
encoded_df = encoded_df[column_order]

In [None]:
# Train-Test Splitting
# Split by date: rows dated 2016 or earlier form the training set,
# rows dated 2017 or later form the test set.

match_year = encoded_df['date'].dt.year
train_rows = match_year <= 2016
test_rows = match_year >= 2017

# Features are every column except the target ('total') and 'date'.
# Dropping both by name before slicing replaces the original in-place drop
# with axis=True, which only worked because True == 1 and which modified
# boolean-filtered slices (SettingWithCopyWarning).
features = encoded_df.drop(columns=['total', 'date'])
X_train = features[train_rows]
X_test = features[test_rows]

y_train = encoded_df.loc[train_rows, 'total'].values
y_test = encoded_df.loc[test_rows, 'total'].values

# Features and targets are cut with the same masks, so they must line up.
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)

print("Training set: {} and Test set: {}".format(X_train.shape, X_test.shape))

## Model Building

In [None]:
# Linear Regression: fit an ordinary least-squares model on the training split.
# The fit call stays as the last expression so the fitted estimator is still
# shown as the cell output.

from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Predict 'total' for the test rows with the fitted linear model.
y_pred_lr = model.predict(X=X_test)

In [None]:
# accuracy_score is imported but not used in this cell; the import is left untouched.
from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse, accuracy_score

# Compute each error metric once, then report; RMSE is the square root of MSE.
lr_mae = mae(y_test, y_pred_lr)
lr_mse = mse(y_test, y_pred_lr)
lr_rmse = np.sqrt(lr_mse)

print("---- Linear Regression - Model Evaluation ----")
print("Mean Absolute Error (MAE): {}".format(lr_mae))
print("Mean Squared Error (MSE): {}".format(lr_mse))
print("Root Mean Squared Error (RMSE): {}".format(lr_rmse))