# Machine Learning Algorithm - Multiple Linear Regression

###### Import all relevant libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

###### Load the Data-Set to Data-Frame

In [2]:
# Read the IPL auction CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='./DataSet/IPL IMB381IPL2013.csv')

###### Invoke head() & tail() function to verify if dataframe is loaded properly

In [3]:
# Preview the first five rows to confirm the file parsed as expected.
df.head(n=5)

Unnamed: 0,Sl.NO.,PLAYER NAME,AGE,COUNTRY,TEAM,PLAYING ROLE,T-RUNS,T-WKTS,ODI-RUNS-S,ODI-SR-B,...,SR-B,SIXERS,RUNS-C,WKTS,AVE-BL,ECON,SR-BL,AUCTION YEAR,BASE PRICE,SOLD PRICE
0,1,"Abdulla, YA",2,SA,KXIP,Allrounder,0,0,0,0.0,...,0.0,0,307,15,20.47,8.9,13.93,2009,50000,50000
1,2,Abdur Razzak,2,BAN,RCB,Bowler,214,18,657,71.41,...,0.0,0,29,0,0.0,14.5,0.0,2008,50000,50000
2,3,"Agarkar, AB",2,IND,KKR,Bowler,571,58,1269,80.62,...,121.01,5,1059,29,36.52,8.81,24.9,2008,200000,350000
3,4,"Ashwin, R",1,IND,CSK,Bowler,284,31,241,84.56,...,76.32,0,1125,49,22.96,6.23,22.14,2011,100000,850000
4,5,"Badrinath, S",2,IND,CSK,Batsman,63,0,79,45.93,...,120.71,28,0,0,0.0,0.0,0.0,2011,100000,800000


In [4]:
# Preview the last five rows to confirm the file was read through to the end.
df.tail(n=5)

Unnamed: 0,Sl.NO.,PLAYER NAME,AGE,COUNTRY,TEAM,PLAYING ROLE,T-RUNS,T-WKTS,ODI-RUNS-S,ODI-SR-B,...,SR-B,SIXERS,RUNS-C,WKTS,AVE-BL,ECON,SR-BL,AUCTION YEAR,BASE PRICE,SOLD PRICE
125,126,"Yadav, AS",2,IND,DC,Batsman,0,0,0,0.0,...,125.64,2,0,0,0.0,0.0,0.0,2010,50000,750000
126,127,Younis Khan,2,PAK,RR,Batsman,6398,7,6814,75.78,...,42.85,0,0,0,0.0,0.0,0.0,2008,225000,225000
127,128,Yuvraj Singh,2,IND,KXIP+,Batsman,1775,9,8051,87.58,...,131.88,67,569,23,24.74,7.02,21.13,2011,400000,1800000
128,129,Zaheer Khan,2,IND,MI+,Bowler,1114,288,790,73.55,...,91.67,1,1783,65,27.43,7.75,21.26,2008,200000,450000
129,130,"Zoysa, DNT",2,SL,DC,Bowler,288,64,343,95.81,...,122.22,0,99,2,49.5,9.0,33.0,2008,100000,110000


###### To print information about Data-Frame

In [5]:
# Print a schema summary: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 26 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Sl.NO.         130 non-null    int64  
 1   PLAYER NAME    130 non-null    object 
 2   AGE            130 non-null    int64  
 3   COUNTRY        130 non-null    object 
 4   TEAM           130 non-null    object 
 5   PLAYING ROLE   130 non-null    object 
 6   T-RUNS         130 non-null    int64  
 7   T-WKTS         130 non-null    int64  
 8   ODI-RUNS-S     130 non-null    int64  
 9   ODI-SR-B       130 non-null    float64
 10  ODI-WKTS       130 non-null    int64  
 11  ODI-SR-BL      130 non-null    float64
 12  CAPTAINCY EXP  130 non-null    int64  
 13  RUNS-S         130 non-null    int64  
 14  HS             130 non-null    int64  
 15  AVE            130 non-null    float64
 16  SR-B           130 non-null    float64
 17  SIXERS         130 non-null    int64  
 18  RUNS-C    

###### Display all columns

In [6]:
# Show every column label of the loaded frame as a pandas Index.
df.columns

Index(['Sl.NO.', 'PLAYER NAME', 'AGE', 'COUNTRY', 'TEAM', 'PLAYING ROLE',
       'T-RUNS', 'T-WKTS', 'ODI-RUNS-S', 'ODI-SR-B', 'ODI-WKTS', 'ODI-SR-BL',
       'CAPTAINCY EXP', 'RUNS-S', 'HS', 'AVE', 'SR-B', 'SIXERS', 'RUNS-C',
       'WKTS', 'AVE-BL', 'ECON', 'SR-BL', 'AUCTION YEAR', 'BASE PRICE',
       'SOLD PRICE'],
      dtype='object')

###### Defining X features for these columns

In [7]:
# Start from the full set of column labels; a later cell reassigns X_Features
# to the hand-picked list of predictor columns.
X_Features = df.columns
X_Features  # bare expression so the notebook displays the value

Index(['Sl.NO.', 'PLAYER NAME', 'AGE', 'COUNTRY', 'TEAM', 'PLAYING ROLE',
       'T-RUNS', 'T-WKTS', 'ODI-RUNS-S', 'ODI-SR-B', 'ODI-WKTS', 'ODI-SR-BL',
       'CAPTAINCY EXP', 'RUNS-S', 'HS', 'AVE', 'SR-B', 'SIXERS', 'RUNS-C',
       'WKTS', 'AVE-BL', 'ECON', 'SR-BL', 'AUCTION YEAR', 'BASE PRICE',
       'SOLD PRICE'],
      dtype='object')

###### Filtering the required columns for X_Features

In [8]:
# Predictor columns kept for modelling. Identifier, team, auction-year and
# price columns from the raw frame are intentionally left out.
X_Features = [
    # player attributes
    'AGE', 'COUNTRY', 'PLAYING ROLE',
    # Test and ODI career statistics
    'T-RUNS', 'T-WKTS',
    'ODI-RUNS-S', 'ODI-SR-B', 'ODI-WKTS', 'ODI-SR-BL',
    'CAPTAINCY EXP',
    # batting statistics
    'RUNS-S', 'HS', 'AVE', 'SR-B', 'SIXERS',
    # bowling statistics
    'RUNS-C', 'WKTS', 'AVE-BL', 'ECON', 'SR-BL',
]
len(X_Features)  # number of predictor columns selected

20

###### To check all possible roles defined in 'PLAYING ROLE'

In [9]:
# List the distinct values present in the 'PLAYING ROLE' column.
pd.unique(df['PLAYING ROLE'])

array(['Allrounder', 'Bowler', 'Batsman', 'W. Keeper'], dtype=object)

###### get_dummies(): Convert categorical variable into dummy/indicator variables.

In [10]:
# Regression needs numeric inputs, so a categorical column is expanded into
# one indicator (dummy) column per category. Preview the first five rows.
pd.get_dummies(df['PLAYING ROLE']).head()

Unnamed: 0,Allrounder,Batsman,Bowler,W. Keeper
0,1,0,0,0
1,0,0,1,0
2,0,0,1,0
3,0,0,1,0
4,0,1,0,0


###### Now considering all Categorical Features and making dummy variables 

In [11]:
# Columns to treat as categorical. AGE and CAPTAINCY EXP are stored as integers
# in the frame but are encoded as categories here.
cat_features = ['AGE', 'COUNTRY', 'PLAYING ROLE', 'CAPTAINCY EXP']

# One-hot encode the categorical predictors and keep the remaining columns as-is.
# drop_first=True omits the first level of each category so the indicators are
# not perfectly collinear with the intercept added later.
# dtype=int keeps the indicators numeric on every pandas version: pandas >= 2.0
# otherwise emits bool columns, which turn the design matrix into object dtype
# and make statsmodels' OLS raise "Pandas data cast to numpy dtype of object".
encoded_df = pd.get_dummies(
    df[X_Features],
    columns=cat_features,
    drop_first=True,
    dtype=int,
)
encoded_df.columns  # display the resulting feature names

Index(['T-RUNS', 'T-WKTS', 'ODI-RUNS-S', 'ODI-SR-B', 'ODI-WKTS', 'ODI-SR-BL',
       'RUNS-S', 'HS', 'AVE', 'SR-B', 'SIXERS', 'RUNS-C', 'WKTS', 'AVE-BL',
       'ECON', 'SR-BL', 'AGE_2', 'AGE_3', 'COUNTRY_BAN', 'COUNTRY_ENG',
       'COUNTRY_IND', 'COUNTRY_NZ', 'COUNTRY_PAK', 'COUNTRY_SA', 'COUNTRY_SL',
       'COUNTRY_WI', 'COUNTRY_ZIM', 'PLAYING ROLE_Batsman',
       'PLAYING ROLE_Bowler', 'PLAYING ROLE_W. Keeper', 'CAPTAINCY EXP_1'],
      dtype='object')

###### Now looking at length of encoded features

In [12]:
# Number of feature columns after encoding (second entry of the frame's shape).
encoded_df.shape[1]

31

###### Now we identify the dependent variable (Y) and the independent variables (X), then split them into train and test sets

In [13]:
# Response (dependent) variable: the price each player was sold for.
Y = df['SOLD PRICE']

# Predictors (independent variables): the encoded features plus an explicit
# intercept column, since statsmodels' OLS does not add one on its own.
X = sm.add_constant(encoded_df)

# Split into train and test sets: 80% of the rows go to training, and the
# fixed seed makes the split reproducible. The call returns the train/test
# parts of X followed by the train/test parts of Y.
train_X, test_X, train_Y, test_Y = train_test_split(
    X,
    Y,
    train_size=0.8,
    random_state=42,
)

  x = pd.concat(x[::order], 1)


###### Now we build the Model

In [14]:
# Fit ordinary least squares on the training split only
# (endog = response, exog = predictors including the constant).
ipl_model = sm.OLS(endog=train_Y, exog=train_X).fit()

# Display the full summary: fit statistics, coefficient table and diagnostics.
ipl_model.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,0.362
Dependent Variable:,SOLD PRICE,AIC:,2965.2841
Date:,2022-03-18 21:17,BIC:,3049.9046
No. Observations:,104,Log-Likelihood:,-1450.6
Df Model:,31,F-statistic:,2.883
Df Residuals:,72,Prob (F-statistic):,0.000114
R-squared:,0.554,Scale:,110340000000.0

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
const,375827.1991,228849.9306,1.6422,0.1049,-80376.7996,832031.1978
T-RUNS,-53.7890,32.7172,-1.6441,0.1045,-119.0096,11.4316
T-WKTS,-132.5967,609.7525,-0.2175,0.8285,-1348.1162,1082.9228
ODI-RUNS-S,57.9600,31.5071,1.8396,0.0700,-4.8482,120.7681
ODI-SR-B,-524.1450,1576.6368,-0.3324,0.7405,-3667.1130,2618.8231
ODI-WKTS,815.3944,832.3883,0.9796,0.3306,-843.9413,2474.7301
ODI-SR-BL,-773.3092,1536.3334,-0.5033,0.6163,-3835.9338,2289.3154
RUNS-S,114.7205,173.3088,0.6619,0.5101,-230.7643,460.2054
HS,-5516.3354,2586.3277,-2.1329,0.0363,-10672.0855,-360.5853

0,1,2,3
Omnibus:,0.891,Durbin-Watson:,2.244
Prob(Omnibus):,0.64,Jarque-Bera (JB):,0.638
Skew:,0.19,Prob(JB):,0.727
Kurtosis:,3.059,Condition No.:,84116.0
