## Importing the essential libraries over here

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Importing the dataset over here

In [2]:
data=pd.read_excel("Rohit_Sharma_Centuries.xlsx")

In [3]:
data.head()

Unnamed: 0,S.No.,Date,Score,Strike Rate,Type of Match,Position,Innings,Dismissed,Man of the Match,Captain,Against,Venue,H/A/N,Result
0,1,2010-05-28,114,95.79,ODI,4,1,Yes,No,No,Zimbabwe,"Queens Sports Club, Bulawayo",Away,Lost
1,2,2010-05-30,101,170.0,ODI,4,2,No,Yes,No,Sri Lanka,"Queens Sports Club, Bulawayo",Neutral,Won
2,3,2013-10-16,141,114.63,ODI,1,2,No,Yes,No,Australia,"Sawai Mansingh Stadium, Jaipur",Home,Won
3,4,2013-11-02,209,132.27,ODI,1,1,Yes,Yes,No,Australia,"M. Chinnaswamy Stadium, Bangalore",Home,Won
4,5,2014-11-13,264,152.6,ODI,2,1,Yes,Yes,No,Sri Lanka,"Eden Gardens, Kolkata",Home,Won


## Taking care of duplicate observations if present over here

In [4]:
data.duplicated().sum()

0

## Taking care of missing values if present over here

In [5]:
data.isnull().sum()

S.No.               0
Date                0
Score               0
Strike Rate         0
Type of Match       0
Position            0
Innings             0
Dismissed           0
Man of the Match    0
Captain             0
Against             0
Venue               0
H/A/N               0
Result              0
dtype: int64

## Filtering all the numerical features over here

In [6]:
numerical_features=[feature for feature in data.columns if data[feature].dtype!='O']
for feature in numerical_features:
  print(feature)

S.No.
Date
Score
Strike Rate
Position
Innings


In [7]:
data[numerical_features]

Unnamed: 0,S.No.,Date,Score,Strike Rate,Position,Innings
0,1,2010-05-28,114,95.79,4,1
1,2,2010-05-30,101,170.0,4,2
2,3,2013-10-16,141,114.63,1,2
3,4,2013-11-02,209,132.27,1,1
4,5,2014-11-13,264,152.6,2,1
5,6,2015-01-18,138,99.28,1,1
6,7,2015-03-19,137,108.73,1,1
7,8,2015-10-11,150,112.78,1,2
8,9,2016-01-12,171,104.9,1,1
9,10,2016-01-15,124,97.63,1,1


In [8]:
data['Date']=data['Date'].astype(str)

## Filtering all the categorical features over here

In [9]:
cat_features=[feature for feature in data.columns if data[feature].dtype=='O']
for feature in cat_features:
  print(feature)

Date
Type of Match
Dismissed 
Man of the Match
Captain
Against
Venue
H/A/N
Result


In [10]:
data[cat_features]

Unnamed: 0,Date,Type of Match,Dismissed,Man of the Match,Captain,Against,Venue,H/A/N,Result
0,2010-05-28,ODI,Yes,No,No,Zimbabwe,"Queens Sports Club, Bulawayo",Away,Lost
1,2010-05-30,ODI,No,Yes,No,Sri Lanka,"Queens Sports Club, Bulawayo",Neutral,Won
2,2013-10-16,ODI,No,Yes,No,Australia,"Sawai Mansingh Stadium, Jaipur",Home,Won
3,2013-11-02,ODI,Yes,Yes,No,Australia,"M. Chinnaswamy Stadium, Bangalore",Home,Won
4,2014-11-13,ODI,Yes,Yes,No,Sri Lanka,"Eden Gardens, Kolkata",Home,Won
5,2015-01-18,ODI,Yes,No,No,Australia,"Melbourne Cricket Ground, Melbourne",Away,Lost
6,2015-03-19,ODI,Yes,Yes,No,Bangladesh,"Melbourne Cricket Ground, Melbourne",Neutral,Won
7,2015-10-11,ODI,Yes,No,No,South Africa,"Green Park Stadium, Kanpur",Home,Lost
8,2016-01-12,ODI,No,No,No,Australia,"WACA Ground, Perth",Away,Lost
9,2016-01-15,ODI,Yes,Yes,No,Australia,"The Gabba, Brisbane",Away,Lost


## We will have to treat Date as a categorical features over here

## Encoding the categorical features over here

In [11]:
for feature in cat_features:
  feature_mapping={category:index for index,category in enumerate(data[feature].unique())}
  data[feature]=data[feature].map(feature_mapping)

In [12]:
data.head()

Unnamed: 0,S.No.,Date,Score,Strike Rate,Type of Match,Position,Innings,Dismissed,Man of the Match,Captain,Against,Venue,H/A/N,Result
0,1,0,114,95.79,0,4,1,0,0,0,0,0,0,0
1,2,1,101,170.0,0,4,2,1,1,0,1,0,1,1
2,3,2,141,114.63,0,1,2,1,1,0,2,1,2,1
3,4,3,209,132.27,0,1,1,0,1,0,2,2,2,1
4,5,4,264,152.6,0,2,1,0,1,0,1,3,2,1


## Creating the features and labels over here

In [13]:
data['SCORE']=data['Score']

In [14]:
data.drop(['Score'],axis=1,inplace=True)

In [15]:
X=data.iloc[:,:-1]
y=data.iloc[:,-1]

## Splitting the dataset into training set and testing set to avoid the problem of overfitting over here

In [16]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

## Training the model on the training dataset over here

In [17]:
# VERY IMPORTANT
# regressor.feature_importances_
# classifier.feature_importances_
# regressor.feature_importances_
# classifier.feature_importances_

In [18]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, GridSearchCV

# Assuming X and y are already defined
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initial model training to get feature importances
initial_regressor = AdaBoostRegressor()
initial_regressor.fit(X_train, y_train)
feature_importances = initial_regressor.feature_importances_

# Create a DataFrame for better visualization
feature_importances_df = pd.DataFrame({
    'feature': X_train.columns,
    'importance': feature_importances
})

# Sort the DataFrame by importance
feature_importances_df = feature_importances_df.sort_values(by='importance', ascending=False)

# Display the feature importances
print(feature_importances_df)

# Select the top 3 features
top_features = feature_importances_df.head(3)['feature']
print(f"Top 3 features: {list(top_features)}")

# Create new training and testing sets with only the top 3 features
X_train_top3 = X_train[top_features]
X_test_top3 = X_test[top_features]

# List of regressors to evaluate
regressors = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'AdaBoostRegressor': AdaBoostRegressor(),
    'XGBRegressor': XGBRegressor()
}

# Dictionary to store results
results = {}

# Evaluate each regressor
for name, regressor in regressors.items():
    regressor.fit(X_train_top3, y_train)
    y_pred = regressor.predict(X_test_top3)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = {'MSE': mse, 'R2': r2}
    print(f'{name} - Mean Squared Error: {mse}, R^2 Score: {r2}')

# Find the best model
best_model_name = max(results, key=lambda k: results[k]['R2'])
best_model = regressors[best_model_name]
print(f'Best model: {best_model_name}')

# Hyperparameter tuning for the best model
if best_model_name == 'RandomForestRegressor':
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
elif best_model_name == 'AdaBoostRegressor':
    param_grid = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1]
    }
elif best_model_name == 'XGBRegressor':
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 6, 9],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.8, 1]
    }
else:
    param_grid = {}

if param_grid:
    grid_search = GridSearchCV(estimator=best_model, param_grid=param_grid, cv=3, n_jobs=-1, scoring='r2')
    grid_search.fit(X_train_top3, y_train)
    best_model = grid_search.best_estimator_
    print(f'Best parameters for {best_model_name}: {grid_search.best_params_}')

# Final model training with best parameters
final_regressor = best_model
final_regressor.fit(X_train_top3, y_train)

# Evaluate the final model
y_pred = final_regressor.predict(X_test_top3)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Final Model - {best_model_name} - Mean Squared Error: {mse}, R^2 Score: {r2}')


             feature  importance
2        Strike Rate    0.243266
1               Date    0.140376
11             H/A/N    0.110996
10             Venue    0.100229
9            Against    0.095333
5            Innings    0.092293
7   Man of the Match    0.084434
0              S.No.    0.073872
4           Position    0.018285
6         Dismissed     0.015264
8            Captain    0.011840
12            Result    0.007021
3      Type of Match    0.006791
Top 3 features: ['Strike Rate', 'Date', 'H/A/N']
LinearRegression - Mean Squared Error: 1660.6195448750423, R^2 Score: 0.2661899218849929
Ridge - Mean Squared Error: 1673.6444435094982, R^2 Score: 0.2604343579968723
Lasso - Mean Squared Error: 1697.311313944309, R^2 Score: 0.2499762201915552
KNeighborsRegressor - Mean Squared Error: 1783.7039999999997, R^2 Score: 0.2118002129906631
DecisionTreeRegressor - Mean Squared Error: 3941.0, R^2 Score: -0.7414858971016476
RandomForestRegressor - Mean Squared Error: 1529.5024299999995, R^2 Sc