In [1]:
# For processing the data
import numpy as np
import pandas as pd

# Visualization tools
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.lines import Line2D
# Notebook display setup: matplotlib inline backend plus the seaborn "white" style.
%matplotlib inline
sns.set_style("white") # set style for seaborn plots

# Machine learning
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, MinMaxScaler, LabelBinarizer, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.feature_selection import VarianceThreshold, RFE, SelectKBest, chi2
from sklearn.metrics import mean_squared_error, accuracy_score, make_scorer, log_loss
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (BaggingClassifier, ExtraTreesClassifier, 
                              GradientBoostingClassifier, VotingClassifier, 
                              RandomForestClassifier, AdaBoostClassifier)
import xgboost as xgb

# Ignore warnings
# Note: no category or module is given, so this filter applies to every warning
# raised after this point, including those emitted by the libraries imported above.
import warnings 
warnings.filterwarnings('ignore')

In [2]:
# Load the raw shot log. read_csv already returns a DataFrame, so the result is
# used directly instead of being wrapped in pd.DataFrame a second time.
data = pd.read_csv("data.csv")
data.head()

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,...,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id
0,Jump Shot,Jump Shot,10,20000012,33.9723,167,72,-118.1028,10,1,...,2PT Field Goal,Right Side(R),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,1
1,Jump Shot,Jump Shot,12,20000012,34.0443,-157,0,-118.4268,10,1,...,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,2
2,Jump Shot,Jump Shot,35,20000012,33.9093,-101,135,-118.3708,7,1,...,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,3
3,Jump Shot,Jump Shot,43,20000012,33.8693,138,175,-118.1318,6,1,...,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,4
4,Driving Dunk Shot,Dunk,155,20000012,34.0443,0,0,-118.2698,6,2,...,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,5


In [3]:
# Columns removed before modelling (coordinates, team/game identifiers, date).
irrelevant = [
    "lon", "lat",
    "team_id", "team_name",
    "game_id", "game_event_id",
    "game_date",
]
data = data.drop(irrelevant, axis=1)
data

Unnamed: 0,action_type,combined_shot_type,loc_x,loc_y,minutes_remaining,period,playoffs,season,seconds_remaining,shot_distance,shot_made_flag,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,matchup,opponent,shot_id
0,Jump Shot,Jump Shot,167,72,10,1,0,2000-01,27,18,,2PT Field Goal,Right Side(R),Mid-Range,16-24 ft.,LAL @ POR,POR,1
1,Jump Shot,Jump Shot,-157,0,10,1,0,2000-01,22,15,0.0,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,LAL @ POR,POR,2
2,Jump Shot,Jump Shot,-101,135,7,1,0,2000-01,45,16,1.0,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,LAL @ POR,POR,3
3,Jump Shot,Jump Shot,138,175,6,1,0,2000-01,52,22,0.0,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,LAL @ POR,POR,4
4,Driving Dunk Shot,Dunk,0,0,6,2,0,2000-01,19,0,1.0,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,LAL @ POR,POR,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30692,Jump Shot,Jump Shot,1,48,6,4,1,1999-00,5,4,0.0,2PT Field Goal,Center(C),In The Paint (Non-RA),Less Than 8 ft.,LAL vs. IND,IND,30693
30693,Tip Shot,Tip Shot,0,0,6,4,1,1999-00,5,0,,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,LAL vs. IND,IND,30694
30694,Running Jump Shot,Jump Shot,-134,166,3,4,1,1999-00,28,21,1.0,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,LAL vs. IND,IND,30695
30695,Jump Shot,Jump Shot,31,267,2,4,1,1999-00,10,26,0.0,3PT Field Goal,Center(C),Above the Break 3,24+ ft.,LAL vs. IND,IND,30696


In [4]:
# Work on an independent copy: the following cells overwrite columns of `tr` and
# drop rows from it, and a plain `tr = data` would make every one of those changes
# show up in `data` as well because both names would refer to the same object.
tr = data.copy()

In [5]:
# Target vector: the shot outcome column restricted to rows where it is present.
tr_labels = data["shot_made_flag"].dropna()
tr_labels

1        0.0
2        1.0
3        0.0
4        1.0
5        0.0
        ... 
30691    0.0
30692    0.0
30694    1.0
30695    0.0
30696    0.0
Name: shot_made_flag, Length: 25697, dtype: float64

In [6]:
# Integer-typed columns only (the float-typed shot_made_flag is therefore not selected).
tr_num = tr.select_dtypes(include=["int64"])
tr_num

Unnamed: 0,loc_x,loc_y,minutes_remaining,period,playoffs,seconds_remaining,shot_distance,shot_id
0,167,72,10,1,0,27,18,1
1,-157,0,10,1,0,22,15,2
2,-101,135,7,1,0,45,16,3
3,138,175,6,1,0,52,22,4
4,0,0,6,2,0,19,0,5
...,...,...,...,...,...,...,...,...
30692,1,48,6,4,1,5,4,30693
30693,0,0,6,4,1,5,0,30694
30694,-134,166,3,4,1,28,21,30695
30695,31,267,2,4,1,10,26,30696


In [7]:
# String-valued (object dtype) columns; these are the ones label-encoded further down.
tr_cat = tr.select_dtypes(include=["object"])
tr_cat

Unnamed: 0,action_type,combined_shot_type,season,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,matchup,opponent
0,Jump Shot,Jump Shot,2000-01,2PT Field Goal,Right Side(R),Mid-Range,16-24 ft.,LAL @ POR,POR
1,Jump Shot,Jump Shot,2000-01,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,LAL @ POR,POR
2,Jump Shot,Jump Shot,2000-01,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,LAL @ POR,POR
3,Jump Shot,Jump Shot,2000-01,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,LAL @ POR,POR
4,Driving Dunk Shot,Dunk,2000-01,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,LAL @ POR,POR
...,...,...,...,...,...,...,...,...,...
30692,Jump Shot,Jump Shot,1999-00,2PT Field Goal,Center(C),In The Paint (Non-RA),Less Than 8 ft.,LAL vs. IND,IND
30693,Tip Shot,Tip Shot,1999-00,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,LAL vs. IND,IND
30694,Running Jump Shot,Jump Shot,1999-00,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,LAL vs. IND,IND
30695,Jump Shot,Jump Shot,1999-00,3PT Field Goal,Center(C),Above the Break 3,24+ ft.,LAL vs. IND,IND


In [8]:
# One shared encoder instance; the encoding cell below refits it for each column in turn.
le = LabelEncoder()

In [9]:
# Turning string data into numbers
# Every column is integer-coded on its own. The single encoder `le` is refit on each
# pass, so once the loop finishes it only remembers the mapping of the last column.
categorical_columns = [
    "action_type",
    "season",
    "combined_shot_type",
    "shot_type",
    "shot_zone_area",
    "shot_zone_basic",
    "shot_zone_range",
    "matchup",
    "opponent",
]
for column in categorical_columns:
    tr[column] = le.fit_transform(tr[column])

In [10]:
# Preview the first rows of `tr` after the label-encoding step.
tr.head()

Unnamed: 0,action_type,combined_shot_type,loc_x,loc_y,minutes_remaining,period,playoffs,season,seconds_remaining,shot_distance,shot_made_flag,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,matchup,opponent,shot_id
0,26,3,167,72,10,1,0,4,27,18,,0,5,4,0,28,25,1
1,26,3,-157,0,10,1,0,4,22,15,0.0,0,3,4,2,28,25,2
2,26,3,-101,135,7,1,0,4,45,16,1.0,0,2,4,0,28,25,3
3,26,3,138,175,6,1,0,4,52,22,0.0,0,4,4,0,28,25,4
4,5,1,0,0,6,2,0,4,19,0,1.0,0,1,5,4,28,25,5


In [11]:
# Both splits start from `tr`. The training side gets its own copy so that row
# filtering applied to the training split can never alter `tr` or the test split
# that is derived from it. `tr_test` is only read and re-bound, so a plain
# reference is sufficient there.
tr_test = tr
tr_train = tr.copy()

In [12]:
# Rows without a label form the prediction set; labelled rows form the training set.
tr_test = tr_test[tr_test["shot_made_flag"].isnull()]
# Rebind rather than drop in place: `tr_train` can be the very same object as `tr`,
# and an in-place drop would strip the unlabelled rows from that shared frame,
# leaving the test filter above with nothing to select when the cells are re-run.
tr_train = tr_train.dropna(subset=['shot_made_flag'])

# Feature matrix for training: every remaining column except the target.
X_train = tr_train.drop('shot_made_flag', axis=1)

In [13]:
# Feature matrix for the unlabelled rows: same columns as training, target removed.
X_test = tr_test.drop(columns=['shot_made_flag'])
X_test

Unnamed: 0,action_type,combined_shot_type,loc_x,loc_y,minutes_remaining,period,playoffs,season,seconds_remaining,shot_distance,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,matchup,opponent,shot_id
0,26,3,167,72,10,1,0,4,27,18,0,5,4,0,28,25,1
7,26,3,1,28,8,3,0,4,5,2,0,1,5,4,28,25,8
16,12,4,0,0,0,1,0,4,1,0,0,1,5,4,71,30,17
19,12,4,0,0,10,3,0,4,46,0,0,1,5,4,71,30,20
32,26,3,163,76,11,1,0,4,26,17,0,5,4,0,35,31,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30668,26,3,-23,222,7,4,1,3,27,22,0,1,4,0,12,11,30669
30680,50,5,0,0,0,2,1,3,40,0,0,1,5,4,49,11,30681
30682,41,3,-68,48,11,3,1,3,30,8,0,3,2,2,49,11,30683
30686,41,3,16,93,5,3,1,3,37,9,0,1,2,2,49,11,30687


In [14]:
# Depth-1 tree (a single split) using the entropy criterion; reused below as the
# base learner of the ensembles.
Dtree = DecisionTreeClassifier(
    max_depth=1,
    criterion='entropy',
    random_state=1,
)

In [15]:
# Bagging ensemble of 1000 depth-1 trees: rows are bootstrapped, features are not.
# The base learner is passed positionally because the keyword for this argument
# differs between scikit-learn releases (`base_estimator` in older ones,
# `estimator` in newer ones), while its position as first argument is the same.
bagclassifier = BaggingClassifier(Dtree,
                                 n_estimators=1000,
                                 max_samples=1.0,
                                 max_features=1.0,
                                 bootstrap=True,
                                 bootstrap_features=False,
                                 n_jobs=-1, random_state=1)

In [16]:
# Hold out 20% of the labelled rows for evaluation; the seed keeps the split repeatable.
(train_attributes, test_attributes,
 train_target, test_target) = train_test_split(
    X_train,
    tr_labels,
    test_size=0.20,
    random_state=1,
)

In [17]:
# Baseline: fit the single depth-1 tree and score it on the hold-out split.
tree_prediction = Dtree.fit(train_attributes, train_target).predict(test_attributes)
tree_acc = accuracy_score(y_true=test_target, y_pred=tree_prediction)
print('Test accuracy of the decision tree is', tree_acc)

Test accuracy of the decision tree is 0.627431906614786


In [18]:
# Fit the row-bootstrapping ensemble and score it on the hold-out split.
bagging_prediction = bagclassifier.fit(train_attributes, train_target).predict(test_attributes)
bagging_acc = accuracy_score(y_true=test_target, y_pred=bagging_prediction)
print('Test accuracy of the bagclassifier is', bagging_acc)

Test accuracy of the bagclassifier is 0.627431906614786


In [19]:
# Second bagging variant: rows are NOT bootstrapped, features are.
# The base learner is passed positionally because the keyword for this argument
# differs between scikit-learn releases (`base_estimator` in older ones,
# `estimator` in newer ones), while its position as first argument is the same.
bagclassifier_2 = BaggingClassifier(Dtree,
                                 n_estimators=1000,
                                 max_samples=1.0,
                                 max_features=1.0,
                                 bootstrap=False,
                                 bootstrap_features=True,
                                 n_jobs=-1, random_state=1)

In [20]:
# Fit the feature-bootstrapping ensemble and score it on the hold-out split.
bagclassifier_2.fit(train_attributes, train_target)
bagclassifier2_prediction = bagclassifier_2.predict(test_attributes)
# Kept under its own name so the first ensemble's `bagging_acc` is not overwritten,
# and reported with the model's own name so the two printed scores can be told apart.
bagging2_acc = accuracy_score(test_target, bagclassifier2_prediction)
print('Test accuracy of the bagclassifier_2 is', bagging2_acc)

Test accuracy of the bagclassifier is 0.6457198443579767


In [21]:
# AdaBoost over 1000 depth-1 trees.
# The base learner is passed positionally because the keyword for this argument
# differs between scikit-learn releases (`base_estimator` in older ones,
# `estimator` in newer ones), while its position as first argument is the same.
adaboost = AdaBoostClassifier(Dtree, n_estimators=1000, random_state=1)

In [22]:
# Fit the boosted ensemble and score it on the hold-out split.
adaboost_prediction = adaboost.fit(train_attributes, train_target).predict(test_attributes)
adaboost_acc = accuracy_score(y_true=test_target, y_pred=adaboost_prediction)
print('Test accuracy of adaboost is', adaboost_acc)

Test accuracy of adaboost is 0.6815175097276265


In [31]:
# Score the unlabelled rows with the row-bootstrapping bagging ensemble.
model = bagclassifier
# Keep the second column of predict_proba as the submitted probability.
pred = model.predict_proba(X_test)[:, 1]

# One row per unlabelled shot: its id next to the predicted probability.
sub_data = pd.DataFrame(
    data={'shot_id': X_test['shot_id'], 'shot_made_flag': pred},
)

In [32]:
# Preview the first rows of the submission frame.
sub_data.head()

Unnamed: 0,shot_id,shot_made_flag
0,1,0.394647
7,8,0.394647
16,17,0.723491
19,20,0.723491
32,33,0.394647


In [33]:
# `submission` is a second name for the same DataFrame object as `sub_data`; no copy is made.
submission = sub_data
submission

Unnamed: 0,shot_id,shot_made_flag
0,1,0.394647
7,8,0.394647
16,17,0.723491
19,20,0.723491
32,33,0.394647
...,...,...
30668,30669,0.394647
30680,30681,0.394647
30682,30683,0.394647
30686,30687,0.394647


In [34]:
# Write the submission file; index=False leaves the DataFrame index out of the CSV.
submission.to_csv('submission_bagclassifier.csv', index=False)

In [None]:
class LabelEncoderExt(object):
    """LabelEncoder wrapper that maps labels not seen during ``fit`` to 'Unknown'.

    ``fit`` always appends the sentinel label 'Unknown' to the training labels, so
    ``transform`` can replace any unseen value by that sentinel instead of failing.
    """

    def __init__(self):
        """
        It differs from LabelEncoder by handling new classes and providing a value for it [Unknown]
        Unknown will be added in fit and transform will take care of new item. It gives unknown class id
        """
        self.label_encoder = LabelEncoder()

    def fit(self, data_list):
        """
        This will fit the encoder for all the unique values and introduce unknown value
        :param data_list: A list of string
        :return: self
        """
        self.label_encoder = self.label_encoder.fit(list(data_list) + ['Unknown'])
        # Exposed only after fitting, mirroring the wrapped encoder's attribute.
        self.classes_ = self.label_encoder.classes_

        return self

    def transform(self, data_list):
        """
        This will transform the data_list to id list where the new values get assigned to Unknown class
        :param data_list: An iterable of labels (may contain values not seen in fit)
        :return: The wrapped encoder's codes, with unseen values coded as 'Unknown'
        """
        # A set gives constant-time membership checks and needs no sorting of the
        # input, so mixed-type input (e.g. strings together with None) is handled
        # in a single pass instead of one list rebuild per unseen value.
        known_labels = set(self.label_encoder.classes_)
        new_data_list = [x if x in known_labels else 'Unknown' for x in data_list]

        return self.label_encoder.transform(new_data_list)