In [19]:
# For processing the data
import numpy as np
import pandas as pd

# Visualization tools
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.lines import Line2D
%matplotlib inline
sns.set_style("white") # set style for seaborn plots

# Machine learning
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import MinMaxScaler, LabelBinarizer, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.feature_selection import VarianceThreshold, RFE, SelectKBest, chi2
from sklearn.metrics import accuracy_score, make_scorer, log_loss
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (BaggingClassifier, ExtraTreesClassifier, 
                              GradientBoostingClassifier, VotingClassifier, 
                              RandomForestClassifier, AdaBoostClassifier)

# Ignore warnings
import warnings 
warnings.filterwarnings('ignore')

In [2]:
# Load the training shots. read_csv already returns a DataFrame, so the
# result is used directly instead of being wrapped in pd.DataFrame again.
training_data = pd.read_csv("training_set.csv")
# Drop the first column by position.
# NOTE(review): presumably a row counter saved with the CSV — confirm against the file.
training_data = training_data.drop(columns=training_data.columns[0])
training_data.head()

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,...,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id
0,Jump Shot,Jump Shot,12,20000012,34.0443,-157,0,-118.4268,10,1,...,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,2
1,Jump Shot,Jump Shot,35,20000012,33.9093,-101,135,-118.3708,7,1,...,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,3
2,Jump Shot,Jump Shot,43,20000012,33.8693,138,175,-118.1318,6,1,...,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,4
3,Driving Dunk Shot,Dunk,155,20000012,34.0443,0,0,-118.2698,6,2,...,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,5
4,Jump Shot,Jump Shot,244,20000012,34.0553,-145,-11,-118.4148,9,3,...,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,6


In [3]:
# Columns excluded from modelling; the same list is reused when the test set is prepared.
irrelevant = ["lon", "lat", "team_id", "team_name", "game_id", "game_event_id"]
# Remove those columns and key every remaining row by its shot_id.
training_data = training_data.drop(irrelevant, axis=1).set_index("shot_id")
training_data

Unnamed: 0_level_0,action_type,combined_shot_type,loc_x,loc_y,minutes_remaining,period,playoffs,season,seconds_remaining,shot_distance,shot_made_flag,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,game_date,matchup,opponent
shot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2,Jump Shot,Jump Shot,-157,0,10,1,0,2000-01,22,15,0.0,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,2000-10-31,LAL @ POR,POR
3,Jump Shot,Jump Shot,-101,135,7,1,0,2000-01,45,16,1.0,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,2000-10-31,LAL @ POR,POR
4,Jump Shot,Jump Shot,138,175,6,1,0,2000-01,52,22,0.0,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,2000-10-31,LAL @ POR,POR
5,Driving Dunk Shot,Dunk,0,0,6,2,0,2000-01,19,0,1.0,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,2000-10-31,LAL @ POR,POR
6,Jump Shot,Jump Shot,-145,-11,9,3,0,2000-01,32,14,0.0,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,2000-10-31,LAL @ POR,POR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30692,Driving Layup Shot,Layup,0,0,7,4,1,1999-00,4,0,0.0,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,2000-06-19,LAL vs. IND,IND
30693,Jump Shot,Jump Shot,1,48,6,4,1,1999-00,5,4,0.0,2PT Field Goal,Center(C),In The Paint (Non-RA),Less Than 8 ft.,2000-06-19,LAL vs. IND,IND
30695,Running Jump Shot,Jump Shot,-134,166,3,4,1,1999-00,28,21,1.0,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,2000-06-19,LAL vs. IND,IND
30696,Jump Shot,Jump Shot,31,267,2,4,1,1999-00,10,26,0.0,3PT Field Goal,Center(C),Above the Break 3,24+ ft.,2000-06-19,LAL vs. IND,IND


In [4]:
# Target vector: an independent copy of the shot_made_flag column.
tr_labels = training_data["shot_made_flag"].copy()
# Feature frame: every column except the target.
tr = training_data.drop(columns="shot_made_flag")

In [5]:
# Categorical view of the features: only the object-dtype columns.
tr_cat = tr.select_dtypes(include=["object"])
tr_cat

Unnamed: 0_level_0,action_type,combined_shot_type,season,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,game_date,matchup,opponent
shot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2,Jump Shot,Jump Shot,2000-01,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,2000-10-31,LAL @ POR,POR
3,Jump Shot,Jump Shot,2000-01,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,2000-10-31,LAL @ POR,POR
4,Jump Shot,Jump Shot,2000-01,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,2000-10-31,LAL @ POR,POR
5,Driving Dunk Shot,Dunk,2000-01,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,2000-10-31,LAL @ POR,POR
6,Jump Shot,Jump Shot,2000-01,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,2000-10-31,LAL @ POR,POR
...,...,...,...,...,...,...,...,...,...,...
30692,Driving Layup Shot,Layup,1999-00,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,2000-06-19,LAL vs. IND,IND
30693,Jump Shot,Jump Shot,1999-00,2PT Field Goal,Center(C),In The Paint (Non-RA),Less Than 8 ft.,2000-06-19,LAL vs. IND,IND
30695,Running Jump Shot,Jump Shot,1999-00,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,2000-06-19,LAL vs. IND,IND
30696,Jump Shot,Jump Shot,1999-00,3PT Field Goal,Center(C),Above the Break 3,24+ ft.,2000-06-19,LAL vs. IND,IND


In [6]:
# Single LabelBinarizer instance; the encoding cell below calls fit_transform
# on it once per categorical column, so it is re-fitted on every call.
lbe = LabelBinarizer()

In [7]:
# Pull every categorical column out of tr_cat as its own Series.
(
    tr_action,
    tr_combinedshot,
    tr_season,
    tr_shottype,
    tr_shotarea,
    tr_shotbasic,
    tr_shotrange,
    tr_gamedate,
    tr_matchup,
    tr_opponent,
) = (
    tr_cat[column_name]
    for column_name in (
        "action_type",
        "combined_shot_type",
        "season",
        "shot_type",
        "shot_zone_area",
        "shot_zone_basic",
        "shot_zone_range",
        "game_date",
        "matchup",
        "opponent",
    )
)

In [8]:
# Binarize each categorical Series in turn with the shared `lbe` instance.
# The calls run in the listed order, so `lbe` ends up fitted on tr_opponent.
# NOTE(review): none of these arrays is referenced again in the cells shown;
# the model features come from full_pipeline instead — confirm they are needed.
(
    tr_action_1hot,
    tr_combinedshot_1hot,
    tr_season_1hot,
    tr_shottype_1hot,
    tr_shotarea_1hot,
    tr_shotbasic_1hot,
    tr_shotrange_1hot,
    tr_gamedate_1hot,
    tr_matchup_1hot,
    tr_opponent_1hot,
) = (
    lbe.fit_transform(series)
    for series in (
        tr_action,
        tr_combinedshot,
        tr_season,
        tr_shottype,
        tr_shotarea,
        tr_shotbasic,
        tr_shotrange,
        tr_gamedate,
        tr_matchup,
        tr_opponent,
    )
)

In [9]:
# Numeric view of the features: only the int64 columns.
tr_num = tr.select_dtypes(include=["int64"])
tr_num

Unnamed: 0_level_0,loc_x,loc_y,minutes_remaining,period,playoffs,seconds_remaining,shot_distance
shot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2,-157,0,10,1,0,22,15
3,-101,135,7,1,0,45,16
4,138,175,6,1,0,52,22
5,0,0,6,2,0,19,0
6,-145,-11,9,3,0,32,14
...,...,...,...,...,...,...,...
30692,0,0,7,4,1,4,0
30693,1,48,6,4,1,5,4
30695,-134,166,3,4,1,28,21
30696,31,267,2,4,1,10,26


In [10]:
from sklearn.base import BaseEstimator, TransformerMixin
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that pulls chosen columns out of a DataFrame as an array.

    Parameters
    ----------
    attribute_names
        Column label(s) used to index the incoming frame in ``transform``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Index ``X`` by ``attribute_names`` and return the result's ``.values``."""
        selected_columns = X[self.attribute_names]
        return selected_columns.values

In [11]:
from sklearn.pipeline import FeatureUnion

# Column labels of the numeric and categorical feature frames, in frame order.
num_attribs = tr_num.columns.tolist()
cat_attribs = tr_cat.columns.tolist()

In [12]:
# Numeric branch: select the numeric columns, then standardise them.
num_pipeline = Pipeline(
    steps=[
        ("selector", DataFrameSelector(num_attribs)),
        ("std_scaler", StandardScaler()),
    ]
)

In [13]:
# Categorical branch: select the categorical columns, then one-hot encode them
# into a dense array; handle_unknown='ignore' is set so categories not seen
# during fit do not raise at transform time.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`
# — confirm against the installed version before upgrading.
cat_pipeline = Pipeline(
    steps=[
        ("selector", DataFrameSelector(cat_attribs)),
        ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore", sparse=False)),
    ]
)

In [14]:
# Combine both branches into one feature matrix: numeric block first,
# one-hot block second.
full_pipeline = FeatureUnion(
    [
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ]
)

In [15]:
# Fit the preprocessing on the feature frame `tr` rather than on
# `training_data`, which still contains the shot_made_flag target. Both
# selectors only pick columns listed in num_attribs / cat_attribs (all taken
# from `tr`), so the resulting matrix is the same, but the target column is
# never handed to the pipeline.
tr_prepared = full_pipeline.fit_transform(tr)
tr_prepared

array([[-1.49129575, -1.03524606,  1.48105426, ...,  0.        ,
         0.        ,  0.        ],
       [-0.98253325,  0.49622757,  0.6120956 , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.18879244,  0.94999754,  0.32244271, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.28233972,  0.8478993 , -0.54651594, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.21669265,  1.99366846, -0.83616883, ...,  0.        ,
         0.        ,  0.        ],
       [-0.05585869, -0.21846012, -1.4154746 , ...,  0.        ,
         0.        ,  0.        ]])

In [16]:
# Display (rows, columns) of the prepared feature matrix.
tr_prepared.shape

(25697, 1773)

In [17]:
# Depth-1 decision tree split on entropy; also passed as the base estimator
# to the bagging and AdaBoost ensembles defined later.
Dtree = DecisionTreeClassifier(
    max_depth=1,
    criterion="entropy",
    random_state=1,
)

In [18]:
# Bagging over 1000 copies of Dtree: rows are bootstrapped (bootstrap=True),
# features are not (bootstrap_features=False); both fractions are 1.0.
bagclassifier = BaggingClassifier(
    base_estimator=Dtree,
    n_estimators=1000,
    bootstrap=True,
    bootstrap_features=False,
    max_samples=1.0,
    max_features=1.0,
    random_state=1,
    n_jobs=-1,
)

In [20]:
# Hold out 20% of the prepared rows for evaluation; random_state fixes the split.
(
    train_attributes,
    test_attributes,
    train_target,
    test_target,
) = train_test_split(
    tr_prepared,
    tr_labels,
    test_size=0.2,
    random_state=1,
)

In [21]:
# Fit the single tree on the training split and score it on the held-out split.
# fit() returns the estimator itself, so predict() can be chained onto it.
test_prediction = Dtree.fit(train_attributes, train_target).predict(test_attributes)
tree_acc = accuracy_score(y_true=test_target, y_pred=test_prediction)
print('Test accuracy of the decision tree is', tree_acc)

Test accuracy of the decision tree is 0.6571984435797665


In [22]:
# Fit the row-bootstrapped bagging ensemble and score it on the held-out split.
test_prediction = bagclassifier.fit(train_attributes, train_target).predict(test_attributes)
bagging_acc = accuracy_score(y_true=test_target, y_pred=test_prediction)
print('Test accuracy of the bagclassifier is', bagging_acc)

Test accuracy of the bagclassifier is 0.6571984435797665


In [23]:
# Second bagging variant: same 1000 Dtree copies, but features are
# bootstrapped (bootstrap_features=True) and rows are not (bootstrap=False).
bagclassifier_2 = BaggingClassifier(
    base_estimator=Dtree,
    n_estimators=1000,
    bootstrap=False,
    bootstrap_features=True,
    max_samples=1.0,
    max_features=1.0,
    random_state=1,
    n_jobs=-1,
)

In [25]:
# Fit the feature-bootstrapped bagging ensemble and score it on the held-out split.
# Note: this reuses the name bagging_acc, replacing the value stored for bagclassifier.
test_prediction = bagclassifier_2.fit(train_attributes, train_target).predict(test_attributes)
bagging_acc = accuracy_score(y_true=test_target, y_pred=test_prediction)
print('Test accuracy of the bagclassifier is', bagging_acc)

Test accuracy of the bagclassifier is 0.6571984435797665


In [26]:
# AdaBoost over up to 1000 boosting rounds, with Dtree as the weak learner.
adaboost = AdaBoostClassifier(
    base_estimator=Dtree,
    n_estimators=1000,
    random_state=1,
)

In [27]:
# Fit the boosted ensemble and score it on the held-out split.
test_prediction = adaboost.fit(train_attributes, train_target).predict(test_attributes)
adaboost_acc = accuracy_score(y_true=test_target, y_pred=test_prediction)
print('Test accuracy of adaboost is', adaboost_acc)

Test accuracy of adaboost is 0.6494163424124514


In [41]:
# Load the test shots. read_csv already returns a DataFrame, so no extra
# pd.DataFrame(...) wrapper is applied.
test_data = pd.read_csv("test_set.csv")
# Drop the first column by position (same treatment as the training file),
# then the columns listed in `irrelevant`.
test_data = test_data.drop(columns=test_data.columns[0]).drop(columns=irrelevant)
# One-column DataFrame of shot ids, kept for assembling the submission.
test_data_id = test_data[["shot_id"]]
# Feature frame for prediction: no target column, indexed by shot_id like the training features.
x_test = test_data.drop(columns="shot_made_flag").set_index("shot_id")
x_test.head()

Unnamed: 0_level_0,action_type,combined_shot_type,loc_x,loc_y,minutes_remaining,period,playoffs,season,seconds_remaining,shot_distance,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,game_date,matchup,opponent
shot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,Jump Shot,Jump Shot,167,72,10,1,0,2000-01,27,18,2PT Field Goal,Right Side(R),Mid-Range,16-24 ft.,2000-10-31,LAL @ POR,POR
8,Jump Shot,Jump Shot,1,28,8,3,0,2000-01,5,2,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,2000-10-31,LAL @ POR,POR
17,Driving Layup Shot,Layup,0,0,0,1,0,2000-01,1,0,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,2000-11-01,LAL vs. UTA,UTA
20,Driving Layup Shot,Layup,0,0,10,3,0,2000-01,46,0,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,2000-11-01,LAL vs. UTA,UTA
33,Jump Shot,Jump Shot,163,76,11,1,0,2000-01,26,17,2PT Field Goal,Right Side(R),Mid-Range,16-24 ft.,2000-11-04,LAL @ VAN,VAN


In [42]:
# Use the single decision tree for the final predictions. This is the same
# object that was fitted on the training split (train_attributes) above.
# NOTE(review): the submission objects below are named "...boosting" although
# the model chosen here is Dtree, not adaboost — confirm which was intended.
final_model = Dtree

In [43]:
# Apply the already-fitted preprocessing to the test features (transform only, no re-fit).
x_test_prepared = full_pipeline.transform(x_test)

In [44]:
# Predict a label for every prepared test row with the chosen model.
final_predictions = final_model.predict(x_test_prepared)

In [45]:
# Display the raw prediction array.
final_predictions

array([0., 0., 1., ..., 1., 1., 1.])

In [46]:
# Wrap the predictions in a one-column DataFrame (column label 0, default
# integer index). The name final_predictions is rebound to the DataFrame.
final_predictions = pd.DataFrame(data=final_predictions)
final_predictions

Unnamed: 0,0
0,0.0
1,0.0
2,1.0
3,1.0
4,0.0
...,...
4995,0.0
4996,1.0
4997,1.0
4998,1.0


In [48]:
# Pair each shot_id with its prediction by row position. concat(axis=1) aligns
# on index labels, so both frames are given a fresh 0..n-1 index first; this
# keeps the pairing correct even if test_data is ever filtered or re-indexed
# before this point. With the current default indexes the result is unchanged.
submission_file_boosting = pd.concat(
    [
        test_data_id.reset_index(drop=True),
        final_predictions.reset_index(drop=True),
    ],
    axis=1,
)
submission_file_boosting

Unnamed: 0,shot_id,0
0,1,0.0
1,8,0.0
2,17,1.0
3,20,1.0
4,33,0.0
...,...,...
4995,30669,0.0
4996,30681,1.0
4997,30683,1.0
4998,30687,1.0


In [49]:
# Rename the two columns in place: the id column keeps "shot_id" and the
# prediction column (label 0) becomes "shot_made_flag".
submission_file_boosting.columns = ["shot_id", "shot_made_flag"]

In [50]:
# Write only the shot_id and shot_made_flag columns. index=False keeps the
# DataFrame's row index from being written as an extra unnamed first column.
submission_file_boosting.to_csv('submission_file_boosting.csv', index=False)