In [None]:
# For processing the data
import numpy as np
import pandas as pd

# Visualization tools
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.lines import Line2D
%matplotlib inline
sns.set_style("white") # set style for seaborn plots
import tensorflow as tf
# Machine learning
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.feature_selection import VarianceThreshold, RFE, SelectKBest, chi2
from sklearn.metrics import make_scorer, log_loss
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (BaggingClassifier, ExtraTreesClassifier, 
                              GradientBoostingClassifier, VotingClassifier, 
                              RandomForestClassifier, AdaBoostClassifier)

# Ignore warnings
import warnings 
warnings.filterwarnings('ignore')

In [None]:
# Load the raw data set from a CSV file in the current working directory.
df = pd.read_csv("data.csv")

In [None]:
# Use the shot identifier column as the row index.
df = df.set_index('shot_id')
df.head()

In [None]:
# Convert column dtypes: `period` becomes a plain object column, and the
# columns listed below become pandas categoricals.
vars_to_category = ["combined_shot_type", "game_event_id", "game_id", "playoffs",
                    "season", "shot_made_flag", "shot_type", "team_id"]

dtype_by_column = {"period": "object"}
dtype_by_column.update(dict.fromkeys(vars_to_category, "category"))
df = df.astype(dtype_by_column)

In [None]:
# Dimensions of the loaded frame as (rows, columns).
print(df.shape)

In [None]:
# Feature engineering is done on a copy so `df` stays untouched; the label
# column is kept as its own independent Series.
target = df['shot_made_flag'].copy()
copy_df = df.copy()

In [None]:
# Remove the columns that are not used as model features.
vars_to_remove = ["team_id", "team_name", "game_id", "game_event_id",
                  "lat", "lon", "shot_made_flag"]

copy_df = copy_df.drop(columns=vars_to_remove)
    

In [None]:
# Show the 25 least frequent action types together with their occurrence counts.
action_counts = copy_df["action_type"].value_counts().sort_values()
action_counts.iloc[:25].to_frame("counts")

In [None]:
# Collapse the 20 least frequent action types into a single "Other" label.
counts_ascending = copy_df["action_type"].value_counts().sort_values()
rare_action_types = counts_ascending.index.values[:20]
is_rare = copy_df["action_type"].isin(rare_action_types)
copy_df["action_type"] = copy_df["action_type"].mask(is_rare, "Other")

In [None]:
# Split the game date into separate year and month features, then discard
# the original date column.
game_dates = pd.to_datetime(copy_df["game_date"])
copy_df = copy_df.assign(
    game_year=game_dates.dt.year,
    game_month=game_dates.dt.month,
).drop(columns="game_date")


In [None]:
# Reduce the clock information to one boolean feature: was the shot taken
# with fewer than 5 seconds left in the period?
seconds_left = copy_df["minutes_remaining"] * 60 + copy_df["seconds_remaining"]
copy_df["last_5_sec_in_period"] = seconds_left < 5

# The raw time columns are no longer needed.
copy_df = copy_df.drop(columns=["minutes_remaining", "seconds_remaining"])

In [None]:
# Disabled: bin the loc_x / loc_y coordinates into 25 intervals each.
#copy_df["x_zones"] = pd.cut(copy_df["loc_x"], bins=25)
#copy_df["y_zones"] = pd.cut(copy_df["loc_y"], bins=25)

In [None]:
# Encode the matchup as a home-game indicator: rows whose matchup text
# contains "vs" become 1, all others (the "@" form) become 0.
matchup_text = copy_df.pop("matchup")
copy_df["home_play"] = matchup_text.str.contains("vs").astype("int")

In [None]:
# One-hot encode the categorical features.
#
# Each listed column is replaced by indicator columns named
# "<column>#<value>", appended after the remaining columns in the order the
# variables are listed. A single get_dummies call does this without mutating
# `copy_df` in place and without re-joining the frame once per variable.
# (The previous version also built the action_type dummies once up front and
# discarded the result; that dead computation is removed.)
categorial_vars = [
    'action_type', 'combined_shot_type', 'period', 'season', 'shot_type',
    'shot_zone_area', 'shot_zone_basic', 'shot_zone_range', 'game_year',
    'game_month', 'opponent']

copy_df = pd.get_dummies(copy_df, columns=categorial_vars, prefix_sep="#")

In [None]:
# Display the encoded feature frame as the cell output.
copy_df

In [None]:
# Split the rows on label availability:
#   data_submit - feature rows whose shot_made_flag is missing
#   X           - feature rows whose shot_made_flag is present
#   Y           - the present shot_made_flag values (same rows as X)
missing = target.isna()
has_label = ~missing

data_submit = copy_df.loc[missing]
X = copy_df.loc[has_label]
Y = target.loc[has_label]

In [None]:
# Sanity check: shapes of the training split, the full frame and the rows to predict.
print(*(part.shape for part in (X, Y)))
for frame in (copy_df, data_submit):
    print(frame.shape)

In [None]:
# Rank the features with a random forest and keep the 30 with the highest
# importance for predicting Y from X.
#
# The forest is seeded so that the selected feature set, and therefore every
# downstream result, is identical on each run. 2666 is the same seed value
# the model-selection cells use.
model = RandomForestClassifier(random_state=2666)
model.fit(X, Y)

feature_imp = pd.DataFrame(model.feature_importances_, index=X.columns, columns=["importance"])
# Sort by importance, highest first, and keep the names of the top 30.
feat_imp_30 = feature_imp.sort_values("importance", ascending=False).head(30).index
feat_imp_30

In [None]:


# Final feature set: the selected names, de-duplicated and sorted by np.unique.
features = np.unique(feat_imp_30)
print("Final features set:\n")
for line in map("\t-{}".format, features):
    print(line)

In [None]:
# Keep only the selected feature columns in every frame.
copy_df = copy_df.loc[:, features]          # full data set
data_submit = data_submit.loc[:, features]  # rows with a missing label
X = X.loc[:, features]                      # rows with a known label

shape_reports = (
    ("Clean dataset shape: {}", copy_df),
    ("Subbmitable dataset shape: {}", data_submit),
    ("Train features shape: {}", X),
    ("Target label shape: {}", Y),
)
for template, data in shape_reports:
    print(template.format(data.shape))

In [None]:
# Shared settings for model selection
seed = 2666               # fixed seed for repeatable results
processors = -1           # passed as n_jobs: use all available CPU cores
num_folds = 3             # number of cross-validation folds
scoring = "neg_log_loss"  # scoring metric (negated log loss)

kfold = KFold(shuffle=True, n_splits=num_folds, random_state=seed)

In [None]:
# Model 3 - random forest, tuned with an exhaustive grid search.
rf_param_grid = {
    'n_estimators': [100, 200],        # number of trees in the forest
    'criterion': ['gini', 'entropy'],  # split criterion: Gini impurity or information gain
    'max_features': [18, 20],          # features considered when looking for the best split
    'max_depth': [8, 10],              # maximum depth of each tree
    'bootstrap': [True],               # sample with replacement when building each tree
}
rf_base = RandomForestClassifier(warm_start=True, random_state=seed)

rf_grid = GridSearchCV(
    estimator=rf_base,
    param_grid=rf_param_grid,
    scoring=scoring,
    n_jobs=processors,
    cv=kfold,
)
rf_grid.fit(X, Y)

# Best cross-validated score, then the parameter combination that produced it
for best_result in (rf_grid.best_score_, rf_grid.best_params_):
    print(best_result)

In [None]:

# Class-probability predictions from the tuned model for the rows in `data_submit`.
# NOTE(review): the original note said the first column holds the wanted value,
# but the submission cell reads column index 1 - confirm the column order
# against rf_grid.classes_.
preds = rf_grid.predict_proba(data_submit)
preds[:10]

In [None]:
# Build the submission table and write it to disk.
#
# Each predicted probability is paired with the shot_id of the row it belongs
# to (the index of `data_submit`). Writing the probabilities alone with
# index=False would discard the identifiers and leave the predictions
# impossible to match back to shots. Column index 1 of `preds` is used as the
# shot_made_flag probability.
# (The earlier `submission.add_prefix(...)` call returned a new frame that was
# discarded, so it had no effect and is removed.)
submission = pd.DataFrame({
    "shot_id": data_submit.index,
    "shot_made_flag": preds[:, 1],
})

submission.to_csv("submiss.csv", index=False)