In [1]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from ipywidgets import interact, fixed

# Modeling

## Importing Data
In this section the data for the past seasons and past tournaments will be loaded from pickles.

In [2]:
past_seasons_data = pd.read_pickle("past_season_detailed_results")
past_tourney_data = pd.read_pickle("past_tourney_detailed_results")
teams = pd.read_pickle("teams")

In [3]:
past_seasons_data.head()

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot,Wfgm,Wfga,...,Lfga3,Lftm,Lfta,Lor,Ldr,Last,Lto,Lstl,Lblk,Lpf
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


In [4]:
assert len(past_seasons_data) == 65872
assert len(past_seasons_data.columns) == 34

In [5]:
past_tourney_data.head()

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot,Wfgm,Wfga,...,Lfga3,Lftm,Lfta,Lor,Ldr,Last,Lto,Lstl,Lblk,Lpf
0,2003,134,1421,92,1411,84,N,1,32,69,...,31,14,31,17,28,16,15,5,0,22
1,2003,136,1112,80,1436,51,N,0,31,66,...,16,7,7,8,26,12,17,10,3,15
2,2003,136,1113,84,1272,71,N,0,31,59,...,28,14,21,20,22,11,12,2,5,18
3,2003,136,1141,79,1166,73,N,0,29,53,...,17,12,17,14,17,20,21,6,6,21
4,2003,136,1143,76,1301,74,N,1,27,64,...,21,15,20,10,26,16,14,5,8,19


In [6]:
assert len(past_tourney_data) == 847
assert len(past_tourney_data.columns) == 34

In [7]:
teams.head()

Unnamed: 0_level_0,Team_Name
Team_Id,Unnamed: 1_level_1
1101,Abilene Chr
1102,Air Force
1103,Akron
1104,Alabama
1105,Alabama A&M


In [8]:
assert len(teams) == 364
assert len(teams.columns) == 1
assert teams.index.name == "Team_Id"

## Creating The Data Set We Need
In this section we will convert our game-by-game data set into a data set that we can use to predict game matchups.

**One important note in the following box is that I left out the Wscore and Lscore features. These features give away whether or not a team will win and thus can skew the data.**

In [9]:
# The following variables are used to calculate the final order of columns for our model
win_columns = ["Wfgp", "Wfgp3", "Wftp", "Wor", "Wdr", "Wast", "Wto", "Wstl", "Wblk", "Wpf"]
lose_columns = ["Lfgp", "Lfgp3", "Lftp", "Lor", "Ldr", "Last", "Lto", "Lstl", "Lblk", "Lpf"]
target_column = ["Win"]
final_columns = win_columns + lose_columns + target_column

The following function is used below for calculate field goal percentages for regular field goals, free throws, and 3-pointers. This data is used below to analyze the importance of the columns as well as setting up the machine learning model later.

In [10]:
def calc_fg_percents(games_data, is_winning_team):
    prefix = "L"
    
    if is_winning_team:
        prefix = "W"
    
    fgp = prefix + "fgp"
    fgp3 = prefix + "fgp3"
    ftp = prefix + "ftp"
    
    fgm = prefix + "fgm"
    fga = prefix + "fga"
    fgm3 = prefix + "fgm3"
    fga3 = prefix + "fga3"
    ftm = prefix + "ftm"
    fta = prefix + "fta"
    
    games_data[fgp] = games_data[fgm] / games_data[fga]
    games_data[fgp3] = games_data[fgm3] / games_data[fga3]
    games_data[ftp] = games_data[ftm] / games_data[fta]
    
    return games_data.drop([fgm, fga, fgm3, fga3, ftm, fta], axis=1)

The following cell calcs the field goal percentages for the winning team and losing team in each row of data.

In [11]:
joined_season_data = past_seasons_data.append(past_tourney_data)
joined_season_data = calc_fg_percents(joined_season_data, True)
joined_season_data = calc_fg_percents(joined_season_data, False)
joined_season_data["Win"] = True

In [12]:
joined_season_data.head()

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot,Wor,Wdr,...,Lstl,Lblk,Lpf,Wfgp,Wfgp3,Wftp,Lfgp,Lfgp3,Lftp,Win
0,2003,10,1104,68,1328,62,N,0,14,24,...,9,2,20,0.465517,0.214286,0.611111,0.415094,0.2,0.727273,True
1,2003,10,1272,70,1393,63,N,0,15,28,...,8,6,16,0.419355,0.4,0.526316,0.358209,0.25,0.45,True
2,2003,11,1266,73,1437,61,N,0,17,26,...,2,5,23,0.413793,0.444444,0.586207,0.30137,0.115385,0.608696,True
3,2003,11,1296,56,1457,50,N,0,6,19,...,4,3,23,0.473684,0.333333,0.548387,0.367347,0.272727,0.533333,True
4,2003,11,1400,77,1208,71,N,0,17,22,...,7,1,14,0.491803,0.428571,0.846154,0.387097,0.375,0.62963,True


In [13]:
assert len(joined_season_data) == 66719
assert len(joined_season_data.columns) == 29

From here we will extract out all of the columns we would like to use to model our data.

In [14]:
cleaned_joined_data = joined_season_data[final_columns]

In [15]:
cleaned_joined_data.head()

Unnamed: 0,Wfgp,Wfgp3,Wftp,Wor,Wdr,Wast,Wto,Wstl,Wblk,Wpf,...,Lfgp3,Lftp,Lor,Ldr,Last,Lto,Lstl,Lblk,Lpf,Win
0,0.465517,0.214286,0.611111,14,24,13,23,7,1,22,...,0.2,0.727273,10,22,8,18,9,2,20,True
1,0.419355,0.4,0.526316,15,28,16,13,4,4,18,...,0.25,0.45,20,25,7,12,8,6,16,True
2,0.413793,0.444444,0.586207,17,26,15,10,5,2,25,...,0.115385,0.608696,31,22,9,12,2,5,23,True
3,0.473684,0.333333,0.548387,6,19,11,12,14,2,18,...,0.272727,0.533333,17,20,9,19,4,3,23,True
4,0.491803,0.428571,0.846154,17,22,12,14,4,4,20,...,0.375,0.62963,21,15,12,10,7,1,14,True


In [16]:
assert len(cleaned_joined_data) == 66719
assert list(cleaned_joined_data.columns) == final_columns

## Split Train/Test The Data
So an interesting problem with this data is that for ever row in the DataFrame our target variable that we would like the machine learning algorithm to learn (the "Win" column) only consists of the value **True**. This will not work. For the machine learning algorithm to work properly there must be two values for the target variable. This is where split Train/Test comes into play. It will split the dataset into two halves randomly and from there we can flip and append the data back on to the DataFrame for analysis.

To split this data we need to import the sklearn library function:

In [17]:
from sklearn import cross_validation

Now we need to extract the feature columns and the target column:

In [18]:
feature_columns = list(final_columns[:len(final_columns) - 1])
target_column = list(final_columns[-1:])

Once that is done we can split the data into a training set and a testing set with a 50/50 split. We can then use this to flip half the data and use it as an accurate representation of the "losing game" classification.

In [19]:
Xtrain, Xtest, ytrain, ytest = cross_validation.train_test_split(cleaned_joined_data[feature_columns], 
                                                                 cleaned_joined_data[target_column], test_size=0.5)

In [20]:
print(Xtrain.shape)
print(ytrain.shape)

(33359, 20)
(33359, 1)


In [21]:
print(Xtest.shape)
print(ytest.shape)

(33360, 20)
(33360, 1)


## Flip The Test Values
From here the data stored in the testing variables can be reflected accross its column axis so that it can be reattached and our data will have two values for the "Win" target variable.

This piece of code reverses the **final_columns** without the "Win" column:

In [22]:
flipped_columns = lose_columns + win_columns

Here we will flip the columns on the test DataFrame and set all values of "Win" to false:

In [23]:
Xtest = Xtest[flipped_columns]
ytest["Win"] = False

In [24]:
Xtest.head()

Unnamed: 0,Lfgp,Lfgp3,Lftp,Lor,Ldr,Last,Lto,Lstl,Lblk,Lpf,Wfgp,Wfgp3,Wftp,Wor,Wdr,Wast,Wto,Wstl,Wblk,Wpf
38492,0.490909,0.5,0.6,8,22,21,15,1,6,23,0.355932,0.5,0.857143,15,20,11,5,7,4,12
62494,0.436364,0.3,0.647059,8,25,8,9,4,3,23,0.469388,0.375,0.791667,4,22,10,8,5,3,15
16965,0.413793,0.526316,0.5,10,19,13,13,1,5,15,0.491525,0.291667,0.625,11,29,21,13,7,4,16
24382,0.442308,0.222222,0.642857,13,21,12,16,7,2,25,0.511628,0.6,0.766667,4,23,15,14,3,1,24
51021,0.383333,0.375,0.666667,10,20,11,19,12,4,21,0.385965,0.266667,0.787879,20,28,11,21,7,5,17


In [25]:
assert len(Xtest) == 33360
assert len(Xtest.columns) == len(win_columns + lose_columns)
assert list(Xtest.columns) == flipped_columns

In [26]:
ytest.head()

Unnamed: 0,Win
38492,False
62494,False
16965,False
24382,False
51021,False


In [27]:
assert ytest["Win"].all() == False
assert len(ytest) == 33360
assert list(ytest.columns) == target_column

## Append All The Data Back Together
Now that we have created our flipped data for use in the machine learning algorithm let's join all the data back together and save it to a pickle so that we can use it in our MachineLearning notebook.

In [28]:
training_joined = pd.concat([Xtrain, ytrain], axis=1)
test_joined = pd.concat([Xtest, ytest], axis=1)

# make sure to set the columns of the test data to be the same as the training data
test_joined.columns = training_joined.columns

In [29]:
training_joined.head()

Unnamed: 0,Wfgp,Wfgp3,Wftp,Wor,Wdr,Wast,Wto,Wstl,Wblk,Wpf,...,Lfgp3,Lftp,Lor,Ldr,Last,Lto,Lstl,Lblk,Lpf,Win
46253,0.454545,0.3125,0.708333,11,26,15,12,5,2,16,...,0.066667,0.736842,13,22,9,15,5,1,23,True
62629,0.42,0.307692,0.714286,11,32,13,19,7,9,16,...,0.136364,0.52381,14,23,5,12,9,4,19,True
21495,0.510638,0.533333,0.652174,14,21,19,14,5,9,14,...,0.222222,0.684211,21,12,11,13,7,3,25,True
55532,0.451613,0.333333,0.529412,11,25,11,8,7,4,23,...,0.25,0.774194,13,29,11,13,2,3,15,True
42770,0.48,0.473684,0.904762,4,23,14,9,4,4,14,...,0.4,0.5,9,22,11,11,3,0,19,True


In [30]:
test_joined.head()

Unnamed: 0,Wfgp,Wfgp3,Wftp,Wor,Wdr,Wast,Wto,Wstl,Wblk,Wpf,...,Lfgp3,Lftp,Lor,Ldr,Last,Lto,Lstl,Lblk,Lpf,Win
38492,0.490909,0.5,0.6,8,22,21,15,1,6,23,...,0.5,0.857143,15,20,11,5,7,4,12,False
62494,0.436364,0.3,0.647059,8,25,8,9,4,3,23,...,0.375,0.791667,4,22,10,8,5,3,15,False
16965,0.413793,0.526316,0.5,10,19,13,13,1,5,15,...,0.291667,0.625,11,29,21,13,7,4,16,False
24382,0.442308,0.222222,0.642857,13,21,12,16,7,2,25,...,0.6,0.766667,4,23,15,14,3,1,24,False
51021,0.383333,0.375,0.666667,10,20,11,19,12,4,21,...,0.266667,0.787879,20,28,11,21,7,5,17,False


In [31]:
assert list(training_joined.columns) == list(test_joined.columns) == final_columns

Final appending of the data:

In [32]:
# append the test data back to the training data
final_dataset = training_joined.append(test_joined)

In [33]:
final_dataset.head()

Unnamed: 0,Wfgp,Wfgp3,Wftp,Wor,Wdr,Wast,Wto,Wstl,Wblk,Wpf,...,Lfgp3,Lftp,Lor,Ldr,Last,Lto,Lstl,Lblk,Lpf,Win
46253,0.454545,0.3125,0.708333,11,26,15,12,5,2,16,...,0.066667,0.736842,13,22,9,15,5,1,23,True
62629,0.42,0.307692,0.714286,11,32,13,19,7,9,16,...,0.136364,0.52381,14,23,5,12,9,4,19,True
21495,0.510638,0.533333,0.652174,14,21,19,14,5,9,14,...,0.222222,0.684211,21,12,11,13,7,3,25,True
55532,0.451613,0.333333,0.529412,11,25,11,8,7,4,23,...,0.25,0.774194,13,29,11,13,2,3,15,True
42770,0.48,0.473684,0.904762,4,23,14,9,4,4,14,...,0.4,0.5,9,22,11,11,3,0,19,True


In [34]:
assert len(final_dataset) == 66719
assert len(final_dataset[final_dataset.Win == True]) == 33359
assert len(final_dataset[final_dataset.Win == False]) == 33360

### Check Values For Nan
In the case of calculating percentages on a given column there may be columns that end up being Nan because of a divide by zero case. Let's find those values and set them to 0.

In [35]:
final_dataset[final_dataset.isnull().any(axis=1)]

Unnamed: 0,Wfgp,Wfgp3,Wftp,Wor,Wdr,Wast,Wto,Wstl,Wblk,Wpf,...,Lfgp3,Lftp,Lor,Ldr,Last,Lto,Lstl,Lblk,Lpf,Win
14054,0.571429,0.533333,0.666667,10,17,12,16,5,2,4,...,0.384615,,8,8,8,13,6,3,12,True
25034,0.346154,0.296296,0.705882,12,21,12,10,8,0,10,...,0.428571,,8,25,13,16,3,3,21,True
27984,0.438596,0.375,0.444444,10,17,12,4,5,0,11,...,0.357143,,5,25,12,14,2,6,12,True
33036,0.282609,0.176471,0.818182,8,24,6,13,8,2,8,...,0.1875,,9,23,10,15,3,4,13,True
18298,0.44186,0.3125,0.615385,11,20,13,11,15,2,6,...,0.388889,,8,18,13,17,2,2,19,True
8652,0.509091,0.407407,,8,22,23,12,10,1,21,...,0.272727,0.55,7,19,8,18,3,2,9,True
55426,0.490566,0.222222,0.647059,14,27,12,17,9,10,8,...,0.47619,,6,17,9,16,6,0,15,True
64652,0.425926,0.357143,1.0,9,21,7,9,8,3,6,...,0.421053,,10,21,14,12,5,3,10,True
9560,0.375,0.25,0.764706,8,24,9,11,8,3,11,...,0.285714,,11,23,7,17,4,2,17,True
28079,0.590909,0.2,0.6875,8,15,14,14,9,2,8,...,0.318182,,10,14,10,20,4,0,17,True


In [36]:
final_dataset.fillna(value=0, inplace=True)

In [37]:
assert final_dataset.isnull().values.any() == False

## Feature Selection
As of now, we have all of our data that we need. But how much of it is useful? We need to do some variance testing on our features to see if they are similar or different among all values. The less the feature is variant the more errors this could produce while fitting our model.

In [38]:
from sklearn import feature_selection

Here we will create our X variable (our DataFrame of only features we are testing):

In [39]:
feature_columns = win_columns + lose_columns

X = final_dataset[feature_columns]

Here we will use the VarianceTreshold model to see how variant our features are in the data and remove the ones that are least variant:

In [40]:
variant_model = feature_selection.VarianceThreshold()
variant_model.fit(X)

VarianceThreshold(threshold=0.0)

In [41]:
# This helper function is for zipping columns together
def zip_variances(X, model):
    return pd.DataFrame(list(zip(X.columns, model.variances_))).sort_values(1, ascending=True)

In [42]:
zip_variances(X, variant_model)

Unnamed: 0,0,1
0,Wfgp,0.005682
10,Lfgp,0.005718
1,Wfgp3,0.014361
11,Lfgp3,0.014568
2,Wftp,0.01563
12,Lftp,0.015746
18,Lblk,5.396665
8,Wblk,5.476646
17,Lstl,9.158863
7,Wstl,9.330618


In the above box we can see what the variances are of columns accross multiple features. The top features have the least variance while the bottom features have more variance. A rule of thumb is to remove near zero variant features because they will have little predictive power in the final outcome of the game. In this case I am going to leave this alone. This variance analysis was useful, but maybe there are more ways I can analyze this data to determine which features are important.

## More Feature Analysis
Here we will use a random forest classifier to determine the importance of certain features and remove features that do not meet a certain threshold.

In [43]:
from sklearn import ensemble

We need to make another split for train/test:

In [44]:
Xtrain, Xtest, ytrain, ytest = cross_validation.train_test_split(final_dataset[feature_columns], 
                                                                 final_dataset[target_column[0]])

Here we make our forest classifier for determining importance of features using decision trees:

In [45]:
forest_model = ensemble.RandomForestClassifier()
forest_model.fit(Xtrain, ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [46]:
# This helper function is for zipping columns together
def zip_importances(X, model):
    return pd.DataFrame(list(zip(X.columns, model.feature_importances_))).sort_values(1, ascending=True)

In [47]:
zip_importances(final_dataset, forest_model)

Unnamed: 0,0,1
8,Wblk,0.01274
18,Lblk,0.014853
3,Wor,0.019666
13,Lor,0.019841
17,Lstl,0.02485
7,Wstl,0.024998
2,Wftp,0.029221
12,Lftp,0.030175
1,Wfgp3,0.033742
16,Lto,0.040959


This chart displays the features that have the least amount of importance in the outcome of the solution. Let's remove the following features from our dataset and see what happens to the importances:
- Wblk, Lblk
- Wor, Lor
- Wstl, Lstl
- Wftp, Lftp

In [48]:
# This function removes features from a list of features
def remove_features(features, features_to_remove):
    temp_features = features.copy()
    
    for feature in features_to_remove:
        if feature in temp_features:
            temp_features.remove(feature)
    
    return temp_features

In [49]:
features_to_remove = ["Wblk", "Wor", "Wstl", "Wftp", "Lblk", "Lor", "Lstl", "Lftp"]
new_feature_cols = remove_features(feature_columns, features_to_remove)

### Refit The Data
Here we will refit the data and calculate feature importances again to see how well our removal did.

In [50]:
data_subset = final_dataset[new_feature_cols + target_column]

In [51]:
Xtrain, Xtest, ytrain, ytest = cross_validation.train_test_split(data_subset[new_feature_cols], 
                                                                 data_subset[target_column[0]])

In [52]:
forest_model.fit(Xtrain, ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [53]:
zip_importances(data_subset, forest_model)

Unnamed: 0,0,1
1,Wfgp3,0.049812
7,Lfgp3,0.055694
2,Wdr,0.056935
10,Lto,0.058338
4,Wto,0.059188
8,Ldr,0.061145
9,Last,0.061338
11,Lpf,0.076007
5,Wpf,0.076513
3,Wast,0.077427


After removing some of the features that had less correlation with other variables, it seems like our dataset may be ready to fit to a machine learning model. Removing unneeded features may help us prevent overfitting of our learning models in the next section.

### Save Data To Pickle
Here we will save the dataset to a pickle for use in the next notebook 04-MachineLearning.ipynb.

In [54]:
data_subset.to_pickle("machine_learning_dataset")