In [1]:
import pandas as pd

from pandas import DataFrame, Series

import numpy as np

dataset = pd.read_csv('data.csv')

dataset = dataset.drop(['Unnamed: 6'], axis=1)

dataset.columns = ['Date', 'StartTime', 'Visitor Team', "VisitorPts", "Home Team", 'HomePts', 'OT?', 'Notes']

dataset['HomeWin'] = dataset['VisitorPts'] < dataset['HomePts']

y_true = dataset['HomeWin'].values

from collections import defaultdict

won_last = defaultdict(int)

dataset['HomeLastWin'] = False

dataset['VisitorLastWin'] = False

for index, row in dataset.iterrows():
    home_team = row['Home Team']
    visitor_team = row['Visitor Team']
    row['HomeLastWin'] = won_last[home_team]
    row['VisitorLastWin'] = won_last[visitor_team]
    dataset.loc[index] = row
    won_last[home_team] = row['HomeWin']
    won_last[visitor_team] = not row['HomeWin']

In [3]:
from sklearn.model_selection import cross_val_score

In [4]:
standings = pd.read_csv('data1.csv', skiprows=[0])
dataset['HomeTeamRanksHigher'] = 0

for index, row in dataset.iterrows():
    home_team = row['Home Team']
    visitor_team = row['Visitor Team']
    home_rank = standings[standings['Team'] == home_team]['Rk'].values[0]
    visitor_rank = standings[standings['Team'] == visitor_team]['Rk'].values[0]
    row['HomeTeamRanksHigher'] = int(home_rank > visitor_rank)
    dataset.iloc[index] = row

In [5]:
last_match_winner = defaultdict(int)

dataset['HomeTeamWonLast'] = 0

for index, row in dataset.iterrows():
    home_team = row['Home Team']
    visitor_team = row['Visitor Team']
    teams = tuple(sorted([home_team, visitor_team]))
    row['HomeTeamWonLast'] = 1 if last_match_winner[teams] == row['Home Team'] else 0
    dataset.iloc[index] = row
    winner = row['Home Team'] if row['HomeWin'] else row['Visitor Team']
    last_match_winner[teams] = winner

In [7]:
from sklearn.tree import DecisionTreeClassifier

In [59]:
X_lastwinner = dataset[[ 'HomeTeamWonLast', 'HomeTeamRanksHigher']].values

In [60]:
clf = DecisionTreeClassifier(random_state=14)

In [61]:
scores = cross_val_score(clf, X_lastwinner, y_true)

In [62]:
scores

array([ 0.6969697 ,  0.68350168,  0.70608108])

In [27]:
from sklearn.preprocessing import LabelEncoder

In [28]:
encoding = LabelEncoder()

In [29]:
encoding.fit(dataset["Home Team"].values)

LabelEncoder()

In [30]:
home_teams = encoding.transform(dataset['Home Team'].values)
visitor_teams = encoding.transform(dataset['Visitor Team'].values)

In [33]:
X_teams = np.vstack([home_teams, visitor_teams]).T

In [35]:
from sklearn.preprocessing import OneHotEncoder

In [36]:
onehot = OneHotEncoder()

In [37]:
X_teams_expanded = onehot.fit_transform(X_teams).todense()

In [38]:
X_teams_expanded

matrix([[ 1.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [39]:
from sklearn.ensemble import RandomForestClassifier

In [40]:
clf = RandomForestClassifier(random_state=14)

In [41]:
scores = cross_val_score(clf, X_lastwinner, y_true)

In [42]:
scores

array([ 0.6969697 ,  0.68350168,  0.70608108])

In [70]:
X_all = np.hstack([X_lastwinner, dataset[['VisitorLastWin', 'HomeLastWin']].values])

In [71]:
X_all

array([[0, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 0, 0],
       ..., 
       [0, 1, 0, 0],
       [1, 1, 0, 0],
       [1, 0, 0, 1]], dtype=int64)

In [72]:
clf = RandomForestClassifier(random_state=14)

In [73]:
score = cross_val_score(clf, X_all, y_true, scoring='accuracy')

In [74]:
score

array([ 0.69023569,  0.67340067,  0.65540541])

In [75]:
from sklearn.model_selection import GridSearchCV

In [88]:
parameter_space = {
#     "max_features": [2, 10],
    "n_estimators": [100,],
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": [2, 4, 6]
}

In [89]:
cld = RandomForestClassifier(random_state=14)

In [90]:
grid = GridSearchCV(cld, parameter_space)

In [91]:
grid.fit(X_all, y_true)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=14,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [100], 'criterion': ['gini', 'entropy'], 'min_samples_leaf': [2, 4, 6]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [93]:
grid.best_score_

0.6910112359550562

In [95]:
data = DataFrame(np.arange(12).reshape((3, 4)), columns=['a', 'b', 'c', 'd'])

In [96]:
data

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [98]:
data['sum'] = pd.feature_creator?

Object `pd.feature_creator` not found.


In [None]:
data['sum'] = pd.feature_creator

In [99]:
data['sum'] = [22, 33, 44]

In [105]:
data

Unnamed: 0,a,b,c,d,sum
0,0,1,2,3,22
1,4,5,6,7,33
2,8,9,10,11,44


In [109]:
data.loc[2, 'c']

10

In [107]:
data.iloc[2, 2]

10