In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from scipy.stats import pearsonr

# evaluation metrics
from scipy.stats import pearsonr
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix

In [2]:
# Load the full race table; the file is read as ISO-8859-1 rather than pandas' default UTF-8.
df = pd.read_csv(
    "data_final_updated_date.csv",
    encoding="ISO-8859-1",
)

In [3]:
# Print a three-column overview of the frame: column name, dtype, and the value
# that column holds in one sample row.
#
# The dtype is converted with str() before padding: len() of a NumPy dtype is its
# number of structured fields (0 for plain int64/float64), not the length of its
# name, so padding computed from it never lines the EXAMPLE column up.
# The sample position is clamped so the cell still runs on a shorter frame.
sample_pos = min(8925, len(df) - 1)
print("COLUMN              DATATYPE       EXAMPLE")
print("-" * 70)
for col in df.columns:
    dtype_name = str(df[col].dtype)
    # An empty frame has no sample row to show; print only name and dtype.
    example = df[col].iloc[sample_pos] if sample_pos >= 0 else ""
    print(f"{col:<19} {dtype_name:<14} {example}")

COLUMN              DATATYPE       EXAMPLE
----------------------------------------------------------------------
Race_ID             int64          2163
Trap                int64          2
Odds                float64          5.5
BSP                 float64          8.2
Public_Estimate     int64          5
Last_Run            int64          4
Distance_All        float64          387.11
Finish_All          float64          3.61
Distance_Places_All  float64          380.0
Races_All           int64          45
Distance_Recent     float64          425.71
Finish_Recent       float64          2.92
Odds_Recent         float64          3.88
Early_Recent        float64          2.77
Races_380           int64          34
Wins_380            float64          0.24
Finish_380          float64          2.57
Odds_380            float64          4.68
Early_380           float64          2.43
Grade_380           float64          7.86
Time_380            float64          24.31
Early_Time_380      floa

In [4]:
# Summarise the frame: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13842 entries, 0 to 13841
Data columns (total 30 columns):
Race_ID                13842 non-null int64
Trap                   13842 non-null int64
Odds                   13842 non-null float64
BSP                    13842 non-null float64
Public_Estimate        13842 non-null int64
Last_Run               13842 non-null int64
Distance_All           13842 non-null float64
Finish_All             13842 non-null float64
Distance_Places_All    13842 non-null float64
Races_All              13842 non-null int64
Distance_Recent        13842 non-null float64
Finish_Recent          13842 non-null float64
Odds_Recent            13842 non-null float64
Early_Recent           13842 non-null float64
Races_380              13842 non-null int64
Wins_380               13842 non-null float64
Finish_380             13842 non-null float64
Odds_380               13842 non-null float64
Early_380              13842 non-null float64
Grade_380              13842 

In [5]:
# Columns scanned for correlation with the finishing position.
# The list includes 'Finished' itself; 'Date' is not listed.
factors = [
    # race identifiers, market prices and last run
    "Race_ID", "Trap", "Odds", "BSP", "Public_Estimate", "Last_Run",
    # "_All" columns
    "Distance_All", "Finish_All", "Distance_Places_All", "Races_All",
    # "_Recent" columns
    "Distance_Recent", "Finish_Recent", "Odds_Recent", "Early_Recent",
    # "_380" columns
    "Races_380", "Wins_380", "Finish_380", "Odds_380", "Early_380",
    "Grade_380", "Time_380", "Early_Time_380", "Stay_380",
    # remaining columns, kept in their original order
    "Favourite", "Finished", "Wide_380", "Dist_By", "Gng", "Winner",
]

In [6]:
# Correlate every column in `factors` with the finishing position and list the
# results ordered by p-value (smallest first).
#
# r_p       -- list of (column name, Pearson r, p-value) tuples, in `factors` order
# cor_p_val -- the same tuples sorted by ascending p-value
#
# Note: the stored statistic is Pearson's r, not r^2. 'Finished' is itself in
# `factors`, so its trivial self-correlation (r = 1) appears in the table.
r_p = []
for x in factors:
    pearson_r, p__value = pearsonr(df[x], df['Finished'])
    r_p.append((x, pearson_r, p__value))
cor_p_val = sorted(r_p, key=lambda tup: tup[2])

# Fixed-width fields keep the columns aligned for negative r values; p-values are
# shown to two significant figures so very small ones do not all collapse to "0.0".
print(f"{'Feature':<20} | {'Pearson R':>9} | {'P-value':>9}")
print("- - - - - - - - - - - - - - - - - - - - - - ")
for name, r_value, p_value in cor_p_val:
    print(f"{name:<20} | {r_value:>9.2f} | {p_value:>9.2g}")

Feature           | Pearson R   | P-value
- - - - - - - - - - - - - - - - - - - - - - 
Finished              1.0       0.0
Winner                -0.65       0.0
BSP                   0.23       0.0
Public_Estimate       0.22       0.0
Odds                  0.21       0.0
Finish_All            0.07       0.0
Odds_380              0.06       0.0
Odds_Recent           0.06       0.0
Finish_380            0.06       0.0
Finish_Recent         0.06       0.0
Dist_By               -0.04       0.0
Stay_380              0.04       0.0
Last_Run              0.04       0.0
Distance_Recent       0.03       0.0
Early_Time_380        0.03       0.0
Distance_All          0.02       0.02
Distance_Places_All   0.02       0.05
Trap                  0.01       0.19
Races_All             0.01       0.19
Gng                   -0.01       0.26
Grade_380             0.01       0.33
Races_380             0.01       0.33
Wins_380              0.01       0.34
Time_380              -0.0       0.62
Early_380     

In [7]:
# Positional hold-out split: the first 11670 rows are used for training and
# validation, and every later row is reserved for the final test.
df_train_validate = df.iloc[:11670]
df_test = df.iloc[11670:]

In [8]:
# Columns removed before modelling; everything else except the target becomes a feature.
# NOTE(review): 'Dist_By' is not dropped and therefore stays a feature. If it is only
# known once the race has finished it would leak the outcome -- confirm.
drop_cols = [
    'Winner', 'Public_Estimate', 'Odds', 'Race_ID',
    'Distance_Places_All', 'Trap',
    'Races_All', 'Grade_380', 'Races_380', 'Wins_380',
    'Time_380', 'Early_380',
    'Early_Recent', 'Wide_380', 'Favourite', 'Date',
]
df_m = df_train_validate.drop(drop_cols, axis=1)

In [9]:
# Split the modelling frame by position: the first `train_size` rows train the
# model, the remainder validate it.
target = "Finished"
train_size = 9870

train = df_m.iloc[:train_size]
validation = df_m.iloc[train_size:]
# Same validation rows, but with the dropped columns still present
# (used later to read 'Public_Estimate').
validation_all_features = df_train_validate.iloc[train_size:]

# Separate the features from the label for each partition.
train_X, train_y = train.drop(columns=[target]), train[target]
validation_X, validation_y = validation.drop(columns=[target]), validation[target]

In [10]:
# Multinomial logistic regression over the finishing position.
# NOTE(review): the features are passed unscaled. The scikit-learn docs state that
# 'sag' only converges quickly on features of similar scale -- consider scaling.
model = LogisticRegression(
    solver='sag',
    multi_class='multinomial',
    max_iter=10000,
    random_state=0,
)
model.fit(train_X, train_y)

# In-sample accuracy, as a baseline to compare against validation.
train_accuracy = accuracy_score(train_y, model.predict(train_X))
print("Training Accuracy", round(train_accuracy, 3))

Training Accuracy 0.23


In [11]:
# Score the fitted model on the held-out validation rows.
predicted_fin = model.predict(validation_X)
validation_accuracy = accuracy_score(validation_y, predicted_fin)
print("Validation Accuracy", round(validation_accuracy, 3))

Validation Accuracy 0.231


In [12]:
# 'Public_Estimate' for the validation rows, read from the un-dropped frame because
# the column is in drop_cols. Presumably the public's predicted finishing position --
# it is compared directly against 'Finished' below; confirm.
public_y = list(validation_all_features['Public_Estimate'])

In [13]:
# Baseline: how often 'Public_Estimate' equals the actual finishing position on validation.
print("Public Accuracy",round(accuracy_score(validation_y,public_y),3))

Public Accuracy 0.211


R-squared Score/Error
- - - - - - - - - - - - - - -


NameError: name 'test_y' is not defined

In [14]:
# Apply the same column removal to the final-test rows.
# The frame is rebuilt from `df` (rows 11670 onward, the same hold-out boundary used
# for the train/validate split) rather than dropping from `df_test` itself. Re-running
# the cell is then safe and no longer raises KeyError for already-dropped columns.
df_test = df.iloc[11670:].drop(columns=drop_cols)

In [15]:
# Separate the label from the features for the final-test partition.
test_y = df_test[target]
test_X = df_test.drop(target, axis=1)

In [16]:
# Score the model on the final-test rows.
# `predicted_fin` is rebound here: from now on it holds test predictions, not validation ones.
predicted_fin = model.predict(test_X)
test_accuracy = accuracy_score(test_y, predicted_fin)
print("Test Accuracy", round(test_accuracy, 3))

Test Accuracy 0.213


In [17]:
# 'Public_Estimate' for the final-test rows (row 11670 onward), read from the full
# frame because the column is in drop_cols. Rebinds `public_y` from validation to test.
public_y = list(df.iloc[11670:]['Public_Estimate'])

In [18]:
# Baseline: how often 'Public_Estimate' equals the actual finishing position on the test rows.
print("Public Accuracy",round(accuracy_score(test_y,public_y),3))

Public Accuracy 0.204


In [19]:
# Confusion matrix for the model on the test rows (rows = actual, columns = predicted).
matrix1 = confusion_matrix(test_y, predicted_fin)
# Assumes the six classes sort as finishing positions first..sixth -- TODO confirm.
labls = ["First", "Second", "Third", "Fourth", "Fifth", "Sixth"]
# Diagonal over row totals: the share of each actual class predicted correctly
# (i.e. per-class recall).
per_class = matrix1.diagonal() / matrix1.sum(axis=1)
acc_scores = [round(score, 2) for score in per_class]
accs = zip(labls, acc_scores)
print("Accuracy Score Each Class")
print("- - - - - - - - - - - - - - - - - - - - ")
for label, score in accs:
    padding = " " * (9 - len(label))
    print(label, padding, score)

Accuracy Score Each Class
- - - - - - - - - - - - - - - - - - - - 
First      0.56
Second     0.13
Third      0.09
Fourth     0.04
Fifth      0.09
Sixth      0.38


In [20]:
# Raw model confusion matrix: rows are actual classes, columns are predicted classes.
print(matrix1)

[[201  38  24  13  27  59]
 [155  47  26  21  30  83]
 [152  36  31  17  38  90]
 [139  34  28  16  39 104]
 [150  27  36  13  31 105]
 [113  19  33  22  39 136]]


In [21]:
# Market baseline: confusion matrix of 'Public_Estimate' against the actual result
# on the test rows (rows = actual, columns = public estimate).
matrix = confusion_matrix(test_y, public_y)
# Assumes the six classes sort as finishing positions first..sixth -- TODO confirm.
labls = ["First", "Second", "Third", "Fourth", "Fifth", "Sixth"]
# Diagonal over row totals: the share of each actual class the public got right.
per_class = matrix.diagonal() / matrix.sum(axis=1)
acc_scores = [round(score, 2) for score in per_class]
accs = zip(labls, acc_scores)
print("Accuracy Score Each Class")
print("- - - - - - - - - - - - - - - - - - - - ")
for label, score in accs:
    padding = " " * (9 - len(label))
    print(label, padding, score)

Accuracy Score Each Class
- - - - - - - - - - - - - - - - - - - - 
First      0.28
Second     0.19
Third      0.17
Fourth     0.16
Fifth      0.18
Sixth      0.25


In [22]:
# Raw market confusion matrix: rows are actual classes, columns are the public estimate.
print(matrix)

[[101  74  66  48  38  35]
 [ 71  70  53  59  50  59]
 [ 57  58  63  67  62  57]
 [ 46  64  56  56  79  59]
 [ 51  55  61  66  64  65]
 [ 28  52  52  80  60  90]]


In [25]:
# R^2 of the predicted vs. actual finishing position on the test rows,
# for the model and for the public estimate.
print("R-squared Score/Error")
print("- - - - - - - - - - - - - - -")
for caption, estimate in (("Model: ", predicted_fin), ("Market:", public_y)):
    print(caption, round(r2_score(test_y, estimate), 3))

R-squared Score/Error
- - - - - - - - - - - - - - -
Model:  -1.206
Market: -0.615
