In [22]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from quickda.clean_data import *

In [259]:
datos = pd.read_csv('baseball_reference_2016_scrape.csv')

In [260]:
datos["attendance"]= datos["attendance"].str.replace("']", "", regex=True)
datos["attendance"]= datos["attendance"].str.replace(",", "", regex=True)
datos = datos[datos["attendance"].str.isdigit()]
datos = datos.astype({"attendance": int})

datos["venue"]= datos["venue"].str.replace(": ", "", regex=True)

datos = datos[["venue", "attendance"]]
print(datos)

                            venue  attendance
0                Kauffman Stadium       40030
1        Great American Ball Park       21621
2     Oriole Park at Camden Yards       12622
3                    Turner Field       18531
4                     Chase Field       18572
...                           ...         ...
2458              Tropicana Field       31042
2459                     PNC Park       39500
2460                  Miller Park       20098
2461                 Marlins Park       17883
2462            Progressive Field       10298

[2460 rows x 2 columns]


In [261]:
def codif_y_ligar(dataframe_original, variables_a_codificar):
    dummies = pd.get_dummies(dataframe_original[[variables_a_codificar]])
    res = pd.concat([dataframe_original, dummies], axis = 1)
    res = res.drop([variables_a_codificar], axis = 1)
    return(res) 

variables_a_codificar = ["venue"]   #  Esta es una lista de variables
for variable in variables_a_codificar:
    datos = codif_y_ligar(datos, variable)

In [263]:
X = datos.iloc[:, 1:].values
y = datos.iloc[:, 0].values
print(X)
print(y)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[40030 21621 12622 ... 20098 17883 10298]


In [264]:
transCol = ColumnTransformer(transformers=[('encoder', OneHotEncoder(sparse_output=False), [])], remainder='passthrough')
X = np.array(transCol.fit_transform(X))

In [265]:
X_entreno, X_prueba, y_entreno, y_prueba = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [266]:
regresor = LinearRegression()
regresor.fit(X_entreno, y_entreno)

In [267]:
y_pred = regresor.predict(X_prueba)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_prueba.reshape(len(y_prueba),1)),1))

[[ 1.63e+04  1.51e+04]
 [ 1.89e+04  2.14e+04]
 [ 2.37e+04  2.86e+04]
 [ 4.60e+04  5.27e+04]
 [ 2.50e+04  1.83e+04]
 [ 3.13e+04  3.12e+04]
 [ 3.73e+04  4.22e+04]
 [ 4.24e+04  4.11e+04]
 [ 3.17e+04  3.07e+04]
 [ 3.13e+04  2.57e+04]
 [ 2.19e+04  1.50e+04]
 [ 2.66e+04  3.12e+04]
 [ 2.66e+04  3.19e+04]
 [ 2.50e+04  3.28e+04]
 [ 4.03e+04  3.44e+04]
 [ 2.81e+04  2.94e+04]
 [ 2.85e+04  1.87e+04]
 [ 3.23e+04  3.47e+04]
 [ 3.17e+04  3.48e+04]
 [ 3.23e+04  3.60e+04]
 [ 2.81e+04  2.77e+04]
 [ 2.37e+04  1.40e+04]
 [ 3.17e+04  3.98e+04]
 [ 4.26e+04  4.62e+04]
 [ 4.26e+04  4.78e+04]
 [ 4.14e+04  4.14e+04]
 [ 2.37e+04  2.88e+04]
 [ 3.23e+04  2.78e+04]
 [ 2.37e+04  2.14e+04]
 [ 2.78e+04  1.75e+04]
 [ 3.13e+04  2.65e+04]
 [ 2.24e+04  3.07e+04]
 [ 2.33e+04  2.81e+04]
 [ 2.85e+04  2.30e+04]
 [ 3.73e+04  3.70e+04]
 [ 3.73e+04  3.63e+04]
 [ 3.13e+04  3.10e+04]
 [ 1.63e+04  1.03e+04]
 [ 4.26e+04  4.32e+04]
 [ 2.24e+04  1.92e+04]
 [ 3.13e+04  3.31e+04]
 [ 3.52e+04  3.79e+04]
 [ 2.24e+04  1.50e+04]
 [ 2.50e+04

In [270]:
print(regresor.predict([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
print(regresor.coef_)
print(regresor.intercept_)

[36609.5]
[-3.39e+15 -3.39e+15 -3.39e+15 -3.39e+15 -3.39e+15 -3.39e+15 -3.39e+15
 -3.39e+15 -3.39e+15 -3.39e+15 -5.51e+15 -3.39e+15 -3.39e+15 -3.39e+15
 -3.39e+15 -3.39e+15 -3.39e+15 -3.39e+15 -3.39e+15 -3.39e+15 -3.39e+15
 -3.39e+15 -3.39e+15 -3.39e+15 -3.39e+15 -3.39e+15 -3.39e+15 -3.39e+15
 -3.39e+15 -3.39e+15 -3.39e+15]
3387846812011284.0
