# Importar librerias

In [65]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from quickda.clean_data import *

# Limpiar los data y seleccionar las columnas a utilizar

In [66]:
data = pd.read_csv('baseball_reference_2016_scrape.csv')

data["attendance"] = data["attendance"].str.replace("']", "", regex=True)
data["attendance"] = data["attendance"].str.replace(",", "", regex=True)
data = data[data["attendance"].str.isdigit()]
data = data.astype({"attendance": int})

data['game_duration'] = data['game_duration'].str.replace(": ", "")
v = data["game_duration"].str.split(':', expand=True).astype(int)
data["game_duration"] = v[0]*60 +v[1]
data = data.astype({"attendance": int}, {'game_duration': int})

data["venue"] = data["venue"].str.replace(": ", "", regex=True)

data = data[["attendance", "home_team", "away_team", "game_duration", "start_time", "venue"]]

# Codificar los data

In [67]:
X = data.iloc[:, 1:]
y = data.iloc[:, 0]

variables_a_codificar = ["home_team", "away_team", "venue", "start_time", "game_duration"]
transCol = ColumnTransformer(transformers=[('encoder', OneHotEncoder(sparse_output=False), variables_a_codificar)], remainder='passthrough')
X = np.array(transCol.fit_transform(X))

# Crear el modelo

In [68]:
X_entreno, X_prueba, y_entreno, y_prueba = train_test_split(X, y, test_size = 0.2, random_state = 0)
regresor = LinearRegression()
regresor.fit(X_entreno, y_entreno)

# Cálculo de R cuadrado

In [69]:
r_squared = regresor.score(X_entreno, y_entreno)
print(r_squared)

0.7454042906456203


# Ecuación del modelo

In [97]:
coef = regresor.coef_
intercept = regresor.intercept_
names = [x.replace("encoder__", "") for x in transCol.get_feature_names_out()]

print(pd.DataFrame({"Coeficientes": names,"coef": coef}))
print("\nIntercepto: ", intercept)

                       Coeficientes          coef
0    home_team_Arizona Diamondbacks  8.747263e+15
1          home_team_Atlanta Braves  8.720881e+15
2       home_team_Baltimore Orioles -1.122926e+15
3          home_team_Boston Red Sox -9.440734e+15
4            home_team_Chicago Cubs -1.166746e+17
..                              ...           ...
461               game_duration_334  8.161667e+15
462               game_duration_347  8.161667e+15
463               game_duration_348  8.161667e+15
464               game_duration_356  8.161667e+15
465               game_duration_373  0.000000e+00

[466 rows x 2 columns]

Intercepto:  -2.509040427658476e+16


# Predecir un resultado

In [71]:
varToPredict = []
    
homeTeam = "Kansas City Royals"
awayTeam = "New York Mets"
gameDuration = "193"
startTime = "Start Time: 7:38 p.m. Local"
venue = "Kauffman Stadium"

for column in transCol.get_feature_names_out():
    splitVal = column.replace("encoder__", "").split("_")
    valType = splitVal[0]
    testVal = splitVal[len(splitVal) - 1].strip()

    if venue == testVal and valType == "venue":
        varToPredict.append(1)
    elif homeTeam == testVal and valType == "home":
        varToPredict.append(1)
    elif awayTeam == testVal and valType == "away":
        varToPredict.append(1)
    elif gameDuration == testVal and valType == "game":
        varToPredict.append(1)
    elif startTime == testVal and valType == "start":
        varToPredict.append(1)
    else:
        varToPredict.append(0)

print(regresor.predict([varToPredict]))

[43040.]
