In [1]:
import warnings  
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Datos


In [2]:
# Load the training CSV into a DataFrame (the path is relative).
train = pd.read_csv('../csv/csv_limpios/train.csv')

In [3]:
# Preview the first rows of the loaded data.
train.head()

Unnamed: 0,Jugador,Pais,Posc,Equipo,Edad,Nacimiento,PJ,Titular,Min,Goles,Asistencias,Goles+Asistencias,Tarjetas Amarillas,Tarjetas Rojas,Goles Esperados,Asistencias Esperadas,Valor de mercado,jugador_id
0,Aaron Connolly,IRL,Delantero,Brighton,21,2000.0,15.0,8.0,735.0,1.666667,0.666667,2.333333,0.0,0.0,2.8,0.266667,"3,50 mill. €",0
1,Aaron Cresswell,ENG,Defensa,West Ham,33,1989.0,25.6,24.6,2188.6,1.0,2.4,3.4,3.2,0.0,0.76,3.34,"1,20 mill. €",1
2,Aaron Hickey,SCO,Defensa,Brentford,21,2002.0,20.5,19.0,1551.75,1.25,0.5,1.75,5.5,0.25,0.4,0.6,"30,00 mill. €",2
3,Aaron Ramsdale,ENG,PO,Arsenal,25,1998.0,30.4,30.4,2736.0,0.0,0.2,0.2,0.6,0.0,0.0,0.16,"32,00 mill. €",3
4,Aaron Ramsey,ENG,Delantero,Burnley,30,1993.25,14.0,6.75,619.0,1.25,1.25,2.5,1.0,0.0,1.7,1.225,"2,50 mill. €",4


# Limpieza para generar el modelo

In [4]:
# Continue cleaning: convert the market value to a float
def valor_mercado(value):
    """Convert a market-value label such as '3,50 mill. €' into a float.

    The euro sign is removed and the decimal comma is turned into a decimal
    point. A trailing unit scales the number: 'mill.' multiplies by 1e6 and
    'mil' multiplies by 1e3. Text without a unit is parsed as a plain number.

    Parameters
    ----------
    value : str, number or None
        Raw market value. Strings are parsed; numbers are returned as floats;
        None is treated as a missing value.

    Returns
    -------
    float
        The parsed amount, or NaN when the input is None or NaN.

    Raises
    ------
    ValueError
        If the text is not numeric or carries an unrecognised unit.
    """
    # A missing value stays missing instead of failing on `.replace`.
    if value is None:
        return float('nan')

    # Numeric input (including NaN) passes through unchanged, so applying the
    # function to an already-converted column is harmless.
    if not isinstance(value, str):
        return float(value)

    cleaned = value.replace('€', '').replace(',', '.').strip()
    parts = cleaned.split()

    if len(parts) == 2:
        number, unit = parts
        multipliers = {'mill.': 1e6, 'mil': 1e3}
        if unit not in multipliers:
            # Previously this case fell through and silently returned None.
            raise ValueError(
                f"Unknown market-value unit {unit!r} in {value!r}"
            )
        return float(number) * multipliers[unit]

    # No unit (or an unexpected token count): parse the whole text as a number.
    return float(cleaned)

In [5]:
# Replace the 'Valor de mercado' column with the result of valor_mercado applied to each entry.
train['Valor de mercado'] = train['Valor de mercado'].apply(valor_mercado)

In [6]:
# Show how often each converted market value occurs.
train['Valor de mercado'].value_counts()

2000000.0     155
1000000.0     147
1500000.0     141
3000000.0     135
2500000.0     118
             ... 
950000.0        1
325000.0        1
21000000.0      1
1900000.0       1
375000.0        1
Name: Valor de mercado, Length: 108, dtype: int64

In [7]:
# Inspect the rows whose market value is exactly 0
train[train['Valor de mercado'] == 0.0]

Unnamed: 0,Jugador,Pais,Posc,Equipo,Edad,Nacimiento,PJ,Titular,Min,Goles,Asistencias,Goles+Asistencias,Tarjetas Amarillas,Tarjetas Rojas,Goles Esperados,Asistencias Esperadas,Valor de mercado,jugador_id
40,Adri Bosch,ESP,Defensa,Granada,22,2001.0,1.0,0.000000,26.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,40
41,Adri Embarba,ESP,Centrocampista,Rayo Vallecano,31,1992.0,21.5,16.166667,1339.333333,1.833333,2.500000,4.333333,3.333333,0.333333,2.916667,3.133333,0.0,41
60,Ahmed Elmohamady,EGY,Defensa,Aston Villa,32,1987.0,16.0,9.500000,935.000000,0.500000,0.500000,1.000000,1.500000,0.000000,0.300000,1.900000,0.0,60
260,Angelo da Costa Júnior,BRA,PO,Bologna,36,1983.0,3.5,3.500000,315.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.0,260
285,Antonio Iervolino,ITA,Centrocampista,Salernitana,19,2003.0,1.0,0.000000,5.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,285
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3064,Vitorino Antunes,POR,Defensa,Getafe,32,1987.0,21.0,18.666667,1662.000000,0.333333,1.666667,2.000000,6.333333,0.000000,0.900000,2.166667,0.0,3064
3102,William Thomas Fish,ENG,Delantero,Manchester Utd,17,2003.0,1.0,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,3102
3139,Yeremi Pino,ESP,Delantero,Villarreal,21,2002.0,24.5,16.250000,1349.250000,3.250000,2.750000,6.000000,4.500000,0.000000,3.975000,2.650000,0.0,3139
3167,Álex Cantero,ESP,Delantero,Levante,21,2000.0,14.5,3.500000,423.500000,0.000000,0.500000,0.500000,1.000000,0.000000,1.300000,0.650000,0.0,3167


In [8]:
# Remove the rows whose market value is 0.
# The explicit copy makes `train` an independent frame, so later edits to it
# do not act on a filtered slice of the previous frame.
train = train[train['Valor de mercado'] != 0.0].copy()

In [9]:
# Remove the player-name column, since only the statistics matter.
# Reassigning instead of using inplace=True avoids mutating a filtered frame,
# and errors='ignore' lets this cell be re-run once the column is already gone.
train = train.drop(columns=['Jugador'], errors='ignore')

In [10]:
# Check the number of rows and columns after the clean-up steps.
train.shape

(3166, 17)

In [11]:
# Preview the frame after the clean-up steps.
train.head()

Unnamed: 0,Pais,Posc,Equipo,Edad,Nacimiento,PJ,Titular,Min,Goles,Asistencias,Goles+Asistencias,Tarjetas Amarillas,Tarjetas Rojas,Goles Esperados,Asistencias Esperadas,Valor de mercado,jugador_id
0,IRL,Delantero,Brighton,21,2000.0,15.0,8.0,735.0,1.666667,0.666667,2.333333,0.0,0.0,2.8,0.266667,3500000.0,0
1,ENG,Defensa,West Ham,33,1989.0,25.6,24.6,2188.6,1.0,2.4,3.4,3.2,0.0,0.76,3.34,1200000.0,1
2,SCO,Defensa,Brentford,21,2002.0,20.5,19.0,1551.75,1.25,0.5,1.75,5.5,0.25,0.4,0.6,30000000.0,2
3,ENG,PO,Arsenal,25,1998.0,30.4,30.4,2736.0,0.0,0.2,0.2,0.6,0.0,0.0,0.16,32000000.0,3
4,ENG,Delantero,Burnley,30,1993.25,14.0,6.75,619.0,1.25,1.25,2.5,1.0,0.0,1.7,1.225,2500000.0,4


#### Vectorizo columnas categóricas

In [12]:
# Names of the columns whose dtype is `object`; these are the ones that get one-hot encoded.
categoricas = train.select_dtypes(include=['object']).columns

In [13]:
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword. Try the new name first and fall back for older installations so the
# cell runs on either side of the rename.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Fit the encoder on the categorical columns and get a dense one-hot array.
train_cat = encoder.fit_transform(train[categoricas])

In [14]:
# Check the size of the one-hot encoded array.
train_cat.shape

(3166, 197)

In [15]:
# Columns to standardise: every non-object column except the target,
# 'Valor de mercado', which is left out of the feature list.
# NOTE(review): 'jugador_id' is numeric in the preview above, so it appears to
# stay in this list and be scaled as a feature. Confirm that is intended.
numericos = (
    train
    .select_dtypes(exclude=['object'])
    .columns
    .drop('Valor de mercado')
)

In [16]:
# Fit the scaler on the numeric columns, then transform those same columns.
# `scaler` stays available as the fitted object.
scaler = StandardScaler().fit(train[numericos])
scaled_numerical_data = scaler.transform(train[numericos])

In [17]:
# Put the scaled numeric features and the one-hot features side by side in a
# single array (numeric columns first).
train_final = np.hstack((scaled_numerical_data, train_cat))

In [18]:
# Wrap the combined array in a DataFrame.
# NOTE(review): no column names or index are passed, so the frame gets default
# integer column labels and a fresh positional index; it no longer carries
# train's column names or row labels. Confirm downstream code aligns by position.
train_final = pd.DataFrame(train_final)

In [19]:
# Display the assembled feature frame.
train_final

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,200,201,202,203,204,205,206,207,208,209
0,-1.007280,0.909153,0.030089,-0.279110,-0.277306,0.228818,-0.120386,0.111241,-1.006135,-0.429023,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.580883,-1.418917,1.158914,1.605086,1.626733,-0.070637,1.246909,0.443806,0.482454,-0.429023,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-1.007280,1.332439,0.615800,0.969453,0.792537,0.041659,-0.251857,-0.070630,1.552377,0.663744,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.144559,0.485868,1.670080,2.263419,2.343760,-0.519818,-0.488504,-0.553887,-0.727024,-0.429023,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.933842,-0.519435,-0.076404,-0.420992,-0.429251,0.041659,0.339761,0.163205,-0.540951,-0.429023,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3161,0.933842,-1.418917,-1.141334,-0.846639,-0.937484,-0.519818,-0.646269,-0.616243,-0.540951,-0.429023,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3162,-0.791600,0.485868,-1.354320,-1.073651,-1.174572,-0.519818,-0.646269,-0.616243,-1.006135,-0.429023,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3163,2.443604,-2.265488,1.973585,2.586911,2.593947,-0.519818,-0.646269,-0.616243,-0.308359,-0.429023,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3164,1.365203,-0.995632,1.648782,2.240718,2.320183,-0.519818,-0.646269,-0.616243,0.110307,-0.429023,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
