In [1]:
# Clean money values, e.g. '€4.8M' -> 4800000.0
def clean_data_money(df):
    """Convert money strings such as '€4.8M' or '€200K' into plain floats.

    Parameters
    ----------
    df : pandas.Series
        A single column of strings (despite the parameter name). Each value
        may contain the characters '€', ',', ' ', 'M' and 'K' around a number;
        a trailing 'M' means millions and a trailing 'K' means thousands.

    Returns
    -------
    pandas.Series
        Float values on the same index as the input. Missing inputs stay NaN.

    Raises
    ------
    ValueError
        If what remains after stripping the symbols is not a valid float.
    """
    # regex=True is spelled out on purpose: pandas >= 2.0 treats the pattern
    # as a literal string by default, which would leave the symbols in place.
    amounts = df.str.replace('[MK€, ]', '', regex=True).astype(float)

    # The last character selects the multiplier; anything else scales by 1.
    # Doing this vectorised (instead of label-based assignment in a loop)
    # also keeps rows with duplicate index labels from being scaled twice.
    scale = df.str[-1].map({'M': 1000000, 'K': 1000}).fillna(1).astype(float)

    return amounts * scale

In [2]:
# Build the "is Spanish" indicator column
def add_spanish_col(df):
    """Return a list of 1/0 flags, one per value in ``df``.

    ``df`` is iterated element by element (the caller passes the nationality
    column); a value equal to the exact string 'Spain' yields 1, anything
    else yields 0.
    """
    return [1 if nationality == 'Spain' else 0 for nationality in df]

In [3]:
# Modulos para trabajar la data
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Modulos para el modelo
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

# Fix NumPy's global seed so the random steps below (train/test split, random forest) are repeatable between runs
import numpy as np

np.random.seed(42)

# Load the raw data, drop incomplete rows, convert the badly formatted money
# columns (Value and Wage) to floats, add the target flag and remove the
# columns we do not need.
players_raw = pd.read_csv('basic_info.csv').dropna()

# Built as one chain that returns a new frame, so re-running the cell always
# starts from the CSV. The result of .drop() is bound to `df`; calling
# df.drop(...) without assigning it leaves the columns in place.
df = players_raw.assign(
    Value=clean_data_money(players_raw['Value']),
    Wage=clean_data_money(players_raw['Wage']),
    is_spanish=add_spanish_col(players_raw['Nationality']),
).drop(columns=['Unnamed: 0', 'Contract', 'Nationality'])

# Transformer for the string columns: fill gaps, then one-hot encode.
# NOTE(review): 'Name' is presumably close to unique per player, so its one-hot
# columns likely carry little signal for rows unseen in training — confirm.
categorical_features = ['Name', 'Club']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), # empty values are replaced by 'missing'
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Transformer for the numeric money columns (mainly to assign 0 to empty values).
money_feature = ['Wage', 'Value']
money_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0))
])
# Preprocessor that routes each column group to its transformer. Columns not
# listed here do not reach the model (ColumnTransformer's default remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('money', money_transformer, money_feature)
    ])
# Full pipeline: preprocessing followed by the classifier. The step names
# ('preprocessor', 'money', 'imputer', 'model') are what the double-underscore
# keys of a parameter grid refer to, so renaming a step breaks those keys.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier())
])

# Split the frame into the target (y) and the feature columns (X)
y = df['is_spanish']
X = df.drop(columns='is_spanish')

# Hold out a test partition using train_test_split's default proportions
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Fit on the training partition; the score on the held-out partition is the cell output
model.fit(X_train, y_train)
model.score(X_test, y_test)

  temp_data = df.str.replace('[M, K, €]', '').astype(float)


0.9644244530770804

In [11]:
# Bare expression: shows the pipeline's repr as the cell output.
model

In [13]:
# GridSearchCV over the whole Pipeline.
# Keys follow the '<step>__<substep>__<param>' convention, so they must match
# the step names used when the pipeline and its transformers were built.
pipe_grid = {
    'preprocessor__money__imputer__strategy': ['constant', 'mean'],
    'model__n_estimators': [100, 1000],
    'model__max_depth': [None, 5],
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for a
    # classifier it meant 'sqrt', which is accepted by old and new versions.
    'model__max_features': ['sqrt'],
    'model__min_samples_split': [2, 4]
}

# 16 parameter combinations x 5 folds = 80 fits once .fit() is called;
# this cell only constructs the search object.
gs_model = GridSearchCV(model, pipe_grid, cv=5, verbose=2)