In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [2]:
# Load the ratings table, derive each player's age relative to 2025 (bounded
# to the 17-45 range), and keep only rows without any missing value.
df = (
    pd.read_csv('/kaggle/input/pp-haxophone/ratings_features.csv')
    .assign(age=lambda frame: (2025 - frame['born']).clip(lower=17, upper=45))
    .dropna()
)

def get_features_for_rating(rating_type):
    """Return the list of input column names used to model `rating_type`.

    Every rating shares four general-purpose stat columns; recognised rating
    types append their own columns after those. An unrecognised `rating_type`
    yields just the shared columns. A new list is built on every call.
    """
    shared_columns = [
        'Playing Time_stats',
        'Expected_stats',
        'Progression_stats',
        'Per 90 Minutes_stats',
    ]
    extra_columns = {
        'passing': ['Total_passing', 'Short_passing', 'Medium_passing', 'Long_passing', 'Expected_passing'],
        'shooting': ['Standard_shooting', 'Expected_shooting'],
        'defense': ['Tackles_defense', 'Challenges_defense', 'Blocks_defense'],
        # Assumed to be the relevant data for keeping skills.
        'keeping_skills': ['Performance_stats'],
        # Assumes a 'potential_rating' column exists in the dataset.
        'overall_rating': ['age', 'pos', 'potential_rating'],
    }
    return shared_columns + extra_columns.get(rating_type, [])


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import joblib  # for saving models and scalers

def train_model_for_rating(rating_type, model_type):
    """Train, evaluate and persist one regressor for a single rating column.

    The feature columns come from `get_features_for_rating(rating_type)` and
    the target is the `rating_type` column of the module-level `df`. The data
    is split 80/20 (random_state=42), features are standardised with a scaler
    fitted on the training split only, and the hold-out RMSE is printed.

    Side effects: writes `{rating_type}_{model_type}_model.pkl` and
    `{rating_type}_scaler.pkl` to the working directory via joblib.

    Args:
        rating_type: Name of the target column in `df`.
        model_type: One of 'linear', 'random_forest', 'svr', 'xgboost'.

    Returns:
        (model, scaler): the fitted regressor and the fitted StandardScaler.

    Raises:
        ValueError: if `model_type` is not one of the supported names.
    """
    # Validate first: previously an unknown model_type fell through the
    # if/elif chain and only failed later with an UnboundLocalError on `model`,
    # after the split and scaling work had already been done.
    supported_models = ('linear', 'random_forest', 'svr', 'xgboost')
    if model_type not in supported_models:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {supported_models}"
        )

    features = get_features_for_rating(rating_type)
    X = df[features]
    y = df[rating_type]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the scaler on the training split only so the hold-out score is honest.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    if model_type == 'linear':
        model = LinearRegression()
    elif model_type == 'random_forest':
        model = RandomForestRegressor(n_estimators=100, random_state=42)
    elif model_type == 'svr':
        model = SVR()
    else:  # 'xgboost' -- guaranteed by the validation above
        model = XGBRegressor(n_estimators=100, random_state=42)

    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)

    mse = mean_squared_error(y_test, y_pred)
    rmse = mse**0.5
    print(f'{rating_type} Model RMSE: {rmse}')

    # Save the model and scaler
    joblib.dump(model, f'{rating_type}_{model_type}_model.pkl')
    joblib.dump(scaler, f'{rating_type}_scaler.pkl')

    return model, scaler

# Fit one base regressor per skill rating; each rating uses its own algorithm.
passing_model, passing_scaler = train_model_for_rating(
    rating_type='passing', model_type='xgboost'
)
shooting_model, shooting_scaler = train_model_for_rating(
    rating_type='shooting', model_type='linear'
)
defense_model, defense_scaler = train_model_for_rating(
    rating_type='defense', model_type='random_forest'
)
keeping_model, keeping_scaler = train_model_for_rating(
    rating_type='keeping_skills', model_type='svr'
)


passing Model RMSE: 7.371529935726901
shooting Model RMSE: 8.565462676251105
defense Model RMSE: 9.953111194582306
keeping_skills Model RMSE: 15.518528002736412


In [4]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd

def train_potential_model():
    """Train, evaluate and persist a random-forest model for the 'potential' column.

    Reads the module-level `df`. The inputs are four general stat columns plus
    'age' (standardised) and 'pos' (label-encoded to integers). The data is
    split 80/20 (random_state=42) and the hold-out RMSE is printed.

    Side effects: writes `potential_model.pkl`, `potential_num_scaler.pkl`
    (the StandardScaler) and `potential_str_scaler.pkl` (the LabelEncoder)
    to the working directory via joblib.

    Returns:
        (model, label_encoder, scaler): the fitted regressor, the fitted
        position encoder and the fitted numeric scaler, for later reuse.
    """
    numeric_features = ['Playing Time_stats', 'Expected_stats', 'Progression_stats', 'Per 90 Minutes_stats', 'age']
    features = numeric_features + ['pos']
    X = df[features]
    y = df['potential']

    # Split the dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # The split frames are derived from `df`; take explicit copies so the
    # column assignments below act on independent frames and do not raise
    # pandas' SettingWithCopyWarning.
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Learn the position vocabulary from every row so a position that happens
    # to land only in the hold-out split cannot make transform() fail. The
    # codes are the sorted unique labels, so they are unchanged whenever the
    # training split already contains every position.
    label_encoder = LabelEncoder()
    label_encoder.fit(X['pos'])
    X_train['pos'] = label_encoder.transform(X_train['pos'])
    X_test['pos'] = label_encoder.transform(X_test['pos'])

    # Standardize numeric features (statistics come from the training split only)
    scaler = StandardScaler()
    X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
    X_test[numeric_features] = scaler.transform(X_test[numeric_features])

    # Train a model (Random Forest)
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Predict on test set
    y_pred = model.predict(X_test)

    # Evaluate performance
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse**0.5
    print(f'Potential Model RMSE: {rmse}')
    joblib.dump(model, 'potential_model.pkl')
    joblib.dump(scaler, 'potential_num_scaler.pkl')
    joblib.dump(label_encoder, 'potential_str_scaler.pkl')

    return model, label_encoder, scaler  # Return model, label encoder, and scaler for later use

# Train the potential model and keep the fitted model, position encoder and
# numeric scaler as notebook-level names; later cells reference all three.
potential_model, p_label_encoder, p_scaler = train_potential_model()


Potential Model RMSE: 3.981986800909586


In [5]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pandas as pd

def get_potential_predictions(df, potential_model, p_scaler, label_encoder=None):
    """Predict 'potential' for every row of `df` with the trained potential model.

    Args:
        df: Frame containing the four general stat columns, 'age' and 'pos'.
        potential_model: Fitted regressor exposing `predict`.
        p_scaler: Scaler fitted on the five numeric columns during training.
        label_encoder: Encoder fitted on 'pos' during training. When omitted,
            the notebook-level `p_label_encoder` returned by
            `train_potential_model()` is used.

    Returns:
        The array returned by `potential_model.predict`, one value per row.
    """
    numerical_features = ['Playing Time_stats', 'Expected_stats', 'Progression_stats', 'Per 90 Minutes_stats', 'age']

    # Reuse the encoder learned at training time. Fitting a fresh encoder on
    # the frame being scored assigns codes from *its* sorted unique positions,
    # which silently disagree with the training codes whenever the two sets of
    # positions differ.
    if label_encoder is None:
        label_encoder = p_label_encoder

    numerical_data_transformed = p_scaler.transform(df[numerical_features])

    # Rebuild a frame with the same column names and order used for training
    # (numeric columns first, then 'pos') instead of anonymous integer labels.
    combined_features = pd.DataFrame(
        numerical_data_transformed, columns=numerical_features, index=df.index
    )
    combined_features['pos'] = label_encoder.transform(df['pos'])

    # Make predictions using the potential model
    return potential_model.predict(combined_features)


In [6]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

def get_meta_features_with_potential(df):
    """Build the input frame for the meta model from the base-model outputs.

    Each base model scores `df` using its own scaler and feature columns, and
    the potential model is scored through `get_potential_predictions`. The
    result has the columns 'passing', 'shooting', 'defense', 'keeping_skills',
    'potential', 'age' and 'pos_encoded', in that order.

    Note: a 'pos_encoded' column (from the trained `p_label_encoder`) is
    written onto the `df` that was passed in.
    """
    base_learners = {
        'passing': (passing_model, passing_scaler),
        'shooting': (shooting_model, shooting_scaler),
        'defense': (defense_model, defense_scaler),
        'keeping_skills': (keeping_model, keeping_scaler),
    }

    meta_columns = {}
    for rating, (model, scaler) in base_learners.items():
        rating_inputs = df[get_features_for_rating(rating)]
        meta_columns[rating] = model.predict(scaler.transform(rating_inputs))

    meta_columns['potential'] = get_potential_predictions(df, potential_model, p_scaler)
    meta_columns['age'] = df['age']
    meta_df = pd.DataFrame(meta_columns)

    # Encoded position is stored on the input frame and appended as the last column.
    df['pos_encoded'] = p_label_encoder.transform(df['pos'])
    return pd.concat([meta_df, df[['pos_encoded']]], axis=1)


# Second stage: learn the overall rating from the base-model predictions.
y_meta = df['overallrating']
X_meta = get_meta_features_with_potential(df)

# Hold out 20% of the rows for evaluation.
X_train_meta, X_test_meta, y_train_meta, y_test_meta = train_test_split(
    X_meta, y_meta, test_size=0.2, random_state=42
)

meta_classifier = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train_meta, y_train_meta
)

# Persist the fitted meta model next to the base models.
joblib.dump(meta_classifier, 'meta_model.pkl')

# Report the hold-out error.
meta_preds = meta_classifier.predict(X_test_meta)
meta_mse = mean_squared_error(y_test_meta, meta_preds)
meta_rmse = meta_mse**0.5
print(f'Meta Classifier RMSE: {meta_rmse}')




Meta Classifier RMSE: 3.9361022478427357


In [7]:

def generate_predictions_and_save(test_df):
    """Score `test_df` with every trained model and attach the results as columns.

    Adds 'age', the five '<skill>_ratings' columns and 'overall_ratings' to
    `test_df` (the meta-feature step additionally adds 'pos_encoded') and
    returns that same frame. Despite the name, nothing is written to disk
    here; the caller is responsible for saving the returned frame.
    """
    test_df['age'] = (2025 - test_df['born']).clip(lower=17, upper=45)

    def _score(model, scaler, rating):
        # Run one base model on its own scaled feature columns.
        return model.predict(scaler.transform(test_df[get_features_for_rating(rating)]))

    passing_preds = _score(passing_model, passing_scaler, 'passing')
    shooting_preds = _score(shooting_model, shooting_scaler, 'shooting')
    defense_preds = _score(defense_model, defense_scaler, 'defense')
    keeping_preds = _score(keeping_model, keeping_scaler, 'keeping_skills')
    potential_preds = get_potential_predictions(test_df, potential_model, p_scaler)

    # Attach the per-skill predictions; insertion order fixes the column order.
    rating_columns = {
        'defense_ratings': defense_preds,
        'passing_ratings': passing_preds,
        'shooting_ratings': shooting_preds,
        'keeping_ratings': keeping_preds,
        'potential_ratings': potential_preds,
    }
    for column_name, values in rating_columns.items():
        test_df[column_name] = values

    # The overall rating comes from the meta model applied to the meta features.
    test_df['overall_ratings'] = meta_classifier.predict(
        get_meta_features_with_potential(test_df)
    )
    return test_df

# Read the table to be scored, keep only complete rows, and attach predictions.
test_df = pd.read_csv(r'/kaggle/input/pp-haxophone/merged_output.csv').dropna()
test_df_with_predictions = generate_predictions_and_save(test_df)




In [8]:
# Write the scored frame to the working directory, omitting the index column.
test_df_with_predictions.to_csv('krazi_database.csv', index=False)

In [9]:
# Restrict to numeric columns and report each column's observed range.
numerical_df = df.select_dtypes(include='number')
min_values, max_values = numerical_df.min(), numerical_df.max()

print("Minimum values:\n", min_values)
print("\nMaximum values:\n", max_values)

Minimum values:
 born                       0.000000
ID                      1819.000000
Tackles_defense            0.000000
Challenges_defense         0.000000
Blocks_defense             0.000000
Total_passing              0.000000
Short_passing              0.000000
Medium_passing             0.000000
Long_passing               0.000000
Expected_passing          -0.250000
Standard_shooting          0.000000
Expected_shooting          0.000000
Playing Time_stats         0.500000
Performance_stats          0.000000
Expected_stats             0.000000
Progression_stats          0.000000
Per 90 Minutes_stats       0.000000
overallrating             51.000000
potential                 61.000000
passing                   15.666667
shooting                  16.142857
defense                   10.600000
keeping_skills             2.400000
age                       17.000000
pos_encoded                0.000000
dtype: float64

Maximum values:
 born                     2008.000000
ID           