# Project - Option 1

## Task 1

In [32]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline,Transformer
from pyspark.ml.feature import Imputer,StandardScaler,StringIndexer,OneHotEncoder, VectorAssembler
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import numpy as np
import os
import sys
from pyspark.sql.functions import udf, col, when
from pyspark.sql.types import ArrayType, FloatType, StringType, IntegerType
from pyspark.sql import functions as F
from pyspark.sql.types import FloatType

In [2]:
appName = "Big Data Analytics"
master = "local[*]"

# Create Configuration object for Spark.
conf = pyspark.SparkConf()\
    .set('spark.driver.host','127.0.0.1')\
    .setAppName(appName)\
    .setMaster(master)

spark = SparkSession.builder.config(conf = conf).getOrCreate()

24/11/13 11:59:31 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
db_properties={}
db_properties['username']="postgres"
db_properties['password']=""
db_properties['url']= "jdbc:postgresql://localhost:5432/postgres"
db_properties['table']="fifa.fifa"
db_properties['driver']="org.postgresql.Driver"

In [4]:
df_male = spark.read.csv('./Data/players_15.csv', header = True)
combined_df = df_male.withColumn("year", lit(2015))
combined_df = combined_df.withColumn("record_id", monotonically_increasing_id()).select("record_id", *combined_df.columns)
combined_df = combined_df.withColumn("gender", lit("Male"))
folder_path = "./Data"
for file_name in os.listdir(folder_path):
    if file_name == "players_15.csv":
        continue
    year = "20" + file_name[-6:-4]
    file_path = os.path.join(folder_path, file_name)
    df_read = spark.read.csv(file_path, header = True)
    df_read = df_read.withColumn("year", lit(int(year)))
    df_read = df_read.withColumn("record_id", monotonically_increasing_id()).select("record_id", *df_read.columns)
    if "female" in file_name:
        df_read = df_read.withColumn("gender", lit("Female"))
    else:
        df_read = df_read.withColumn("gender", lit("Male"))
    combined_df = combined_df.union(df_read)

In [5]:
# Write to PostgreSQL
combined_df.write.format("jdbc")\
.mode("overwrite")\
.option("url", db_properties['url'])\
.option("dbtable", db_properties['table'])\
.option("user", db_properties['username'])\
.option("password", db_properties['password'])\
.option("Driver", db_properties['driver'])\
.save()

24/11/13 11:59:36 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [6]:
# Read from PostgreSQL to verify
df_from_postgres = spark.read \
    .format("jdbc") \
    .option("url", db_properties['url'])\
    .option("dbtable", db_properties['table'])\
    .option("user", db_properties['username'])\
    .option("password", db_properties['password'])\
    .option("Driver", db_properties['driver'])\
    .load()

In [7]:
df_from_postgres.columns

['record_id',
 'sofifa_id',
 'player_url',
 'short_name',
 'long_name',
 'player_positions',
 'overall',
 'potential',
 'value_eur',
 'wage_eur',
 'age',
 'dob',
 'height_cm',
 'weight_kg',
 'club_team_id',
 'club_name',
 'league_name',
 'league_level',
 'club_position',
 'club_jersey_number',
 'club_loaned_from',
 'club_joined',
 'club_contract_valid_until',
 'nationality_id',
 'nationality_name',
 'nation_team_id',
 'nation_position',
 'nation_jersey_number',
 'preferred_foot',
 'weak_foot',
 'skill_moves',
 'international_reputation',
 'work_rate',
 'body_type',
 'real_face',
 'release_clause_eur',
 'player_tags',
 'player_traits',
 'pace',
 'shooting',
 'passing',
 'dribbling',
 'defending',
 'physic',
 'attacking_crossing',
 'attacking_finishing',
 'attacking_heading_accuracy',
 'attacking_short_passing',
 'attacking_volleys',
 'skill_dribbling',
 'skill_curve',
 'skill_fk_accuracy',
 'skill_long_passing',
 'skill_ball_control',
 'movement_acceleration',
 'movement_sprint_speed',


In [8]:
df_from_postgres.count()

144323

## Task 2

In [9]:
def read_from_spark(spark, db_properties):
    df_from_postgres = spark.read \
        .format("jdbc") \
        .option("url", db_properties['url'])\
        .option("dbtable", db_properties['table'])\
        .option("user", db_properties['username'])\
        .option("password", db_properties['password'])\
        .option("Driver", db_properties['driver'])\
        .load()
    df = df_from_postgres.filter(df_from_postgres["gender"] == "Male")
    return df

In [10]:
def get_top_clubs_with_contracts_ending(spark, db_properties, X, Y, Z):
    df = read_from_spark(spark, db_properties)
    df_filtered = df.filter(col("year") == X)
    df_expiring = df_filtered.filter(col("club_contract_valid_until").cast("int") >= Z)
    result = df_expiring.groupBy("club_name") \
        .count() \
        .orderBy(col("count").desc()) \
        .limit(Y)
    return result.collect()

In [11]:
def find_clubs_by_average_age(spark, db_properties, X, Y, highest=True):
    if X <= 0:
        return "X must be a positive integer"
    if Y < 2015 or Y > 2022:
        return "Y must be a year between 2015 and 2022 inclusively"
    df = read_from_spark(spark, db_properties)
    
    # Filter data for specified year Y
    df_filtered = df.filter(col("year") == Y)
    avg_age_per_club = df_filtered.groupBy("club_name") \
        .agg(round(avg("age").cast("float"),2).alias("average_age"))
    if highest:
        sorted_clubs = avg_age_per_club.orderBy(desc("average_age"))
    else:
        sorted_clubs = avg_age_per_club.orderBy(asc("average_age"))

    top_clubs = sorted_clubs.limit(X)
    last_club = top_clubs.collect()[-1]
    threshold_age = last_club["average_age"]
    if highest:
        result_clubs = sorted_clubs.filter(col("average_age") >= threshold_age).collect()
    else:
        result_clubs = sorted_clubs.filter(col("average_age") <= threshold_age).collect()
    return result_clubs

In [12]:
def get_most_popular_nationality(spark, db_properties):
    df = read_from_spark(spark, db_properties)
    # df_filtered = df.filter((col("year") >= 2015) & (col("year") <= 2022))
    nationality_counts = df.groupBy("year", "nationality_name") \
        .agg(count("*").alias("count"))
    # Create a window partitioned by year and ordered by count descending
    window = Window.partitionBy("year").orderBy(desc("count"))
    
    # Add row number within each year partition
    ranked_nationalities = nationality_counts.withColumn("rank", row_number().over(window))
    # Filter for the top nationality for each year
    most_popular_nationalities = ranked_nationalities.filter(col("rank") == 1) \
        .select("year", "nationality_name", "count") \
        .orderBy("year")
    
    return most_popular_nationalities.collect()

<h1>Task-2.1</h1>

In [13]:
top_clubs = get_top_clubs_with_contracts_ending(spark=spark, db_properties=db_properties, X=2021, Y=10, Z=2023)

In [14]:
top_clubs

[Row(club_name='GwangJu FC', count=28),
 Row(club_name='Zamora Fútbol Club', count=27),
 Row(club_name='Club Plaza de Deportes Colonia', count=27),
 Row(club_name='SL Benfica', count=26),
 Row(club_name='Club Deportivo El Nacional', count=26),
 Row(club_name='Sociedad Deportiva Aucas', count=26),
 Row(club_name='Gangwon FC', count=26),
 Row(club_name='Club Atlético Nacional Potosí', count=26),
 Row(club_name='Busan IPark', count=26),
 Row(club_name='Club Sportivo Luqueño', count=25)]

<h1>Task-2.2</h1>

In [15]:
clubs_by_age = find_clubs_by_average_age(spark=spark, db_properties=db_properties, X=10, Y=2017, highest=False)

In [16]:
clubs_by_age

[Row(club_name='Sevilla Atlético', average_age=19.920000076293945),
 Row(club_name='Swindon Town', average_age=21.3700008392334),
 Row(club_name='CD Huachipato', average_age=21.40999984741211),
 Row(club_name='FC Nordsjælland', average_age=21.40999984741211),
 Row(club_name='FC Twente', average_age=21.59000015258789),
 Row(club_name='Envigado FC', average_age=21.610000610351562),
 Row(club_name='KRC Genk', average_age=21.6299991607666),
 Row(club_name='Crewe Alexandra', average_age=21.81999969482422),
 Row(club_name='Barnsley', average_age=21.8700008392334),
 Row(club_name='Ajax', average_age=21.969999313354492)]

<h1>Task-2.3</h1>

In [17]:
popular_nationalities = get_most_popular_nationality(spark=spark, db_properties=db_properties)

In [18]:
popular_nationalities

[Row(year=2015, nationality_name='England', count=1627),
 Row(year=2016, nationality_name='England', count=1519),
 Row(year=2017, nationality_name='England', count=1627),
 Row(year=2018, nationality_name='England', count=1633),
 Row(year=2019, nationality_name='England', count=1625),
 Row(year=2020, nationality_name='England', count=1670),
 Row(year=2021, nationality_name='England', count=1685),
 Row(year=2022, nationality_name='England', count=1719)]

<h1>Task-3</h1>

In [46]:
spark_df = read_from_spark(spark, db_properties)

In [47]:
# Define UDF for safe evaluation
def safe_eval(x):
    try:
        return float(eval(str(x)))
    except:
        return None

In [48]:
# Define UDF for trait assignment
def assign_traits(player_positions, *traits):
    traits = [float(t) if t is not None else None for t in traits]
    if player_positions and 'GK' in player_positions:
        return traits[6:12]
    else:
        return traits[:6]

In [49]:
def data_engineering_pipeline(spark_df):
    # Drop columns
    columns_to_drop = ['record_id', 'sofifa_id', 'player_url', 'short_name', 'long_name',
                       'dob', 'club_name', 'league_name', 'club_position', 'club_jersey_number', 'club_loaned_from',
                       'club_contract_valid_until', 'nationality_id', 'nationality_name', 'nation_team_id',
                       'nation_position', 'nation_jersey_number', 'real_face', 'player_tags', 'player_traits', 
                       'player_face_url', 'club_logo_url', 'club_flag_url', 'nation_logo_url', 'nation_flag_url',
                       'year', 'gender', 'club_joined']

    positions = ['ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw',
             'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm',
             'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb',
             'rcb', 'rb', 'gk']
    attacking_positions = ['ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw',
                           'lam', 'cam', 'ram']
    midfield_positions = ['lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm',
                          'ldm', 'cdm', 'rdm']
    defensive_positions = ['rwb', 'lb', 'lcb', 'cb', 'lwb', 'rcb', 'rb',
                           'ldm', 'cdm', 'rdm']
    gk_positions = ['gk']

    # Define main traits
    main_traits_to_merge = ['pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 
                            'goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking',
                            'goalkeeping_positioning', 'goalkeeping_reflexes', 'goalkeeping_speed']

    # List of skill columns to average
    skill_columns = ['skill_dribbling', 'skill_curve', 'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control']

    #Trait1 = pace/goalkeeping_diving
    #Trait2 = shooting/goalkeeping_handling
    #Trait3 = passing/goalkeeping_kicking
    #Trait4 = dribbling/goalkeeping_positioning
    #Trait5 = defending/goalkeeping_reflexes
    #Trait6 = physic/goalkeeping_speed
    
    safe_eval_udf = F.udf(safe_eval, FloatType())
    assign_traits_udf = udf(assign_traits, ArrayType(FloatType()))
    
    #Drop Columns
    spark_df = spark_df.drop(*columns_to_drop)
    
    # Apply safe_eval to position columns
    for pos in positions:
        spark_df = spark_df.withColumn(pos, safe_eval_udf(pos))

    #Add new columns 'average_val_attacking', 'average_val_midfield', 'average_val_defensive', 'average_val_gk'
    spark_df = spark_df.withColumn(
        'average_val_attacking', 
        F.aggregate(
            F.array(*[F.col(pos) for pos in attacking_positions]),
            F.lit(0.0),
            lambda acc, x: acc + x,
            lambda acc: acc / F.size(F.array(*[F.col(pos) for pos in attacking_positions]))
        )
    )
    spark_df = spark_df.withColumn(
        'average_val_midfield', 
        F.aggregate(
            F.array(*[F.col(pos) for pos in midfield_positions]),
            F.lit(0.0),
            lambda acc, x: acc + x,
            lambda acc: acc / F.size(F.array(*[F.col(pos) for pos in midfield_positions]))
        )
    )
    spark_df = spark_df.withColumn(
        'average_val_defensive', 
        F.aggregate(
            F.array(*[F.col(pos) for pos in defensive_positions]),
            F.lit(0.0),
            lambda acc, x: acc + x,
            lambda acc: acc / F.size(F.array(*[F.col(pos) for pos in defensive_positions]))
        )
    )
    spark_df = spark_df.withColumn(
        'average_val_gk', 
        F.aggregate(
            F.array(*[F.col(pos) for pos in gk_positions]),
            F.lit(0.0),
            lambda acc, x: acc + x,
            lambda acc: acc / F.size(F.array(*[F.col(pos) for pos in gk_positions]))
        )
    )

    # Apply trait assignment
    spark_df = spark_df.withColumn('traits', 
        assign_traits_udf(col('player_positions'), *[col(trait) for trait in main_traits_to_merge])
    )
    
    # Extract individual traits
    for i in range(1, 7):
        spark_df = spark_df.withColumn(f'trait{i}', col('traits').getItem(i-1))

    # Drop unnecessary columns
    spark_df = spark_df.drop(*positions, *main_traits_to_merge, 'player_positions', 'traits')
    
    # Drop rows with null values in specific columns
    spark_df = spark_df.na.drop(subset=['wage_eur', 'value_eur', 'trait6'])
    
    # Drop additional columns
    spark_df = spark_df.drop('release_clause_eur', 'league_level')
    
    # Fill null values in club_team_id with -1
    spark_df = spark_df.fillna({'club_team_id': -1})

    #Average out skills as highly correlated
    spark_df = spark_df.withColumn(
        'corr_av_skills', 
        F.aggregate(
            F.array(*[F.col(pos) for pos in skill_columns]),
            F.lit(0.0),
            lambda acc, x: acc + x,
            lambda acc: acc / F.size(F.array(*[F.col(pos) for pos in skill_columns]))
        )
    )
    #Drop the skills as added one new average skill column
    spark_df = spark_df.drop('skill_dribbling', 'skill_curve', 'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control')

    #Drop movement_acceleration as highly correlated to movement_speed
    spark_df = spark_df.drop('movement_acceleration')

    #Drop movement_acceleration as highly correlated to movement_speed
    spark_df = spark_df.drop('preferred_foot')

    s = spark_df.toPandas()
    print(s)
    print(f'Shape {s.shape}')
    return s

In [54]:
s = data_engineering_pipeline(spark_df)

                                                                                

       overall potential  value_eur wage_eur age height_cm weight_kg  \
0           64        75  1100000.0    500.0  22       189        77   
1           64        76  1400000.0   1000.0  19       179        69   
2           64        65   550000.0   2000.0  29       188        79   
3           64        75  1300000.0   1000.0  20       178        79   
4           64        73  1300000.0   3000.0  21       168        68   
...        ...       ...        ...      ...  ..       ...       ...   
140176      64        72  1300000.0   5000.0  22       191        84   
140177      64        64   300000.0   3000.0  31       185        80   
140178      64        69   800000.0   1000.0  25       177        72   
140179      64        71  1100000.0   2000.0  21       174        70   
140180      64        68   850000.0   4000.0  24       191        87   

       club_team_id weak_foot skill_moves  ... average_val_midfield  \
0             211.0         2           1  ...            29.181

From correlation analysis
average - skill columns
remove - movement_acceleration
Preffered foot is not correlated to overall. so drop

Models

In [55]:
s['mentality_composure'].isna().value_counts()

mentality_composure
False    108947
True      31234
Name: count, dtype: int64

In [56]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from sklearn.preprocessing import OrdinalEncoder
from itertools import product


#For mentality_composure

In [58]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Prepare the data
train_data = s.dropna(subset=['mentality_composure'])
X = train_data[['mentality_aggression', 'mentality_interceptions', 'mentality_positioning', 'mentality_vision', 'mentality_penalties']]
Y = train_data['mentality_composure']

# Fit the linear regression model
model = LinearRegression()
model.fit(X, Y)

# Predict missing values
X_missing = s[s['mentality_composure'].isnull()][['mentality_aggression', 'mentality_interceptions', 'mentality_positioning', 'mentality_vision', 'mentality_penalties']]
predicted_values = model.predict(X_missing)
rounded_predicted_values = np.round(predicted_values).astype(int)
# Impute the predicted values
s.loc[s['mentality_composure'].isnull(), 'mentality_composure'] = rounded_predicted_values

In [59]:
s['mentality_composure'].isna().value_counts()

mentality_composure
False    140181
Name: count, dtype: int64

In [60]:
# Load the data
df = s

# Separate features and target
X = df.drop('overall', axis=1)
y = df['overall']

# Split the data into train+validation and test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split train+validation into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

In [61]:
# Check if MPS is available
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: mps


In [62]:
# Assuming you already have X_train, X_val, X_test

# Define the order for work_rate
work_rate_order = ['Low/Low', 'Low/Medium', 'Low/High',
                   'Medium/Low', 'Medium/Medium', 'Medium/High',
                   'High/Low', 'High/Medium', 'High/High']

# Initialize OrdinalEncoder for work_rate
work_rate_encoder = OrdinalEncoder(categories=[work_rate_order])

# Encode work_rate
X_train['work_rate_encoded'] = work_rate_encoder.fit_transform(X_train[['work_rate']])
X_val['work_rate_encoded'] = work_rate_encoder.transform(X_val[['work_rate']])
X_test['work_rate_encoded'] = work_rate_encoder.transform(X_test[['work_rate']])

# Perform one-hot encoding for body_type using pd.get_dummies
X_train_body_type_dummies = pd.get_dummies(X_train['body_type'], prefix='body_type')
X_val_body_type_dummies = pd.get_dummies(X_val['body_type'], prefix='body_type')
X_test_body_type_dummies = pd.get_dummies(X_test['body_type'], prefix='body_type')

# Combine the original DataFrames with the encoded columns
X_train = pd.concat([X_train, X_train_body_type_dummies], axis=1)
X_val = pd.concat([X_val, X_val_body_type_dummies], axis=1)
X_test = pd.concat([X_test, X_test_body_type_dummies], axis=1)

# Drop the original 'work_rate' and 'body_type' columns
X_train = X_train.drop(['work_rate', 'body_type'], axis=1)
X_val = X_val.drop(['work_rate', 'body_type'], axis=1)
X_test = X_test.drop(['work_rate', 'body_type'], axis=1)

In [63]:
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)
y_train = pd.to_numeric(y_train)
y_val = pd.to_numeric(y_val)
y_test = pd.to_numeric(y_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.FloatTensor(y_train.values).reshape(-1, 1).to(device)
X_val_tensor = torch.FloatTensor(X_val_scaled).to(device)
y_val_tensor = torch.FloatTensor(y_val.values).reshape(-1, 1).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.FloatTensor(y_test.values).reshape(-1, 1).to(device)

# Create DataLoaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

In [64]:
X_train.isna().value_counts()

potential  value_eur  wage_eur  age    height_cm  weight_kg  club_team_id  weak_foot  skill_moves  international_reputation  attacking_crossing  attacking_finishing  attacking_heading_accuracy  attacking_short_passing  attacking_volleys  movement_sprint_speed  movement_agility  movement_reactions  movement_balance  power_shot_power  power_jumping  power_stamina  power_strength  power_long_shots  mentality_aggression  mentality_interceptions  mentality_positioning  mentality_vision  mentality_penalties  mentality_composure  defending_marking_awareness  defending_standing_tackle  defending_sliding_tackle  average_val_attacking  average_val_midfield  average_val_defensive  average_val_gk  trait1  trait2  trait3  trait4  trait5  trait6  corr_av_skills  work_rate_encoded  body_type_Lean (170-)  body_type_Lean (170-185)  body_type_Lean (185+)  body_type_Normal (170-)  body_type_Normal (170-185)  body_type_Normal (185+)  body_type_Stocky (170-)  body_type_Stocky (170-185)  body_type_Stocky (1

In [69]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Assuming X_train, X_val, X_test, y_train, y_val, y_test are already defined

# Create the Linear Regression model
linear_model = LinearRegression()
# linear_model.fit(X_train.drop(['mentality_composure'], axis=1), y_train)
# # Define a simple parameter grid for hyperparameter tuning
param_grid = {'fit_intercept': [True, False]}

# Set up the grid search with cross-validation
grid_search = GridSearchCV(estimator=linear_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)

# Fit the grid search model on the training data
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = -grid_search.best_score_  # Convert negative MSE to positive

print(f"Best parameters: {best_params}")
print(f"Best cross-validated MSE: {best_score}")

# Train the Linear Regression model with the best parameters on the full training set
best_linear_model = LinearRegression(**best_params)
best_linear_model.fit(X_train, y_train)

# Validate on validation data
val_predictions = best_linear_model.predict(X_val)
val_mse = mean_squared_error(y_val, val_predictions)
val_r2 = r2_score(y_val, val_predictions)

print(f"Validation MSE: {val_mse}")
print(f"Validation R^2: {val_r2}")

# Evaluate the model on the test data
test_predictions = best_linear_model.predict(X_test)
test_mse = mean_squared_error(y_test, test_predictions)
test_r2 = r2_score(y_test, test_predictions)

print(f"Test MSE: {test_mse}")
print(f"Test R^2: {test_r2}")

Best parameters: {'fit_intercept': False}
Best cross-validated MSE: 3.216390474750002
Validation MSE: 3.2322356988764955
Validation R^2: 0.9352421616598825
Test MSE: 3.1640544671530844
Test R^2: 0.9370388300045707


In [97]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Assuming X_train, X_val, X_test, y_train, y_val, y_test are already defined

# Define the parameter grid for RandomizedSearchCV
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Create a base model
rf_model = RandomForestRegressor(random_state=42)

# Set up RandomizedSearchCV
rf_random = RandomizedSearchCV(estimator=rf_model, param_distributions=param_grid, 
                               n_iter=100, cv=5, verbose=2, random_state=42, n_jobs=-1)

# Fit the random search model
rf_random.fit(X_train.drop(['mentality_composure'], axis=1), y_train)

# Get the best parameters
best_params = rf_random.best_params_
print("Best parameters:", best_params)

# Create a new model with the best parameters
best_rf_model = RandomForestRegressor(**best_params, random_state=42)

# Train the model on the full training set
best_rf_model.fit(X_train.drop(['mentality_composure'], axis=1), y_train)

# Validate the model
val_predictions = best_rf_model.predict(X_val.drop(['mentality_composure'], axis=1))
val_mse = mean_squared_error(y_val, val_predictions)
val_r2 = r2_score(y_val, val_predictions)

print(f"Validation MSE: {val_mse}")
print(f"Validation R^2: {val_r2}")

# Evaluate the model on the test set
test_predictions = best_rf_model.predict(X_test.drop(['mentality_composure'], axis=1))
test_mse = mean_squared_error(y_test, test_predictions)
test_r2 = r2_score(y_test, test_predictions)

print(f"Test MSE: {test_mse}")
print(f"Test R^2: {test_r2}")

# Feature importance
feature_importance = best_rf_model.feature_importances_
feature_names = X_train.drop(['mentality_composure'], axis=1).columns
feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importance})
feature_importance_df = feature_importance_df.sort_values('importance', ascending=False).head(10)

print("Top 10 Most Important Features:")
print(feature_importance_df)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=  24.9s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.0s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.0s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.0s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=  57.0s




[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=  28.8s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=  56.4s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=  30.8s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=  56.9s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=  24.7s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.0s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time= 1.0min
[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=  13.8s
[CV] END max_depth=10, max_features=

150 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
150 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/aravjain/miniforge3/envs/spark_env/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/aravjain/miniforge3/envs/spark_env/lib/python3.9/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/Users/aravjain/miniforge3/envs/spark_env/lib/python3.9/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Users/aravjain/miniforge3/envs/spark_env/lib/python3.9/site-pack

Best parameters: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 50}
Validation MSE: 0.5154926366757322
Validation R^2: 0.9897592357453177
Test MSE: 0.49938213689053756
Test R^2: 0.9898676910908777
Top 10 Most Important Features:
                  feature  importance
1               value_eur    0.178956
17     movement_reactions    0.109087
2                wage_eur    0.107092
33   average_val_midfield    0.068376
39                 trait4    0.057930
32  average_val_attacking    0.053525
40                 trait5    0.049282
34  average_val_defensive    0.037945
0               potential    0.037661
38                 trait3    0.032995
[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time= 1.8min
[CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.0s
[CV] END max_depth=20, max_features=sqrt, min_samples_

In [70]:
# Model 1: Multi-layer Perceptron (MLP)
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

In [81]:
class ResidualBlock(nn.Module):
    def __init__(self, in_features):
        super(ResidualBlock, self).__init__()
        self.fc1 = nn.Linear(in_features, in_features)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features, in_features)

    def forward(self, x):
        residual = x
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x += residual
        return x

class ResNet(nn.Module):
    def __init__(self, input_size, num_blocks):
        super(ResNet, self).__init__()
        self.fc_in = nn.Linear(input_size, 64)
        self.blocks = nn.Sequential(*[ResidualBlock(64) for _ in range(num_blocks)])
        self.fc_out = nn.Linear(64, 1)

    def forward(self, x):
        x = self.fc_in(x)
        x = self.blocks(x)
        x = self.fc_out(x)
        return x

In [79]:
from sklearn.metrics import r2_score

In [75]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=100):
    model.to(device)
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, targets in val_loader:
                outputs = model(inputs)
                val_loss += criterion(outputs, targets).item()

        train_loss /= len(train_loader)
        val_loss /= len(val_loader)
        
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
    
    return val_loss

In [82]:
import torch
from sklearn.metrics import r2_score

def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=50):
    model.to(device)
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        train_true = []
        train_pred = []
        
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            
            train_true.extend(targets.cpu().numpy())
            train_pred.extend(outputs.detach().cpu().numpy())

        model.eval()
        val_loss = 0.0
        val_true = []
        val_pred = []
        
        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                val_loss += criterion(outputs, targets).item()
                
                val_true.extend(targets.cpu().numpy())
                val_pred.extend(outputs.cpu().numpy())

        train_loss /= len(train_loader)
        val_loss /= len(val_loader)
        
        train_r2 = r2_score(train_true, train_pred)
        val_r2 = r2_score(val_true, val_pred)
        
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], '
                  f'Train Loss: {train_loss:.4f}, Train R2: {train_r2:.4f}, '
                  f'Val Loss: {val_loss:.4f}, Val R2: {val_r2:.4f}')
    
    return val_loss, val_r2

# # Usage example:
# best_val_loss, best_val_r2 = train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=100)
# print(f'Best Validation Loss: {best_val_loss:.4f}, Best Validation R2: {best_val_r2:.4f}')

In [83]:
# Hyperparameter tuning
def tune_hyperparameters(model_class, param_grid):
    best_val_loss = float('inf')
    best_params = None
    best_model = None

    for params in product(*param_grid.values()):
        current_params = dict(zip(param_grid.keys(), params))
        print(f"Training with parameters: {current_params}")

        if model_class == MLP:
            model = model_class(input_size=X_train.shape[1], hidden_size=current_params['hidden_size'])
        else:  # ResNet
            model = model_class(input_size=X_train.shape[1], num_blocks=current_params['num_blocks'])

        optimizer = optim.Adam(model.parameters(), lr=current_params['lr'])
        criterion = nn.MSELoss()
        train_loader = DataLoader(train_dataset, batch_size=current_params['batch_size'], shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=current_params['batch_size'])

        val_loss, val_r2 = train_model(model, train_loader, val_loader, criterion, optimizer)

        if val_loss < best_val_loss:
            best_val_r2 = val_r2
            best_val_loss = val_loss
            best_params = current_params
            best_model = model.state_dict()

    return best_params, best_val_loss, best_model, best_val_r2

In [84]:
# Define hyperparameter grids
mlp_param_grid = {
    'lr': [0.001, 0.01],
    'batch_size': [32, 64],
    'hidden_size': [32, 64]
}

resnet_param_grid = {
    'lr': [0.001, 0.01],
    'batch_size': [32, 64],
    'num_blocks': [2, 3]
}

In [85]:
# Tune and train models
print("Tuning MLP model...")
mlp_best_params, mlp_best_loss, mlp_best_model, mlp_best_r2 = tune_hyperparameters(MLP, mlp_param_grid)
print(f"Best MLP parameters: {mlp_best_params}")
print(f"Best MLP validation loss: {mlp_best_loss}")
print(f"Best MLP validation loss: {mlp_best_r2}")

Tuning MLP model...
Training with parameters: {'lr': 0.001, 'batch_size': 32, 'hidden_size': 32}
Epoch [10/50], Train Loss: 0.8847, Train R2: 0.9823, Val Loss: 0.8576, Val R2: 0.9828
Epoch [20/50], Train Loss: 0.6477, Train R2: 0.9871, Val Loss: 0.6679, Val R2: 0.9866
Epoch [30/50], Train Loss: 0.5924, Train R2: 0.9882, Val Loss: 0.5954, Val R2: 0.9881
Epoch [40/50], Train Loss: 0.5653, Train R2: 0.9887, Val Loss: 0.5672, Val R2: 0.9886
Epoch [50/50], Train Loss: 0.5567, Train R2: 0.9889, Val Loss: 0.5739, Val R2: 0.9885
Training with parameters: {'lr': 0.001, 'batch_size': 32, 'hidden_size': 64}
Epoch [10/50], Train Loss: 0.7999, Train R2: 0.9840, Val Loss: 0.8791, Val R2: 0.9824
Epoch [20/50], Train Loss: 0.6304, Train R2: 0.9874, Val Loss: 0.6195, Val R2: 0.9876
Epoch [30/50], Train Loss: 0.5921, Train R2: 0.9882, Val Loss: 0.5578, Val R2: 0.9888
Epoch [40/50], Train Loss: 0.5677, Train R2: 0.9887, Val Loss: 0.5718, Val R2: 0.9885
Epoch [50/50], Train Loss: 0.5541, Train R2: 0.9889,

KeyboardInterrupt: 

In [None]:
print("\nTuning ResNet model...")
resnet_best_params, resnet_best_loss, resnet_best_model = tune_hyperparameters(ResNet, resnet_param_grid)
print(f"Best ResNet parameters: {resnet_best_params}")
print(f"Best ResNet validation loss: {resnet_best_loss}")

In [None]:


class ResidualBlock(nn.Module):
    def __init__(self, in_features):
        super(ResidualBlock, self).__init__()
        self.fc1 = nn.Linear(in_features, in_features)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features, in_features)

    def forward(self, x):
        residual = x
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x += residual
        return x

class ResNet(nn.Module):
    def __init__(self, input_size, num_blocks):
        super(ResNet, self).__init__()
        self.fc_in = nn.Linear(input_size, 64)
        self.blocks = nn.Sequential(*[ResidualBlock(64) for _ in range(num_blocks)])
        self.fc_out = nn.Linear(64, 1)

    def forward(self, x):
        x = self.fc_in(x)
        x = self.blocks(x)
        x = self.fc_out(x)
        return x

# Training function
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=100):
    model.to(device)
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, targets in val_loader:
                outputs = model(inputs)
                val_loss += criterion(outputs, targets).item()

        train_loss /= len(train_loader)
        val_loss /= len(val_loader)
        
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
    
    return val_loss

# Hyperparameter tuning
def tune_hyperparameters(model_class, param_grid):
    best_val_loss = float('inf')
    best_params = None
    best_model = None

    for params in product(*param_grid.values()):
        current_params = dict(zip(param_grid.keys(), params))
        print(f"Training with parameters: {current_params}")

        if model_class == MLP:
            model = model_class(input_size=X_train.shape[1], hidden_size=current_params['hidden_size'])
        else:  # ResNet
            model = model_class(input_size=X_train.shape[1], num_blocks=current_params['num_blocks'])

        optimizer = optim.Adam(model.parameters(), lr=current_params['lr'])
        criterion = nn.MSELoss()
        train_loader = DataLoader(train_dataset, batch_size=current_params['batch_size'], shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=current_params['batch_size'])

        val_loss = train_model(model, train_loader, val_loader, criterion, optimizer)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_params = current_params
            best_model = model.state_dict()

    return best_params, best_val_loss, best_model

# Define hyperparameter grids
mlp_param_grid = {
    'lr': [0.001, 0.01],
    'batch_size': [32, 64],
    'hidden_size': [32, 64]
}

resnet_param_grid = {
    'lr': [0.001, 0.01],
    'batch_size': [32, 64],
    'num_blocks': [2, 3]
}

# Tune and train models
print("Tuning MLP model...")
mlp_best_params, mlp_best_loss, mlp_best_model = tune_hyperparameters(MLP, mlp_param_grid)
print(f"Best MLP parameters: {mlp_best_params}")
print(f"Best MLP validation loss: {mlp_best_loss}")

print("\nTuning ResNet model...")
resnet_best_params, resnet_best_loss, resnet_best_model = tune_hyperparameters(ResNet, resnet_param_grid)
print(f"Best ResNet parameters: {resnet_best_params}")
print(f"Best ResNet validation loss: {resnet_best_loss}")

# Save best models
torch.save(mlp_best_model, 'best_mlp_model.pth')
torch.save(resnet_best_model, 'best_resnet_model.pth')