# Feature Extraction
**In this file, we analyze the initial dataset, which consists of CSV files, in order to extract the relevant information and structure it for the valuation prediction task.**

Imported libraries

In [1]:
import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

Creating the pyspark session

In [2]:
# Describe the Spark application: local master, UI port and memory/core settings.
conf = (
    SparkConf()
    .set('spark.ui.port', "4050")
    .set('spark.executor.memory', '15G')
    .set('spark.driver.memory', '50G')
    .set('spark.driver.maxResultSize', '40G')
    .setAppName("PySparkProject")
    .set('spark.executor.cores', "10")
    .setMaster("local[*]")
)

# Reuse an already running context if there is one, otherwise start it with `conf`.
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()

# Display the effective configuration as the cell output.
sc._conf.getAll()

[('spark.executor.memory', '15G'),
 ('spark.driver.host', 'laptop-jllvbepm.tailaca27.ts.net.'),
 ('spark.executor.id', 'driver'),
 ('spark.driver.memory', '50G'),
 ('spark.app.startTime', '1686341582676'),
 ('spark.executor.cores', '10'),
 ('spark.app.name', 'PySparkProject'),
 ('spark.driver.extraJavaOptions',
  '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.cal

**Work on dataset begins**

Loading dataframes

In [3]:
# Load the CSV files into DataFrames.
def _read_csv(path):
    """Read one CSV file that has a header row, letting Spark infer the column types."""
    return spark.read.csv(path, header=True, inferSchema=True)


df_appearances = _read_csv("../archive/appearances.csv")
df_club_games = _read_csv("../archive/club_games.csv")
df_clubs = _read_csv("../archive/clubs.csv")
df_competitions = _read_csv("../archive/competitions.csv")
df_game_events = _read_csv("../archive/game_events.csv")
df_games = _read_csv("../archive/games.csv")
df_player_valuations = _read_csv("../archive/player_valuations.csv")
df_players = _read_csv("../archive/players.csv")

In [4]:
# Remove duplicated rows from df_club_games, comparing only the descriptive columns below.
# NOTE(review): game_id is not part of this key, so two different games with identical
# clubs, scores, positions, managers, venue and outcome collapse into a single row.
# Confirm this is intended; otherwise the key should include game_id.
club_game_key = [
    'club_id', 'own_goals', 'own_position', 'own_manager_name',
    'opponent_id', 'opponent_goals', 'opponent_position', 'opponent_manager_name',
    'hosting', 'is_win',
]
df_club_games = df_club_games.dropDuplicates(club_game_key)

In [5]:
# Keep a single row per (home club, away club, date) in df_games.
game_key = ['home_club_id', 'away_club_id', 'date']
df_games = df_games.dropDuplicates(game_key)

In [6]:
# Keep only the appearances whose player_club_id exists among the club ids of df_clubs,
# because df_appearances contains some player_club_id values that match no club.
# A left-semi join only filters df_appearances: it returns the appearances columns alone,
# so it cannot produce ambiguous column names and never repeats an appearance even if a
# club id were listed more than once in df_clubs.
df_appearances = df_appearances.join(
    df_clubs,
    df_appearances.player_club_id == df_clubs.club_id,
    'left_semi',
)

In [7]:
# Attach the player attributes to every appearance (inner join on player_id).
df_players_appearances = df_players.join(df_appearances, on="player_id", how="inner")

In [8]:
 # Join players_appearances with club_games to get the result of each game the player appeared in.
# player_club_id is renamed to club_id first so that both frames share the join key names.
df_players_appearances = (
    df_players_appearances
    .withColumnRenamed("player_club_id", "club_id")
    .join(df_club_games, on=["game_id", "club_id"], how="inner")
)

In [9]:
# Columns of df_players_appearances that are not used as features or that duplicate
# information available elsewhere.
unused_player_columns = [
    "current_club_id", "appearance_id",
    "highest_market_value_in_eur", "current_club_name",
    "city_of_birth", "market_value_in_eur",
    "contract_expiration_date", "agent_name",
    "current_club_domestic_competition_id", "image_url",
    "last_season", "url", "player_current_club_id",
    "first_name", "last_name", "player_name", "player_code",
]
df_players_appearances = df_players_appearances.drop(*unused_player_columns)

In [10]:
# Remove the redundant date representations from df_player_valuations.
redundant_date_columns = ["datetime", "dateweek"]
df_player_valuations = df_player_valuations.drop(*redundant_date_columns)

In [11]:
# The valuation date becomes date_v so that it cannot be confused with the appearance
# date ("date") once the two frames are joined.
df_player_valuations = df_player_valuations.withColumnRenamed(existing="date", new="date_v")

In [12]:
# Pair every valuation of a player with every appearance of the same player.
df = df_player_valuations.join(df_players_appearances, on="player_id", how="inner")

In [13]:
# Keep only the appearances that fall strictly inside the year preceding the valuation:
#     date_v - 1 year  <  date  <  date_v
# The range is written as a single date comparison instead of a year/month/day case
# analysis: it selects the same rows for date-valued columns, does not depend on the
# relative precedence of & and |, and mirrors the interval arithmetic used for the
# club statistics later in the notebook.
df = df.filter(
    expr("date < date_v AND date > date_v - INTERVAL 1 YEAR")
).dropDuplicates(["player_id", "date", "date_v"])

We add the statistics of the teams the player played for during the year, restricted to the games in which the player took the field (appearances): number of appearances, games won, games drawn and games lost with the player present.

In [14]:
# All the counters below are computed over the rows sharing the same (date_v, player_id),
# i.e. over the appearances attached to one valuation of one player.
valuation_window = Window.partitionBy("date_v", "player_id")

df = (
    df
    # flag the drawn games
    .withColumn("is_draw", when(col("own_goals") == col("opponent_goals"), 1).otherwise(0))
    # appearances: number of rows of the (player_id, date_v) pair
    .withColumn("appearances", count(col("date_v")).over(valuation_window))
    # games_won_pl: rows where the player's club scored more than the opponent
    .withColumn(
        "games_won_pl",
        count(when(col("own_goals") > col("opponent_goals"), 1)).over(valuation_window),
    )
    # games_draw_pl: rows flagged as a draw
    .withColumn("games_draw_pl", count(when(col("is_draw") == 1, 1)).over(valuation_window))
    # games_lost_pl: rows where the player's club scored less than the opponent
    .withColumn(
        "games_lost_pl",
        count(when(col("own_goals") < col("opponent_goals"), 1)).over(valuation_window),
    )
)

We add the feature winning rate of the player. The winning rate is calculated by the formula: (games_won_player*3 + games_draw_player) / games_played_player

In [15]:
# The per-game columns are no longer needed once the counters have been computed.
game_level_columns = [
    "game_id", "own_goals", "own_position", "own_manager_name", "opponent_id",
    "opponent_goals", "opponent_position", "opponent_manager_name", "hosting", "is_win",
    "is_draw",
]
df = df.drop(*game_level_columns)

# winning_rate_pl = (games_won_pl * 3 + games_draw_pl) / appearances for each (player_id, date_v)
points_pl = col("games_won_pl") * 3 + col("games_draw_pl")
df = df.withColumn("winning_rate_pl", points_pl / col("appearances"))

In [16]:
# Collapse the appearances into one row per player and valuation date.
# Grouping keys: the player/valuation attributes (renamed where useful) and the counters
# that are already constant within a (player_id, date_v) pair.
player_group_keys = [
    "player_id", "name", col("market_value_in_eur").alias("market_value"), "date_v",
    col("current_club_id").alias("current_club_id"), col("height_in_cm").alias("height"),
    col("country_of_citizenship").alias("citizenship"), col("date_of_birth").alias("date_birth"),
    "position", "sub_position", "appearances", "games_won_pl", "games_draw_pl",
    "games_lost_pl", "winning_rate_pl",
]

# Aggregates: the distinct competitions and clubs, and the summed match statistics.
player_aggregates = [
    collect_set("competition_id").alias("competition_id"),
    collect_set("club_id").alias("club_id"),
    sum("assists").alias("assists"),
    count("date_v").alias("appearances2"),
    sum("goals").alias("goals"),
    sum("minutes_played").alias("minutes_played"),
    sum("red_cards").alias("red_cards"),
    sum("yellow_cards").alias("yellow_cards"),
]

df = df.groupBy(*player_group_keys).agg(*player_aggregates)

In [17]:
# last_valuation: market value of the previous row of the same player when the rows are
# ordered by valuation date (null for the first row of each player).
player_timeline = Window.partitionBy("player_id").orderBy("date_v")
df = df.withColumn("last_valuation", lag(col("market_value")).over(player_timeline))

We add the statistics of the teams in which the player played during the year

In [18]:
# Add the game date to club_games: df_games contributes only game_id and date.
game_dates = df_games.select("game_id", "date")
df_club_games_join = df_club_games.join(game_dates, on="game_id", how="inner")

In [19]:
# Turn the club_id array into one row per club, keeping every other column as is.
df_result_expanded = df.withColumn("club_id", explode("club_id"))

In [20]:
# Attach to every (player, valuation, club) row the games of that club, then keep the
# games played in the year ending on the valuation date (both bounds included).
club_games_per_player = df_result_expanded.join(df_club_games_join, on=["club_id"])
df = club_games_per_player.filter("date <= date_v AND date >= date_v - INTERVAL 1 YEAR")

In [21]:
# Select the columns of the joined frame together with the club_games columns.
# The joined frame already contains every club_games column, so concatenating the two
# name lists as they are would select those columns twice and leave duplicated column
# names in the result. dict.fromkeys keeps the first occurrence of each name in order.
selected_columns = list(dict.fromkeys(df.columns + df_club_games_join.columns))
df = df.select(selected_columns)

We add the statistics of the teams the player played for during the year, counting every game of those teams whether or not the player took the field: games played, games won, games drawn and games lost.

In [22]:
# All the counters below are computed over the rows sharing the same (date_v, player_id),
# i.e. over the club games attached to one valuation of one player.
club_window = Window.partitionBy("date_v", "player_id")

df = (
    df
    # is_draw is 1 when own_goals equals opponent_goals, 0 otherwise
    .withColumn("is_draw", when(col("own_goals") == col("opponent_goals"), 1).otherwise(0))
    # games_played_club: number of rows of the (player_id, date_v) pair
    .withColumn("games_played_club", count(col("date_v")).over(club_window))
    # games_won_club: rows where the club scored more than the opponent
    .withColumn(
        "games_won_club",
        count(when(col("own_goals") > col("opponent_goals"), 1)).over(club_window),
    )
    # games_draw_club: rows flagged as a draw
    .withColumn("games_draw_club", count(when(col("is_draw") == 1, 1)).over(club_window))
    # games_lost_club: rows where the club scored less than the opponent
    .withColumn(
        "games_lost_club",
        count(when(col("own_goals") < col("opponent_goals"), 1)).over(club_window),
    )
)

In [23]:
# The per-game columns are no longer needed once the club counters have been computed.
club_game_columns = [
    "game_id", "own_goals", "own_position", "own_manager_name",
    "opponent_id", "opponent_goals", "opponent_position", "opponent_manager_name",
    "hosting", "is_win", "date", "is_draw",
]
df = df.drop(*club_game_columns)

In [24]:
# Remove the rows that are identical on every column.
df = df.distinct()

We add the feature winning rate of the club. The winning rate is calculated by the formula: (games_won_club*3 + games_draw_club) / games_played_club

In [25]:
# winning_rate_club = (games_won_club * 3 + games_draw_club) / games_played_club
# for each (player_id, date_v)
points_club = col("games_won_club") * 3 + col("games_draw_club")
df = df.withColumn("winning_rate_club", points_club / col("games_played_club"))

In [26]:
# Go back to one row per (player_id, date_v): every feature becomes a grouping key
# (competition_id is renamed to competitions_id), columns not listed here are discarded,
# and the club ids of the group are gathered into the clubs_id set.
feature_keys = [
    "player_id", "date_v", "market_value", "name", "date_birth", "current_club_id",
    "height", "citizenship", "position", "sub_position",
    col("competition_id").alias("competitions_id"),
    "assists", "goals", "minutes_played", "red_cards", "yellow_cards", "last_valuation",
    "appearances", "games_won_pl", "games_draw_pl", "games_lost_pl", "winning_rate_pl",
    "games_played_club", "games_won_club", "games_draw_club", "games_lost_club",
    "winning_rate_club",
]
df = df.groupBy(*feature_keys).agg(collect_set("club_id").alias("clubs_id"))

In [27]:
# age: number of completed years between date_birth and date_v (integer value).
# The age is derived from the number of calendar months between the two dates rather
# than from a day count divided by 365: the day count includes leap days, so dividing by
# 365 reports one year too many in the days just before a birthday.
# A null date_birth yields a null age.
df = df.withColumn("age", floor(months_between(df.date_v, df.date_birth) / 12))

In [28]:
# Round both winning rates to one decimal place.
for rate_column in ("winning_rate_pl", "winning_rate_club"):
    df = df.withColumn(rate_column, round(col(rate_column), 1))

In [29]:
# Final column order of the feature table.
final_columns = [
    # identification and target
    "player_id", "date_v", "market_value", "name",
    # player attributes
    "date_birth", "age", "current_club_id", "height", "citizenship", "position",
    "sub_position", "competitions_id", "clubs_id",
    # statistics of the games the player appeared in
    "assists", "goals", "minutes_played", "red_cards", "yellow_cards", "last_valuation",
    "appearances", "games_won_pl", "games_draw_pl", "games_lost_pl", "winning_rate_pl",
    # statistics of the clubs
    "games_played_club", "games_won_club", "games_draw_club", "games_lost_club",
    "winning_rate_club",
]
df = df.select(*final_columns)

This is an example of the dataframe (filtered with Mo Salah) after the end of features extraction.
**The stats of the (player_id, date_v) pair are for the year preceding date_v**

In [None]:
# Preview up to 50 rows of the example player (player_id 148455).
df.where(col("player_id") == 148455).show(50)

Now we will handle the cases in which the feature values are null or equal to 0.
The features that we have to manage are:
- last valuation (13,45 % null --> from null to 0)
- sub position (8,19 % null --> position)
- age (0,04 % null --> delete examples)
- date_birth (0,04 % null --> delete examples)
- height (some values are 0 --> average height)

In [30]:
# A missing last_valuation is replaced with 0.
df = df.na.fill({'last_valuation': 0})

In [31]:
# A missing sub_position falls back to the value of the position column.
df = df.withColumn("sub_position", coalesce("sub_position", "position"))

In [32]:
# Discard the rows where age or date_birth is null.
required_columns = ('age', 'date_birth')
df = df.na.drop(subset=required_columns)

In [None]:
# Rows whose height is different from 0.
filtered_df = df.where(col("height") != 0)

# Mean height computed on those rows only.
average_height = filtered_df.agg(avg("height").alias("height_average")).first()["height_average"]

# A height equal to 0 is replaced with that mean; every other value is left untouched.
is_zero_height = col("height") == 0
df = df.withColumn("height", when(is_zero_height, average_height).otherwise(col("height")))

In [35]:
# Cast height, last_valuation and age to integer.
for int_column in ("height", "last_valuation", "age"):
    df = df.withColumn(int_column, col(int_column).cast("int"))

In [36]:
# Cast competitions_id and clubs_id to string.
for id_list_column in ("competitions_id", "clubs_id"):
    df = df.withColumn(id_list_column, col(id_list_column).cast("string"))

In [None]:
# Preview up to 50 rows of the example player (player_id 148455) after the clean-up.
df.where(col("player_id") == 148455).show(50)

In [37]:
import pandas as pd

# Convert the Spark DataFrame to a pandas DataFrame and write it to a single
# comma-separated, UTF-8 encoded CSV file without the index column.
df_pandas = df.toPandas()
df_pandas.to_csv(path_or_buf='dataset.csv', index=False, encoding='utf-8', sep=',')