# Pre Processing
**In this file, we will analyze the initial dataset consisting of CSV files in order to create the final dataset. The final dataset, composed of the relevant columns, will be used to train the predictive models**

Imported libraries

In [1]:
import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
import numpy as np
from pyspark.sql.functions import udf
from pyspark.sql.types import BinaryType
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

Creating the pyspark session

In [2]:
# Create the session
conf = SparkConf(). \
    set('spark.ui.port', "4050"). \
    set('spark.executor.memory', '15G'). \
    set('spark.driver.memory', '50G'). \
    set('spark.driver.maxResultSize', '40G'). \
    setAppName("PySparkProject"). \
    set('spark.executor.cores', "10"). \
    setMaster("local[*]")

sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()

sc._conf.getAll()

[('spark.executor.memory', '15G'),
 ('spark.executor.id', 'driver'),
 ('spark.driver.memory', '50G'),
 ('spark.driver.port', '51067'),
 ('spark.app.startTime', '1683728276005'),
 ('spark.executor.cores', '10'),
 ('spark.app.submitTime', '1683728275813'),
 ('spark.app.name', 'PySparkProject'),
 ('spark.driver.extraJavaOptions',
  '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.b

**Work on dataset begins**

Loading dataframes

In [3]:
# load the cvs files into dataframes
path = "C:/Users/marco/Desktop"

try:
    df_appearances = spark.read.csv("archive/appearances.csv", header=True, inferSchema=True)
    df_club_games = spark.read.csv("archive/club_games.csv", header=True, inferSchema=True)
    df_clubs = spark.read.csv("archive/clubs.csv", header=True, inferSchema=True)
    df_competitions = spark.read.csv("archive/competitions.csv", header=True, inferSchema=True)
    df_game_events = spark.read.csv("archive/game_events.csv", header=True, inferSchema=True)
    df_games = spark.read.csv("archive/games.csv", header=True, inferSchema=True)
    df_player_valuations = spark.read.csv("archive/player_valuations.csv", header=True, inferSchema=True)
    df_players = spark.read.csv("archive/players.csv", header=True, inferSchema=True)
except Exception:
    df_appearances = spark.read.csv(path+"/archive/appearances.csv", header=True, inferSchema=True)
    df_club_games = spark.read.csv(path+"/archive/club_games.csv", header=True, inferSchema=True)
    df_clubs = spark.read.csv(path+"/archive/clubs.csv", header=True, inferSchema=True)
    df_competitions = spark.read.csv(path+"/archive/competitions.csv", header=True, inferSchema=True)
    df_game_events = spark.read.csv(path+"/archive/game_events.csv", header=True, inferSchema=True)
    df_games = spark.read.csv(path+"/archive/games.csv", header=True, inferSchema=True)
    df_player_valuations = spark.read.csv(path+"/archive/player_valuations.csv", header=True, inferSchema=True)
    df_players = spark.read.csv(path+"/archive/players.csv", header=True, inferSchema=True)

In [4]:
df_games_join = df_games.join(df_club_games, df_games.game_id == df_club_games.game_id, how='left')
df_appearances_c = df_appearances.withColumnRenamed("date", "datec")
df_appearances_join = df_games.join(df_appearances_c, df_games.game_id == df_appearances_c.game_id, how='left')

In [5]:
#print(df_games_join.filter((df_games_join.club_id == 31) & (df_games_join.date.between('2020-12-23', '2021-12-23'))).count())
#df_games_join.filter((df_games_join.club_id == 31) & (df_games_join.date.between('2020-12-23', '2021-12-23'))).show()

In [6]:
#print(df_appearances_join.filter((df_appearances_join.player_id == 148455) & (df_appearances_join.datec.between('2020-12-23', '2021-12-23'))).count())
#df_appearances_join.filter((df_appearances_join.player_id == 148455) & (df_appearances_join.datec.between('2020-12-23', '2021-12-23'))).show()

In [7]:
# CELLA DA CANCELLARE --> CI SONO DEI DUPLICATI NELLA TABELLA PRESENZE --> NON RISOLVE IL PROBLEMA DI SALAH...  ;(
df_appearances.printSchema()
#conta righe df_appearances
#print(df_appearances.count())
# droppiamo duplicati in base a date e player_id
df_appearances = df_appearances.dropDuplicates(['date', 'player_id'])
#conta righe df_appearances
#print(df_appearances.count())

root
 |-- appearance_id: string (nullable = true)
 |-- game_id: integer (nullable = true)
 |-- player_id: integer (nullable = true)
 |-- player_club_id: integer (nullable = true)
 |-- player_current_club_id: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- player_name: string (nullable = true)
 |-- competition_id: string (nullable = true)
 |-- yellow_cards: integer (nullable = true)
 |-- red_cards: integer (nullable = true)
 |-- goals: integer (nullable = true)
 |-- assists: integer (nullable = true)
 |-- minutes_played: integer (nullable = true)



In [8]:
# drop duplicates df_club_games
#df_club_games.printSchema()
#print(df_club_games.count())
df_club_games = df_club_games.dropDuplicates(['club_id', 'own_goals', 'own_position', 'own_manager_name', 'opponent_id', 'opponent_goals', 'opponent_position', 'opponent_manager_name', 'hosting', 'is_win'])
#print(df_club_games.count())

In [9]:
# drop duplicates df_games
#df_games.printSchema()
#print(df_games.count())
df_games = df_games.dropDuplicates(['home_club_id', 'away_club_id', 'date'])
#print(df_games.count())

In [10]:
# join players and appearances
df_players_appearances = df_players.join(df_appearances, ["player_id"], how='inner')

In [11]:
 # join players_appearances and club_games to extract information about the games played by the player
#rinomina player_club_id in club_id
df_players_appearances = df_players_appearances.withColumnRenamed("player_club_id", "club_id")
df_players_appearances = df_players_appearances.join(df_club_games, ["game_id", "club_id"], how='inner')

In [12]:
# drop useless and duplicated features from df_players_appearances
df_players_appearances = df_players_appearances.drop("current_club_id", "appearance_id",
                                                        "highest_market_value_in_eur", "current_club_name",
                                                        "city_of_birth", "market_value_in_eur",
                                                        "contract_expiration_date", "agent_name",
                                                        "current_club_domestic_competition_id", "image_url",
                                                        "last_season", "url", "player_current_club_id",
                                                        "first_name", "last_name", "player_name", "player_code")

In [13]:
# drop useless and duplicated features from df_players_valuations
df_player_valuations = df_player_valuations.drop("datetime", "dateweek")

In [14]:
# rename the column date of df_players_valuations in date_v to avoid confusion with the date of df_players_appearances
df_player_valuations = df_player_valuations.withColumnRenamed("date", "date_v")

In [15]:
# Join the two dataframes on player_id
df = df_player_valuations.join(df_players_appearances, "player_id")

In [16]:
# TODO decide if we want to keep the players with no appearances, in case we have to do a union with valuations
# adding before the zeroed column of df_players_appearances

In [17]:
# we want to keep only the rows where the appearence date is within 1 year from the valuation date
df = df.filter(
    (year(df.date_v) == year(df.date) + 1) & (month(df.date_v) < month(df.date)) |
    (year(df.date_v) == year(df.date)) & (month(df.date_v) > month(df.date)) |
    (year(df.date_v) == year(df.date)) & (month(df.date_v) == month(df.date)) & (dayofmonth(df.date_v) > dayofmonth(df.date)) |
    (year(df.date_v) == year(df.date) + 1) & (month(df.date_v) == month(df.date)) & (dayofmonth(df.date_v) < dayofmonth(df.date))
).dropDuplicates(["player_id", "date", "date_v"])

In [18]:
#df.filter(df.player_id == 148455).show(100)

We add the statistics of the teams in which the player played during the year and in which the player took the field (appearance)

In [19]:
# add the is_draw to count the draws
df = df.withColumn("is_draw", when(df.own_goals == df.opponent_goals, 1).otherwise(0))

# add column games_played that counts the instances with key (player_id, date_v)
df = df.withColumn("appearances", count(df.date_v).over(Window.partitionBy("date_v", "player_id")))

# add column games_won that counts the instances with key (player_id, date_v) and is_win = 1
df = df.withColumn("games_won_pl",
                                count(when(df.own_goals > df.opponent_goals, 1)).over(Window.partitionBy("date_v", "player_id")))

# add column games_draw_pl that counts the instances with key (player_id, date_v) and is_draw = 1
df = df.withColumn("games_draw_pl",
                                count(when(df.is_draw == 1, 1)).over(Window.partitionBy("date_v", "player_id")))

df = df.withColumn("games_lost_pl", count(when(df.own_goals < df.opponent_goals, 1)).over(Window.partitionBy("date_v", "player_id")))

In [20]:
#df.filter(df.player_id == 148455).sort(col("date_v").desc()).show()

In [21]:
# delete some columns
df = df.drop("game_id", "own_goals", "own_position", "own_manager_name", "opponent_id",
                        "opponent_goals", "opponent_position", "opponent_manager_name", "hosting", "is_win",
                        "is_draw")

# for each (player_id, date_v) add a column with the result of (games_won_pl*3 + games_draw_pl)/games_played
df = df.withColumn("winning_rate_pl",
                                    (df.games_won_pl * 3 + df.games_draw_pl) / df.appearances)

In [22]:
df.printSchema()

root
 |-- player_id: integer (nullable = true)
 |-- date_v: date (nullable = true)
 |-- current_club_id: integer (nullable = true)
 |-- market_value_in_eur: integer (nullable = true)
 |-- player_club_domestic_competition_id: string (nullable = true)
 |-- club_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- country_of_citizenship: string (nullable = true)
 |-- country_of_birth: string (nullable = true)
 |-- date_of_birth: date (nullable = true)
 |-- position: string (nullable = true)
 |-- sub_position: string (nullable = true)
 |-- foot: string (nullable = true)
 |-- height_in_cm: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- competition_id: string (nullable = true)
 |-- yellow_cards: integer (nullable = true)
 |-- red_cards: integer (nullable = true)
 |-- goals: integer (nullable = true)
 |-- assists: integer (nullable = true)
 |-- minutes_played: integer (nullable = true)
 |-- appearances: long (nullable = false)
 |-- games_won_pl: long (nulla

In [23]:
print(df.count())
df = df.dropDuplicates(["date", "player_id", "date_v"])
print(df.count())

2665805
2665805


In [24]:
# Group by the player_id and the valuation date and extract all the important features
df = df.groupBy(
    "player_id", "name", col("market_value_in_eur").alias("market_value"), "date_v",
    col("current_club_id").alias("current_club_id"), col("height_in_cm").alias("height"),
    col("country_of_citizenship").alias("citizenship"), col("date_of_birth").alias("date_birth"), "position",
    "sub_position", "appearances", "games_won_pl", "games_draw_pl", "games_lost_pl", "winning_rate_pl") \
    .agg(collect_set("competition_id").alias("competition_id"),
            collect_set("club_id").alias("club_id"),
            sum("assists").alias("assists"),
            count("date_v").alias("appearances2"),
            sum("goals").alias("goals"),
            sum("minutes_played").alias("minutes_played"),
            sum("red_cards").alias("red_cards"),
            sum("yellow_cards").alias("yellow_cards"))

#df.filter(df.player_id == 148455).show()

In [25]:
#add last valuation in temporal terms
df = df.withColumn("last_valuation", lag(df.market_value).over(Window.partitionBy("player_id").orderBy("date_v")))

In [26]:
df.filter(df.date_v == '2021-12-23').filter(df.player_id == 148455).show()

+---------+-------------+------------+----------+---------------+------+-----------+----------+--------+------------+-----------+------------+-------------+-------------+------------------+--------------+-------+-------+------------+-----+--------------+---------+------------+--------------+
|player_id|         name|market_value|    date_v|current_club_id|height|citizenship|date_birth|position|sub_position|appearances|games_won_pl|games_draw_pl|games_lost_pl|   winning_rate_pl|competition_id|club_id|assists|appearances2|goals|minutes_played|red_cards|yellow_cards|last_valuation|
+---------+-------------+------------+----------+---------------+------+-----------+----------+--------+------------+-----------+------------+-------------+-------------+------------------+--------------+-------+-------+------------+-----+--------------+---------+------------+--------------+
|   148455|Mohamed Salah|   100000000|2021-12-23|             31|   175|      Egypt|1992-06-15|  Attack|Right Winger|    

We add the statistics of the teams in which the player played during the year

In [27]:
# join club_games and games to extract the date from games
df_club_games_join = df_club_games.join(df_games.select("game_id", "date"), "game_id", how='inner')

In [28]:
# Expands the player_club_id list into separate columns
df_result_expanded = df.withColumn("club_id", explode(col("club_id")))

In [29]:
# Peform join based on club_id and apply condition on date
df = df_result_expanded.join(df_club_games_join, ["club_id"]) \
    .where(expr("date <= date_v AND date >= date_v - INTERVAL 1 YEAR"))

In [30]:
# Select columns
df = df.select(df.columns + df_club_games_join.columns)

In [31]:
# add column draw that is equal to 1 if own_goals = opponent_goals
df = df.withColumn("is_draw", when(df.own_goals == df.opponent_goals, 1).otherwise(0))

#add column games_played_club that counts the instances with key (player_id, date_v)
df = df.withColumn("games_played_club", count(df.date_v).over(Window.partitionBy("date_v", "player_id")))

# add column games_won_club that counts the instances with key (player_id, date_v) and is_win = 1
df = df.withColumn("games_won_club", count(when(df.own_goals > df.opponent_goals, 1)).over(Window.partitionBy("date_v", "player_id")))

# add column games_draw_club that counts the instances with key (player_id, date_v) and is_draw = 1
df = df.withColumn("games_draw_club", count(when(df.is_draw == 1, 1)).over(Window.partitionBy("date_v", "player_id")))

# add column games_lost_club that counts the instances with key (player_id, date_v) and is_win = 0 and is_draw = 0
df = df.withColumn("games_lost_club", count(when(df.own_goals < df.opponent_goals, 1)).over(Window.partitionBy("date_v", "player_id")))

In [32]:
#df.filter(df.player_id == 148455).show(1000)

In [33]:
#delete some columns
df = df.drop("game_id", "own_goals",  "own_position", "own_manager_name", "opponent_id", "opponent_goals", "opponent_position", "opponent_manager_name", "hosting", "is_win", "date", "is_draw")

In [34]:
#df = df.dropDuplicates(["player_id", "date_v"])
df = df.dropDuplicates()

In [35]:
# for each (player_id, date_v) add a column with the result of (games_won_club*3 + games_draw_club)/games_played_club
df = df.withColumn("winning_rate_club", (df.games_won_club*3 + df.games_draw_club)/df.games_played_club)

In [36]:
#df.filter(df.player_id == 148455).show(50)

In [37]:
#df.filter(df.player_id == 148455).show()

In [38]:
#arrange columns, delete and rename some columns, aggregate club_id
df = df.groupBy(
    "player_id", "date_v", "market_value", "name", "date_birth", "current_club_id", "height", "citizenship", "position", "sub_position", col("competition_id").alias("competitions_id"),
    "assists", "goals", "minutes_played", "red_cards", "yellow_cards", "last_valuation", "appearances", "games_won_pl", "games_draw_pl", "games_lost_pl", "winning_rate_pl",
    "games_played_club", "games_won_club", "games_draw_club", "games_lost_club", "winning_rate_club") \
    .agg(collect_set("club_id").alias("clubs_id"))

In [39]:
# add column age that is equal to date_v - date_birth cifra intera
df = df.withColumn("age", floor(datediff(df.date_v, df.date_birth)/365))

In [40]:
# take only the first digit after the comma of winning_rate_pl and winning_rate_club
df = df.withColumn("winning_rate_pl", round(df.winning_rate_pl, 1))
df = df.withColumn("winning_rate_club", round(df.winning_rate_club, 1))

In [41]:
df = df.select("player_id", "date_v", "market_value", "name", "date_birth", "age", "current_club_id", "height", "citizenship", "position", "sub_position", "competitions_id", "clubs_id",
    "assists", "goals", "minutes_played", "red_cards", "yellow_cards", "last_valuation", "appearances", "games_won_pl", "games_draw_pl", "games_lost_pl", "winning_rate_pl",
    "games_played_club", "games_won_club", "games_draw_club", "games_lost_club", "winning_rate_club")

In [42]:
#df.filter(df.player_id == 148455).show(50)

In [43]:
# TODO possiamo pensare di eliminare games_played_club, games_won_club, games_draw_club, games_lost_club e lasciare solo winning_rate_club

In [44]:
import pandas as pd
df_pandas = df.toPandas()

We transform the column competitions id and clubs id, that are array of strings in a single binary value using label binarization.

In [46]:
# apply pd.get_dummies to the column of arrays
dummies = pd.get_dummies(df_pandas["competitions_id"].apply(pd.Series).stack()).sum(level=0)
# concatenate the dummy variables into a single string
dummies["comp_string"] = dummies.apply(lambda x: "".join(x.astype(str)), axis=1)
# join the dummies dataframe with the original dataframe
df_pandas = df_pandas.join(dummies["comp_string"])
df_pandas = df_pandas.drop("competitions_id")


  dummies = pd.get_dummies(df_pandas["competitions_id"].apply(pd.Series).stack()).sum(level=0)


In [None]:
# apply pd.get_dummies to the column of arrays
dummies = pd.get_dummies(df_pandas["clubs_id"].apply(pd.Series).stack()).sum(level=0)
# concatenate the dummy variables into a single string
dummies["club_str"] = dummies.apply(lambda x: "".join(x.astype(str)), axis=1)
# join the dummies dataframe with the original dataframe
df_pandas = df_pandas.join(dummies["club_str"])
df_pandas = df_pandas.drop("club_str")

In [None]:
df_pandas.info()

In [None]:
#write the dataframe in csv format
df_pandas.to_csv('data.csv', sep=',', encoding='utf-8', index=False)

In [49]:
#from autoviz.AutoViz_Class import AutoViz_Class
#AV = AutoViz_Class()
#df = AV.AutoViz('data.csv')