# Task 2
**Clustering**

In task 2 a new dataset will be introduced. The new dataset will be partially merged with the one obtained in feature engineering and preprocessing. The goal will be to create clusters made up of players with similar characteristics. The final partition will be used to scout young talents.

Imported libraries

In [1]:
import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
import numpy as np
from pyspark.sql.functions import udf
from pyspark.sql.types import BinaryType
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

Creating the pyspark session

In [2]:
# Build the Spark configuration for a single-machine (local) run
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("PySparkProject")
    .set('spark.ui.port', "4050")
    .set('spark.executor.memory', '15G')
    .set('spark.executor.cores', "10")
    .set('spark.driver.memory', '50G')
    .set('spark.driver.maxResultSize', '40G')
)

# Reuse an already running context if there is one, otherwise start it with `conf`
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()

# Show the configuration the context is actually using
sc._conf.getAll()

[('spark.executor.memory', '15G'),
 ('spark.driver.host', 'LAPTOP-JLLVBEPM'),
 ('spark.executor.id', 'driver'),
 ('spark.driver.memory', '50G'),
 ('spark.executor.cores', '10'),
 ('spark.driver.port', '55716'),
 ('spark.app.name', 'PySparkProject'),
 ('spark.driver.extraJavaOptions',
  '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-open

**Work on dataset begins**

In [3]:
# Load the FIFA male-players CSV (all FIFA versions; filtered to FIFA 23 later on).
# The file location can be overridden with the MALE_PLAYERS_CSV environment
# variable; the original hard-coded path remains the default so existing
# setups keep working unchanged.
import os

male_players_csv = os.environ.get("MALE_PLAYERS_CSV", "C:/Users/marco/Desktop/male_players.csv")
df = spark.read.csv(male_players_csv, header=True, inferSchema=True)

In [4]:
# Load the CSV produced by the feature-engineering and encoding process
df_ = spark.read.options(header=True, inferSchema=True).csv("dataset.csv")

**Pre Processing Phase**

In [5]:
# Columns of df_ that are not needed for the clustering task
task_irrelevant_columns = [
    "clubs_id_binary", "competitions_id_binary", "sub_position_binary",
    "position_binary", "citizenship_encoded_binary", "current_club_id_binary",
    "winning_rate_club", "winning_rate_pl",
    "games_lost_pl", "games_draw_pl", "games_won_pl",
    "appearances", "yellow_cards", "red_cards", "minutes_played",
    "goals", "assists", "height", "age", "market_value",
]
df_ = df_.drop(*task_irrelevant_columns)

In [6]:
# Keep only the rows of df_ whose date_v falls in the year 2022
df_ = df_.where(year(col("date_v")) == "2022")

In [7]:
# Scale player_id back up by 100000 and store it as an integer.
# (Presumably the id was scaled down by 100000 upstream - confirm against the
# feature-engineering notebook.)
# Rounding before the cast matters: casting a double to int truncates, so a
# product such as 12344.999999999998 would silently become 12344 and the
# player would no longer match its row in players.csv.
df_ = df_.withColumn("player_id", round(col("player_id") * 100000).cast(IntegerType()))

In [8]:
# Reference date: for every player keep the valuation closest to this day
target_date = lit("2022-06-15").cast("date")

# Absolute distance in whole days between date_v and the reference date.
# datediff() returns a plain integer, which can be ordered on every Spark
# version (subtracting two dates yields an interval type instead).
df_with_diff = df_.withColumn("date_diff", abs(datediff(col("date_v"), target_date)))

# Rank each player's rows by distance. When two dates are equally distant
# (one before and one after the reference date) the more recent one wins, so
# exactly one row per player_id survives instead of both tied rows.
closest_first = Window.partitionBy("player_id").orderBy(col("date_diff").asc(), col("date_v").desc())

df_ = (
    df_with_diff
    .withColumn("closest_rank", row_number().over(closest_first))
    .where(col("closest_rank") == 1)
    # helper columns are only needed to pick the row
    .drop("date_diff", "closest_rank")
)

In [9]:
# Keep a single row for every (player_id, date_v) pair
df_ = df_.drop_duplicates(subset=["player_id", "date_v"])

In [10]:
# Discard players whose last_valuation is 0: they are probably not present
# in the fifa23players dataset
df_ = df_.where(col("last_valuation") != 0)

In [11]:
# Load players.csv, which carries the information needed for the merge
df_players = spark.read.options(header=True, inferSchema=True).csv("archive/players.csv")

In [12]:
# Inner join of the valuation rows with the player details on player_id
df_ = df_.join(df_players, on="player_id", how="inner")

In [13]:
# Columns that are not needed for the task
players_extra_columns = [
    "current_club_id", "current_club_name",
    "country_of_birth", "city_of_birth",
    "foot", "height_in_cm",
    "market_value_in_eur", "highest_market_value_in_eur",
    "agent_name", "contract_expiration_date",
    "current_club_domestic_competition_id",
    "player_code", "image_url", "last_season", "url",
]
df_ = df_.drop(*players_extra_columns)

In [14]:
# Keep only the FIFA 23 rows
df = df.where(col("fifa_version") == 23)

In [15]:
# Keep only the rows belonging to update 1
df = df.where(col("fifa_update") == 1)

In [16]:
# Keep a single row per FIFA player_id
df = df.drop_duplicates(subset=["player_id"])

In [17]:
# TODO: increase the weight of age and last_valuation; also increase the weight of overall, but not by too much.

In [18]:
# Reference listing (printSchema-style) of the FIFA `df` columns considered for removal.
# This cell only evaluates a string literal, so it does not drop anything by itself.
# NOTE(review): nationality_name is listed here but is absent from the drop
# string parsed in the following cell, and it is used later as a join key.
'''
 |-- ls: string (nullable = true)
 |-- st: string (nullable = true)
 |-- rs: string (nullable = true)
 |-- lw: string (nullable = true)
 |-- lf: string (nullable = true)
 |-- cf: string (nullable = true)
 |-- rf: string (nullable = true)
 |-- rw: string (nullable = true)
 |-- lam: string (nullable = true)
 |-- cam: string (nullable = true)
 |-- ram: string (nullable = true)
 |-- lm: string (nullable = true)
 |-- lcm: string (nullable = true)
 |-- cm: string (nullable = true)
 |-- rcm: string (nullable = true)
 |-- rm: string (nullable = true)
 |-- lwb: string (nullable = true)
 |-- ldm: string (nullable = true)
 |-- cdm: string (nullable = true)
 |-- rdm: string (nullable = true)
 |-- rwb: string (nullable = true)
 |-- lb: string (nullable = true)
 |-- lcb: string (nullable = true)
 |-- cb: string (nullable = true)
 |-- rcb: string (nullable = true)
 |-- rb: string (nullable = true)
 |-- gk: string (nullable = true)
 |-- player_face_url: string (nullable = true)
 |-- body_type: string (nullable = true)
 |-- real_face: string (nullable = true)
 |-- release_clause_eur: integer (nullable = true)
 |-- player_tags: string (nullable = true)
 |-- player_traits: string (nullable = true)
|-- international_reputation: integer (nullable = true)
 |-- club_jersey_number: integer (nullable = true)
 |-- club_loaned_from: string (nullable = true)
 |-- club_joined_date: date (nullable = true)
 |-- club_contract_valid_until_year: integer (nullable = true)
 |-- nationality_id: integer (nullable = true)
 |-- nationality_name: string (nullable = true)
 |-- nation_team_id: integer (nullable = true)
 |-- nation_position: string (nullable = true)
 |-- nation_jersey_number: integer (nullable = true)
 |-- league_id: integer (nullable = true)
 |-- league_name: string (nullable = true)
 |-- league_level: integer (nullable = true)
 |-- club_team_id: integer (nullable = true)
 |-- club_name: string (nullable = true)
 |-- club_position: string (nullable = true)
 |-- potential: integer (nullable = true)
 |-- value_eur: integer (nullable = true)
 |-- wage_eur: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- player_positions: string (nullable = true)
 |-- player_id: integer (nullable = true)
 |-- player_url: string (nullable = true)
 |-- fifa_version: integer (nullable = true)
 |-- fifa_update: integer (nullable = true)
 |-- fifa_update_date: date (nullable = true)
'''

'\n |-- ls: string (nullable = true)\n |-- st: string (nullable = true)\n |-- rs: string (nullable = true)\n |-- lw: string (nullable = true)\n |-- lf: string (nullable = true)\n |-- cf: string (nullable = true)\n |-- rf: string (nullable = true)\n |-- rw: string (nullable = true)\n |-- lam: string (nullable = true)\n |-- cam: string (nullable = true)\n |-- ram: string (nullable = true)\n |-- lm: string (nullable = true)\n |-- lcm: string (nullable = true)\n |-- cm: string (nullable = true)\n |-- rcm: string (nullable = true)\n |-- rm: string (nullable = true)\n |-- lwb: string (nullable = true)\n |-- ldm: string (nullable = true)\n |-- cdm: string (nullable = true)\n |-- rdm: string (nullable = true)\n |-- rwb: string (nullable = true)\n |-- lb: string (nullable = true)\n |-- lcb: string (nullable = true)\n |-- cb: string (nullable = true)\n |-- rcb: string (nullable = true)\n |-- rb: string (nullable = true)\n |-- gk: string (nullable = true)\n |-- player_face_url: string (nullable =

In [19]:
import re
# Names of the FIFA columns to remove from `df`: per-position ratings, URLs,
# club / league / national-team metadata, contract data and the FIFA version keys.
# They are spelled out as an explicit list instead of being regex-parsed from a
# pasted printSchema() dump held in a variable called `str`, which shadowed the
# built-in `str` for every later cell.
# nationality_name is deliberately not listed: it is used as a join key later.
# player_id and age are listed, so the FIFA versions of those columns are removed.
elements_to_drop = [
    # per-position ratings
    "ls", "st", "rs", "lw", "lf", "cf", "rf", "rw",
    "lam", "cam", "ram", "lm", "lcm", "cm", "rcm", "rm",
    "lwb", "ldm", "cdm", "rdm", "rwb",
    "lb", "lcb", "cb", "rcb", "rb", "gk",
    # descriptive / cosmetic attributes
    "player_face_url", "body_type", "real_face", "release_clause_eur",
    "player_tags", "player_traits", "international_reputation",
    # club contract information
    "club_jersey_number", "club_loaned_from", "club_joined_date",
    "club_contract_valid_until_year",
    # national team information
    "nationality_id", "nation_team_id", "nation_position", "nation_jersey_number",
    # league / club information
    "league_id", "league_name", "league_level",
    "club_team_id", "club_name", "club_position",
    # valuation and identity columns of the FIFA dataset
    "potential", "value_eur", "wage_eur", "age",
    "player_positions", "player_id", "player_url",
    "fifa_version", "fifa_update", "fifa_update_date",
]

In [20]:
# Remove every listed column in a single call (names that are absent are ignored)
df = df.drop(*elements_to_drop)

In [21]:
# Match FIFA players (df) with valuation records (df_): same date of birth
# (df.dob == df_.date_of_birth) and same nationality
# (df_.country_of_citizenship == df.nationality_name)
same_birth_date = df.dob == df_.date_of_birth
same_nationality = df_.country_of_citizenship == df.nationality_name
joined_df = df.join(df_, same_birth_date & same_nationality)

In [22]:
# count the number of distinct player_id values
#print(joined_df.select("player_id").distinct().count())
# count the number of rows
#print(joined_df.count())

In [23]:
# Number of joined rows per player_id
count_df = joined_df.groupBy("player_id").count()
# Players matched exactly once are kept as they are; the instances recovered
# from the ambiguous matches (filtered_df) are added to df later
df = joined_df.join(count_df, on="player_id").where(col("count") == 1).drop("count")
#print(df.select("player_id").distinct().count())

In [24]:
# Dataset made only of the instances whose player_id is not unique
# (i.e. the player_id appears more than once in joined_df)
count_df = joined_df.groupBy("player_id").count()
filtered_df = (
    joined_df
    .join(count_df, on="player_id")
    .where(col("count") > 1)
    .drop("count")
)

In [25]:
#filtered_df.count()

In [26]:
# Among the ambiguous matches, keep only the rows where long_name equals name
filtered_df = filtered_df.where(col("long_name") == col("name"))

In [27]:
#filtered_df.count()

In [28]:
# Append the rows of filtered_df (ambiguous matches resolved by name) to df.
# union() pairs columns by position, not by name; this is safe here because both
# frames are built from joined_df joined with count_df, with "count" dropped.
df = df.union(filtered_df)

In [29]:
# check if there are still duplicates
#print(df.count())
#print(df.select("player_id").distinct().count())

In [30]:
# Today's date as a "YYYY-MM-DD" string, used as the reference day for the age
import datetime
current_date = datetime.date.today().isoformat()

In [31]:
# Add the column age: completed years between dob and `current_date`.
# months_between() follows the calendar, so leap years are handled correctly;
# dividing a day count by 365 gains one day per leap year lived and could
# report a birthday a few days before it actually happens.
df = df.withColumn("age", floor(months_between(lit(current_date), col("dob")) / 12))

In [32]:
# Name and birth-date columns are no longer needed once the two datasets
# have been merged and the age has been computed
merge_helper_columns = [
    "first_name", "last_name", "date_of_birth", "date_v",
    "country_of_citizenship", "dob", "short_name", "long_name",
]
df = df.drop(*merge_helper_columns)

**Feature Engineering Phase**

In [33]:
# For every feature of df, count the values that are NaN or null
missing_counts = [
    count(when(isnan(name) | col(name).isNull(), name)).alias(name)
    for name in df.columns
]
df.select(missing_counts).show()

+---------+-------+---------+---------+----------------+--------------+---------+-----------+---------+----+--------+-------+---------+---------+------+------------------+-------------------+--------------------------+-----------------------+-----------------+---------------+-----------+-----------------+------------------+------------------+---------------------+---------------------+----------------+------------------+----------------+----------------+-------------+-------------+--------------+----------------+--------------------+-----------------------+---------------------+----------------+-------------------+-------------------+---------------------------+-------------------------+------------------------+------------------+--------------------+-------------------+-----------------------+--------------------+-----------------+--------------+----+--------+------------+---+
|player_id|overall|height_cm|weight_kg|nationality_name|preferred_foot|weak_foot|skill_moves|work_rate|pace|s

In [33]:
# Goalkeepers get 0 for every outfield attribute
# (pace, shooting, passing, dribbling, defending, physic);
# all other players keep their original values
for stat in ("pace", "shooting", "passing", "dribbling", "defending", "physic"):
    df = df.withColumn(stat, when(col("position") == "Goalkeeper", 0).otherwise(col(stat)))

In [34]:
# Fall back to position whenever sub_position is null
df = df.withColumn("sub_position", coalesce(col("sub_position"), col("position")))

In [35]:
# Players whose position is not Goalkeeper get goalkeeping_speed = 0
is_outfield = col("position") != "Goalkeeper"
df = df.withColumn("goalkeeping_speed", when(is_outfield, 0).otherwise(col("goalkeeping_speed")))

In [46]:
# Convert the Spark DataFrame to a pandas DataFrame and write it out as a
# single CSV file (without the pandas index column)
import pandas as pd
df_p = df.toPandas()
df_p.to_csv("task2dataset.csv", index=False)

In [None]:
# save df in csv
# df.write.csv("task2dataset.csv", header=True)

In [None]:
#from autoviz.AutoViz_Class import AutoViz_Class
#AV = AutoViz_Class()
#dfc = AV.AutoViz("task2dataset.csv")

The player_id feature has to be divided by 100,000 to minimize its weight.
We consider two possibilities: k-means and k-prototype.
- k-means:
  - one-hot-encoding on the categorical variables
  - min-max normalization of the dataset to the range [0, 1] (excluding player_id)
  - increase the weight of some features (age, market_value and maybe others features)
  - apply PCA
  - apply k-means
- k-prototype:
  - min-max normalization of the numerical features to the range [0, 1] (excluding player_id)
  - increase the weight of some features (age, market_value)
  - apply PCA (optional; still to be decided)
  - apply k-prototype
