# Feature Engineering and Encoding
**In this file, we will handle the cases in which the features values ​​are null or equal to 0. Next, some features need to be encoded to prepare the dataset for training**

In [1]:
import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
import numpy as np
from pyspark.sql.functions import udf
from pyspark.sql.types import BinaryType
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
import pandas as pd
from sklearn.preprocessing import LabelBinarizer

In [2]:
# Create the session
conf = SparkConf(). \
    set('spark.ui.port', "4050"). \
    set('spark.executor.memory', '15G'). \
    set('spark.driver.memory', '50G'). \
    set('spark.driver.maxResultSize', '40G'). \
    setAppName("PySparkProject"). \
    set('spark.executor.cores', "10"). \
    setMaster("local[*]")

sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()

sc._conf.getAll()

[('spark.executor.memory', '15G'),
 ('spark.app.submitTime', '1683979419185'),
 ('spark.app.id', 'local-1683979419759'),
 ('spark.executor.id', 'driver'),
 ('spark.driver.memory', '50G'),
 ('spark.executor.cores', '10'),
 ('spark.app.name', 'PySparkProject'),
 ('spark.driver.extraJavaOptions',
  '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED 

In [3]:
# open data.csv as pyspark dataframe
df = spark.read.csv('data.csv', header=True, inferSchema=True)

In [4]:
# the null values in the column last_valuation must be replaced with 0
df = df.fillna({'last_valuation': 0})

In [5]:
# show the first 5 rows of the dataframe with sub_position null
df.filter(df.sub_position.isNull()).show(5)

+---------+----------+------------+------------------+----------+----+---------------+------+-----------+----------+------------+--------------------+--------+-------+-----+--------------+---------+------------+--------------+-----------+------------+-------------+-------------+---------------+-----------------+--------------+---------------+---------------+-----------------+
|player_id|    date_v|market_value|              name|date_birth| age|current_club_id|height|citizenship|  position|sub_position|     competitions_id|clubs_id|assists|goals|minutes_played|red_cards|yellow_cards|last_valuation|appearances|games_won_pl|games_draw_pl|games_lost_pl|winning_rate_pl|games_played_club|games_won_club|games_draw_club|games_lost_club|winning_rate_club|
+---------+----------+------------+------------------+----------+----+---------------+------+-----------+----------+------------+--------------------+--------+-------+-----+--------------+---------+------------+--------------+-----------+----

In [6]:
# the null values in the column last_position must be replaced with the value in the column position
df = df.withColumn("sub_position", coalesce(col("sub_position"), col("position")))

In [7]:
# drop instances in which the column age or date_of_birth are null
df = df.dropna(subset=('age', 'date_birth'))

In [8]:
print(df.count())
df.dropDuplicates(["date_v", "player_id"])
df.count()

135878

In [10]:
# filter the dataframe to keep only the rows in which the column height is not 0
filtered_df = df.filter(col("age") != 0)

# average height of filtered_df
average_height = filtered_df.selectExpr("avg(height) as height_average").first()["height_average"]

In [11]:
# replace the value 0 in the column height with the mean of the column 
df = df.withColumn("height", when(col("height") == 0, average_height).otherwise(col("height")))
# TODO fare in modo che dopo la virgola ci sia 0

The features that we have to manage are: 
- last valuation (13,45 % null --> from null to 0)
- sub position (8,19 % null --> position)
- age (0,04 % null --> delete examples)
- date_birth (0,04 % null)
- height (some values are 0 --> average height)

We transform the column competitions id and clubs id, that are array of strings in a single binary value using label binarization.

In [12]:
# Convert clubs id and competitions id from list to string
df = df.withColumn("competitions_id", split(expr("substring(competitions_id, 2, length(competitions_id)-2)"), ", "))
df = df.withColumn("clubs_id", split(expr("substring(clubs_id, 2, length(clubs_id)-2)"), ", "))

In [13]:
# convert df in pandas dataframe
df_pandas = df.toPandas()

In [14]:
# apply pd.get_dummies to the column of arrays
dummies = pd.get_dummies(df_pandas["competitions_id"].apply(pd.Series).stack()).sum(level=0)
# concatenate the dummy variables into a single string
dummies["comp_string"] = dummies.apply(lambda x: "".join(x.astype(str)), axis=1)
# join the dummies dataframe with the original dataframe
df_pandas = df_pandas.join(dummies["comp_string"])
#df_pandas = df_pandas.drop("competitions_id")

  dummies = pd.get_dummies(df_pandas["competitions_id"].apply(pd.Series).stack()).sum(level=0)


In [15]:
# apply pd.get_dummies to the column of arrays
dummies = pd.get_dummies(df_pandas["clubs_id"].apply(pd.Series).stack()).sum(level=0)
# concatenate the dummy variables into a single string
dummies["club_string"] = dummies.apply(lambda x: "".join(x.astype(str)), axis=1)
# join the dummies dataframe with the original dataframe
df_pandas = df_pandas.join(dummies["club_string"])
#df_pandas = df_pandas.drop("club_str")

  dummies = pd.get_dummies(df_pandas["clubs_id"].apply(pd.Series).stack()).sum(level=0)


In [16]:
df_pandas = df_pandas.drop(["name", "date_birth", "games_played_club", "games_won_club", "games_draw_club", "games_lost_club", "competitions_id", "clubs_id"], axis=1)

prima fare label binarization e poi nn.Encoding pytorch

da encodare:
  - date_v in timestemp
  - current_club_id
    - citizenship
        - position
- sub_position
- comp_string
- club_str

In [17]:
df_pandas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135878 entries, 0 to 135877
Data columns (total 23 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   player_id          135878 non-null  int32  
 1   date_v             135878 non-null  object 
 2   market_value       135878 non-null  int32  
 3   age                135878 non-null  float64
 4   current_club_id    135878 non-null  int32  
 5   height             135878 non-null  float64
 6   citizenship        135878 non-null  object 
 7   position           135878 non-null  object 
 8   sub_position       135878 non-null  object 
 9   assists            135878 non-null  int32  
 10  goals              135878 non-null  int32  
 11  minutes_played     135878 non-null  int32  
 12  red_cards          135878 non-null  int32  
 13  yellow_cards       135878 non-null  int32  
 14  last_valuation     135878 non-null  float64
 15  appearances        135878 non-null  int32  
 16  ga

In [18]:
# divided the column player_id for 1 million
df_pandas["player_id"] = df_pandas["player_id"].apply(lambda x: x/100000)

In [19]:
# copia df_pandas in df_c
df_c = df_pandas.copy()

In [None]:
# drop name, date_birth     - games_played_club     - games_won_club     - games_draw_club     - games_lost_club
