# Feature Engineering and Encoding
**In this file, we will handle the cases in which the features values ​​are null or equal to 0. Next, some features need to be encoded to prepare the dataset for training**

In [1]:
import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
import numpy as np
from pyspark.sql.functions import udf
from pyspark.sql.types import BinaryType
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
from pyspark.ml.feature import StringIndexer

In [2]:
# Create the session
conf = SparkConf(). \
    set('spark.ui.port', "4050"). \
    set('spark.executor.memory', '15G'). \
    set('spark.driver.memory', '50G'). \
    set('spark.driver.maxResultSize', '40G'). \
    setAppName("PySparkProject"). \
    set('spark.executor.cores', "10"). \
    setMaster("local[*]")

sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()

sc._conf.getAll()

[('spark.app.startTime', '1684163707774'),
 ('spark.app.id', 'local-1684163708938'),
 ('spark.executor.memory', '15G'),
 ('spark.driver.host', 'LAPTOP-JLLVBEPM'),
 ('spark.executor.id', 'driver'),
 ('spark.driver.memory', '50G'),
 ('spark.driver.port', '57928'),
 ('spark.executor.cores', '10'),
 ('spark.app.name', 'PySparkProject'),
 ('spark.driver.extraJavaOptions',
  '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.secur

In [3]:
# open data.csv as pyspark dataframe
df = spark.read.csv('dataset_preprocessing.csv', header=True, inferSchema=True)

The features that we have to manage are: 
- last valuation (13,45 % null --> from null to 0)
- sub position (8,19 % null --> position)
- age (0,04 % null --> delete examples)
- date_birth (0,04 % null --> delete examples)
- height (some values are 0 --> average height)

In [4]:
# the null values in the column last_valuation must be replaced with 0
df = df.fillna({'last_valuation': 0})

In [5]:
# the null values in the column last_position must be replaced with the value in the column position
df = df.withColumn("sub_position", coalesce(col("sub_position"), col("position")))

In [6]:
# drop instances in which the column age or date_of_birth are null
df = df.dropna(subset=('age', 'date_birth'))

In [7]:
# filter the dataframe to keep only the rows in which the column height is not 0
filtered_df = df.filter(col("height") != 0)

# average height of filtered_df
average_height = filtered_df.selectExpr("avg(height) as height_average").first()["height_average"]

# replace the value 0 in the column height with the mean of the column
df = df.withColumn("height", when(col("height") == 0, average_height).otherwise(col("height")))

In [8]:
#trasform the column height, last_valuation and age in integer
df = df.withColumn("height", df["height"].cast(IntegerType()))
df = df.withColumn("last_valuation", df["last_valuation"].cast(IntegerType()))
df = df.withColumn("age", df["age"].cast(IntegerType()))

In [9]:
# Convert clubs id and competitions id from list to string
df = df.withColumn("competitions_id", split(expr("substring(competitions_id, 2, length(competitions_id)-2)"), ", "))
df = df.withColumn("clubs_id", split(expr("substring(clubs_id, 2, length(clubs_id)-2)"), ", "))

**Encoding**

We have to execute the label binarization for:
- player_id (it will be divided by 100k)
- date_c in timestamp
- current_club_id
- citizenship
- position
- sub_position
- competitions_id
- clubs_id

In [10]:
#### PLAYER_ID ####

# divided the column player_id for 100000
df = df.withColumn("player_id", df["player_id"] / 100000)

In [11]:
df.show()

+---------+-------------------+------------+------------------+-------------------+---+---------------+------+-----------+----------+------------------+--------------------+-----------+-------+-----+--------------+---------+------------+--------------+-----------+------------+-------------+-------------+---------------+-----------------+--------------+---------------+---------------+-----------------+
|player_id|             date_v|market_value|              name|         date_birth|age|current_club_id|height|citizenship|  position|      sub_position|     competitions_id|   clubs_id|assists|goals|minutes_played|red_cards|yellow_cards|last_valuation|appearances|games_won_pl|games_draw_pl|games_lost_pl|winning_rate_pl|games_played_club|games_won_club|games_draw_club|games_lost_club|winning_rate_club|
+---------+-------------------+------------+------------------+-------------------+---+---------------+------+-----------+----------+------------------+--------------------+-----------+-----

In [12]:
#### CURRENT_CLUB_ID, CITIZENSHIP, POSITION, SUBPOSITION ####

def binary_encoding(df, column):

    # count the number of distinct values in the column
    distinct_values = int(np.log2(df.select(column).distinct().count())) + 1

    outputCol = column + "_encoded"
    # Create the StringIndexer instance by specifying the input and output columns
    indexer = StringIndexer(inputCol=column, outputCol=outputCol)
    # Train the StringIndexer on the DataFrame
    indexer_model = indexer.fit(df)
    # Apply the transformation to the DataFrame
    df = indexer_model.transform(df)
    #df = df.withColumn(outputCol, df[outputCol])
    # convert the column outputCol in integer
    df = df.withColumn(outputCol, df[outputCol].cast(IntegerType()))
    # convert the column outputCol in binary
    df = df.withColumn(outputCol, expr("LPAD(CONV(" + outputCol + ", 10, 2), " + str(distinct_values) + ", '0')"))

    return df

df = binary_encoding(df, "current_club_id")
df = binary_encoding(df, "citizenship")
df = binary_encoding(df, "position")
df = binary_encoding(df, "sub_position")

We transform the column competitions id and clubs id, that are array of strings in a single binary value using label binarization.

In [14]:
# convert df in pandas dataframe
df_pandas = df.toPandas()

  series = series.astype(t, copy=False)
  series = series.astype(t, copy=False)


In [15]:
# apply pd.get_dummies to the column of arrays
dummies = pd.get_dummies(df_pandas["competitions_id"].apply(pd.Series).stack()).sum(level=0)
# concatenate the dummy variables into a single string
dummies["comp_string"] = dummies.apply(lambda x: "".join(x.astype(str)), axis=1)
# join the dummies dataframe with the original dataframe
df_pandas = df_pandas.join(dummies["comp_string"])
#df_pandas = df_pandas.drop("competitions_id")

  dummies = pd.get_dummies(df_pandas["competitions_id"].apply(pd.Series).stack()).sum(level=0)


In [16]:
# apply pd.get_dummies to the column of arrays
dummies = pd.get_dummies(df_pandas["clubs_id"].apply(pd.Series).stack()).sum(level=0)
# concatenate the dummy variables into a single string
dummies["club_string"] = dummies.apply(lambda x: "".join(x.astype(str)), axis=1)
# join the dummies dataframe with the original dataframe
df_pandas = df_pandas.join(dummies["club_string"])
#df_pandas = df_pandas.drop("club_str")

  dummies = pd.get_dummies(df_pandas["clubs_id"].apply(pd.Series).stack()).sum(level=0)


In [17]:
df_pandas = df_pandas.drop(["name", "date_birth", "games_played_club", "games_won_club", "games_draw_club", "games_lost_club", "competitions_id", "clubs_id"], axis=1)

In [20]:
df_pandas = df_pandas.drop(["citizenship", "current_club_id", "position", "sub_position"], axis=1)

In [21]:
df_pandas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135878 entries, 0 to 135877
Data columns (total 23 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   player_id                135878 non-null  float64       
 1   date_v                   135878 non-null  datetime64[ns]
 2   market_value             135878 non-null  int32         
 3   age                      135878 non-null  int32         
 4   height                   135878 non-null  int32         
 5   assists                  135878 non-null  int32         
 6   goals                    135878 non-null  int32         
 7   minutes_played           135878 non-null  int32         
 8   red_cards                135878 non-null  int32         
 9   yellow_cards             135878 non-null  int32         
 10  last_valuation           135878 non-null  int32         
 11  appearances              135878 non-null  int32         
 12  games_won_pl    

In [19]:
df_pandas.head()

Unnamed: 0,player_id,date_v,market_value,age,current_club_id,height,citizenship,position,sub_position,assists,...,games_draw_pl,games_lost_pl,winning_rate_pl,winning_rate_club,current_club_id_encoded,citizenship_encoded,position_encoded,sub_position_encoded,comp_string,club_string
0,0.00026,2015-02-04,3000000,34,16,190,Germany,Goalkeeper,Goalkeeper,0,...,3,8,1.3,1.7,1000111,101,11,100,0000010000000000000000001000000000000000000,0000000000000000000000000000000000000000000000...
1,0.00026,2015-07-01,2000000,34,16,190,Germany,Goalkeeper,Goalkeeper,0,...,5,13,1.5,1.6,1000111,101,11,100,0000010000000000000000001000000000000000000,0000000000000000000000000000000000000000000000...
2,0.00026,2015-10-16,1000000,35,16,190,Germany,Goalkeeper,Goalkeeper,0,...,5,10,1.6,1.8,1000111,101,11,100,0000010000001100000000001000000000000000000,0000000000000000000000000000000000000000000000...
3,0.00026,2016-02-15,1000000,35,16,190,Germany,Goalkeeper,Goalkeeper,0,...,3,6,1.8,2.1,1000111,101,11,100,0000010000001100000000001000000000000000000,0000000000000000000000000000000000000000000000...
4,0.00026,2016-07-22,1000000,35,16,190,Germany,Goalkeeper,Goalkeeper,0,...,2,3,2.1,2.3,1000111,101,11,100,0000000000001100000000001000000000000000000,0000000000000000000000000000000000000000000000...


In [23]:
column_rename = {'current_club_id_encoded': 'current_club_id_binary', 'citizenship_encoded': 'citizenship_encoded_binary', 'position_encoded': 'position_binary',
                  'sub_position_encoded': 'sub_position_binary', 'comp_string': 'competitions_id_binary', 'club_string': 'clubs_id_binary'}

df_pandas = df_pandas.rename(columns=column_rename)

In [24]:
df_pandas.to_csv('dataset.csv', sep=',', encoding='utf-8', index=False)