# Binary Encoding

In [1]:
import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
import numpy as np
from pyspark.sql.functions import udf
from pyspark.sql.types import BinaryType
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
from pyspark.ml.feature import StringIndexer

In [None]:
# Create the session.
# The memory limits (15G executor, 50G driver, 40G max result size) are
# hard-coded here; adjust them to the machine that runs the notebook.
conf = (
    SparkConf()
    .set('spark.ui.port', "4050")
    .set('spark.executor.memory', '15G')
    .set('spark.driver.memory', '50G')
    .set('spark.driver.maxResultSize', '40G')
    .setAppName("PySparkProject")
    .set('spark.executor.cores', "10")
    .setMaster("local[*]")
)

# getOrCreate reuses an already running context/session if there is one.
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()

# Show the effective configuration through the public accessor rather than
# the private `_conf` attribute.
sc.getConf().getAll()

In [None]:
# Load dataset.csv into a Spark DataFrame: the first row holds the column
# names and the column types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('dataset.csv')
)

**Encoding**

We have to apply the encoding to the following columns:
- player_id (it will be divided by 100k)
- date_c (converted to a timestamp)
- current_club_id
- citizenship
- position
- sub_position
- competitions_id
- clubs_id

In [12]:
#### CURRENT_CLUB_ID, CITIZENSHIP, POSITION, SUBPOSITION ####

def binary_encoding(df, column):
    """Add a ``<column>_encoded`` column holding a fixed-width binary code.

    Each distinct value of ``column`` is mapped to an integer index by a
    ``StringIndexer`` fitted on ``df``. That index is written in base 2 and
    left-padded with ``'0'`` so that every row has the same number of digits.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Input DataFrame.
    column : str
        Name of the categorical column to encode.

    Returns
    -------
    pyspark.sql.DataFrame
        The input rows plus the string column ``column + "_encoded"``.
    """
    # Local import: keeps the function independent of the notebook's
    # wildcard imports from pyspark.sql.functions.
    from pyspark.sql import functions as F

    # Number of binary digits: floor(log2(n)) + 1, computed with exact integer
    # arithmetic. max(..., 1) keeps one digit for an empty DataFrame, where the
    # float-based log2(0) used to raise OverflowError.
    n_distinct = df.select(column).distinct().count()
    n_bits = max(n_distinct, 1).bit_length()

    outputCol = column + "_encoded"
    # Fit the StringIndexer on the DataFrame and apply it to get the index column
    indexer_model = StringIndexer(inputCol=column, outputCol=outputCol).fit(df)
    indexed = indexer_model.transform(df)

    # The index is cast to an integer first so the decimal text handed to
    # conv() has no fractional part, then rendered in base 2 and zero-padded.
    index_as_text = F.col(outputCol).cast(IntegerType()).cast("string")
    binary_code = F.lpad(F.conv(index_as_text, 10, 2), n_bits, "0")

    return indexed.withColumn(outputCol, binary_code)

# Encode every single-valued categorical column, in this order.
for categorical_column in ("current_club_id", "citizenship", "position", "sub_position"):
    df = binary_encoding(df, categorical_column)


We transform the columns `competitions_id` and `clubs_id`, which are arrays of strings, into a single binary string each using label binarization (one indicator digit per distinct value).

In [14]:
# Convert the Spark DataFrame to a pandas DataFrame.
# toPandas() collects every row into the driver's memory, so the result must fit there.
df_pandas = df.toPandas()

  series = series.astype(t, copy=False)
  series = series.astype(t, copy=False)


In [15]:
# Build one indicator column per distinct competition id: expand each row's
# array into a long Series (original row index on level 0), one-hot encode the
# values, then add the indicators back up per original row.
# groupby(level=0).sum() replaces sum(level=0), which is deprecated (it emits a
# FutureWarning) and was removed in pandas 2.0. dtype=int keeps the indicators
# numeric so the joined string is made of digits on every pandas version.
dummies = (
    pd.get_dummies(df_pandas["competitions_id"].apply(pd.Series).stack(), dtype=int)
    .groupby(level=0)
    .sum()
)
# concatenate the indicator values of each row into a single string
dummies["comp_string"] = dummies.apply(lambda x: "".join(x.astype(str)), axis=1)
# join the encoded string back onto the original dataframe (aligned on the row index)
df_pandas = df_pandas.join(dummies["comp_string"])
# The source column is kept; use df_pandas.drop(columns="competitions_id") to remove it.


  dummies = pd.get_dummies(df_pandas["competitions_id"].apply(pd.Series).stack()).sum(level=0)


In [16]:

# Build one indicator column per distinct club id: expand each row's array
# into a long Series (original row index on level 0), one-hot encode the
# values, then add the indicators back up per original row.
# groupby(level=0).sum() replaces sum(level=0), which is deprecated (it emits a
# FutureWarning) and was removed in pandas 2.0. dtype=int keeps the indicators
# numeric so the joined string is made of digits on every pandas version.
dummies = (
    pd.get_dummies(df_pandas["clubs_id"].apply(pd.Series).stack(), dtype=int)
    .groupby(level=0)
    .sum()
)
# concatenate the indicator values of each row into a single string
dummies["club_string"] = dummies.apply(lambda x: "".join(x.astype(str)), axis=1)
# join the encoded string back onto the original dataframe (aligned on the row index)
df_pandas = df_pandas.join(dummies["club_string"])
# The source column is kept; use df_pandas.drop(columns="clubs_id") to remove it.


  dummies = pd.get_dummies(df_pandas["clubs_id"].apply(pd.Series).stack()).sum(level=0)


In [12]:
# Summarize the pandas frame: per-column non-null counts and dtypes.
df_pandas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135917 entries, 0 to 135916
Data columns (total 21 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   player_id          135917 non-null  int64         
 1   date_v             135917 non-null  datetime64[ns]
 2   market_value       135917 non-null  int64         
 3   age                135878 non-null  float64       
 4   current_club_id    135917 non-null  int16         
 5   height             135917 non-null  int64         
 6   citizenship        135917 non-null  int16         
 7   position           135917 non-null  int8          
 8   sub_position       135917 non-null  int8          
 9   assists            135917 non-null  int64         
 10  goals              135917 non-null  int64         
 11  minutes_played     135917 non-null  int64         
 12  red_cards          135917 non-null  int64         
 13  yellow_cards       135917 non-null  int64   

In [19]:
# Preview the first rows to check the encoded columns.
df_pandas.head()

Unnamed: 0,player_id,date_v,market_value,age,current_club_id,height,citizenship,position,sub_position,assists,...,games_draw_pl,games_lost_pl,winning_rate_pl,winning_rate_club,current_club_id_encoded,citizenship_encoded,position_encoded,sub_position_encoded,comp_string,club_string
0,0.00026,2015-02-04,3000000,34,16,190,Germany,Goalkeeper,Goalkeeper,0,...,3,8,1.3,1.7,1000111,101,11,100,0000010000000000000000001000000000000000000,0000000000000000000000000000000000000000000000...
1,0.00026,2015-07-01,2000000,34,16,190,Germany,Goalkeeper,Goalkeeper,0,...,5,13,1.5,1.6,1000111,101,11,100,0000010000000000000000001000000000000000000,0000000000000000000000000000000000000000000000...
2,0.00026,2015-10-16,1000000,35,16,190,Germany,Goalkeeper,Goalkeeper,0,...,5,10,1.6,1.8,1000111,101,11,100,0000010000001100000000001000000000000000000,0000000000000000000000000000000000000000000000...
3,0.00026,2016-02-15,1000000,35,16,190,Germany,Goalkeeper,Goalkeeper,0,...,3,6,1.8,2.1,1000111,101,11,100,0000010000001100000000001000000000000000000,0000000000000000000000000000000000000000000000...
4,0.00026,2016-07-22,1000000,35,16,190,Germany,Goalkeeper,Goalkeeper,0,...,2,3,2.1,2.3,1000111,101,11,100,0000000000001100000000001000000000000000000,0000000000000000000000000000000000000000000000...


In [23]:

# Final names for the encoded columns (old name -> new name).
# NOTE(review): 'citizenship_encoded_binary' does not follow the '<name>_binary'
# pattern of the other entries — confirm whether anything downstream relies on
# it before changing it.
column_rename = {
    'current_club_id_encoded': 'current_club_id_binary',
    'citizenship_encoded': 'citizenship_encoded_binary',
    'position_encoded': 'position_binary',
    'sub_position_encoded': 'sub_position_binary',
    'comp_string': 'competitions_id_binary',
    'club_string': 'clubs_id_binary',
}

df_pandas = df_pandas.rename(columns=column_rename)


In [13]:
# Optional export, left disabled.
# NOTE(review): the target 'dataset.csv' is the same path the notebook reads its
# input from, so enabling this line overwrites the input file.
#df_pandas.to_csv('dataset.csv', sep=',', encoding='utf-8', index=False)