# Feature Transformation
**In this notebook, we transform the features in order to rescale those that have very different scales.**

In [29]:
import numpy as np
import pandas as pd
from scipy.stats import zscore
from sklearn.preprocessing import MinMaxScaler

'C:\\Users\\leona\\PycharmProjects\\BDC_project\\valuation forecasting'

In [31]:
import os

# Move the working directory one level up; dataset.csv is read relative to it.
# NOTE(review): the move is relative to the current working directory, so
# re-running this cell without restarting the kernel climbs one more level.
os.chdir(os.pardir)

# Load dataset.csv into a pandas DataFrame.
df_pandas = pd.read_csv("dataset.csv")

**Encoding**

We have to execute the mapping for:
- player_id (it will be divided by 100k)
- date_c in timestamp
- current_club_id
- citizenship
- position
- sub_position
- competitions_id
- clubs_id

In [38]:
# Drop the features considered useless for the rest of the notebook.
# pandas' DataFrame.drop expects all labels as ONE list (passed here through
# `columns=`). Passing each name as its own positional argument is the PySpark
# calling convention; in pandas the extra strings are bound to `axis`, `index`,
# ... and the call fails instead of dropping the columns.
df_pandas = df_pandas.drop(
    columns=[
        "name",
        "date_birth",
        "games_played_club",
        "games_won_club",
        "games_draw_club",
        "games_lost_club",
    ]
)

In [48]:
# Parse date_v into a datetime column.
df_pandas["date_v"] = pd.to_datetime(df_pandas["date_v"])

# Replace each categorical column by the integer code of its category, so that
# every feature is numerical. The same two-step conversion
# (astype('category') -> .cat.codes) is applied to each listed column.
for categorical_col in (
    "position",
    "sub_position",
    "citizenship",
    "current_club_id",
    "competitions_id",
    "clubs_id",
):
    df_pandas[categorical_col] = df_pandas[categorical_col].astype("category").cat.codes

In [13]:
# Print a summary of the frame: column names, non-null counts and dtypes.
df_pandas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135878 entries, 0 to 135877
Data columns (total 21 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   player_id          135878 non-null  int32         
 1   date_v             135878 non-null  datetime64[ns]
 2   market_value       135878 non-null  int32         
 3   age                135878 non-null  int32         
 4   current_club_id    135878 non-null  int16         
 5   height             135878 non-null  int32         
 6   citizenship        135878 non-null  int16         
 7   position           135878 non-null  int8          
 8   sub_position       135878 non-null  int8          
 9   assists            135878 non-null  int32         
 10  goals              135878 non-null  int32         
 11  minutes_played     135878 non-null  int32         
 12  red_cards          135878 non-null  int32         
 13  yellow_cards       135878 non-null  int32   

## Transformation
We apply z-score to every column except age, last_valuation, market_value and those to be encoded.
We apply min-max scaling to age because with z-score the values would all be NaN.
We rescale market_value (the labels) and last_valuation by dividing them by maximum/20 (1e+7).

In [53]:
# Break date_v into separate year / month / day columns, then remove the
# original date_v column from the frame.
date_v_parsed = pd.to_datetime(df_pandas['date_v'])

df_pandas = df_pandas.assign(
    year_v=date_v_parsed.dt.year,
    month_v=date_v_parsed.dt.month,
    day_v=date_v_parsed.dt.day,
).drop(columns="date_v")

In [54]:
# Set the already-encoded categorical columns aside in `to_encoding` and remove
# them from df_pandas, leaving only the columns that go through rescaling.
encoded_columns = [
    'citizenship',
    'current_club_id',
    'position',
    'sub_position',
    'competitions_id',
    'clubs_id',
]
to_encoding = df_pandas[encoded_columns]
df_pandas = df_pandas.drop(columns=encoded_columns)

In [55]:
# age: min-max scaling (fit and applied on the age column only).
scaler = MinMaxScaler()
age_minmax = scaler.fit_transform(df_pandas[['age']])

# market_value: divide by (column maximum / 20).
scaling_factor_market_value = df_pandas['market_value'].to_numpy().max() / 20
print(f"the scaling factor of market_valuea is: {scaling_factor_market_value}")
market_val_scaled = df_pandas['market_value'] / scaling_factor_market_value

# last_valuation: same rule, divide by (column maximum / 20).
scaling_factor_valuation = df_pandas['last_valuation'].to_numpy().max() / 20
print(f"the scaling factor of last_valuations is: {scaling_factor_valuation}")
last_val_scaled = df_pandas['last_valuation'] / scaling_factor_valuation

# z-score every column of df_pandas, then overwrite the three columns that use
# their own scaling with the values computed above.
df_zscore = df_pandas.apply(zscore, axis=0).assign(
    market_value=market_val_scaled,
    age=age_minmax,
    last_valuation=last_val_scaled,
)

the scaling factor of market_valuea is: 10000000.0
the scaling factor of last_valuations is: 10000000.0


In [56]:
# Print a summary of the rescaled frame: column names, non-null counts and dtypes.
df_zscore.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135878 entries, 0 to 135877
Data columns (total 19 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   player_id          135878 non-null  float64
 1   market_value       135878 non-null  float64
 2   age                135878 non-null  float64
 3   height             135878 non-null  float64
 4   assists            135878 non-null  float64
 5   goals              135878 non-null  float64
 6   minutes_played     135878 non-null  float64
 7   red_cards          135878 non-null  float64
 8   yellow_cards       135878 non-null  float64
 9   last_valuation     135878 non-null  float64
 10  appearances        135878 non-null  float64
 11  games_won_pl       135878 non-null  float64
 12  games_draw_pl      135878 non-null  float64
 13  games_lost_pl      135878 non-null  float64
 14  winning_rate_pl    135878 non-null  float64
 15  winning_rate_club  135878 non-null  float64
 16  ye

In [57]:
# Append the columns held in `to_encoding` to the right of the rescaled frame
# (column-wise concatenation, aligned on the index).
# NOTE(review): the result is assigned back to df_zscore, so re-running this
# cell appends the same columns a second time.
df_zscore = pd.concat([df_zscore, to_encoding], axis=1)

In [58]:
# Save the resulting dataframe to dataset_normalized.csv (relative to the
# current working directory), without writing the index as a column.
df_zscore.to_csv("dataset_normalized.csv", index=False)