In [11]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from alibi_detect.cd import TabularDrift, KSDrift
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy import stats

# Base directory that contains the Modelling/ folder with the merged table.
path = "/Users/konstantin/Documents/Projects/McGill/Football_Transfermarkt"

# Read the merged table (first CSV column becomes the index) and discard the
# player identifier column.
data = (
    pd.read_csv(f"{path}/Modelling/data_merged.csv", index_col=0)
    .drop(columns=['player_id'])
)

# Feature frame: every column except the two market-value columns.
x = data.drop(
    columns=[
        'mean_market_value_in_eur',
        'max_market_value_in_eur',
    ]
)

# Target frame: the mean market value, with 'year' kept alongside it so the
# later cells can slice the target per year.
y = data[['year', 'mean_market_value_in_eur']]

### Preparing the data

In [12]:
# Feature columns in their original order; 'year' is removed because it is
# only used to slice the data, not as a feature.
feature_names = list(x.columns)
feature_names.remove('year')

# GET CATEGORICAL FEATURES
cat_cols = ['position', 'sub_position', 'foot', 'continent']

# Keys are the positional indices of the categorical columns within
# feature_names; every value is None.
# NOTE(review): presumably None lets the drift detector infer the category
# values from the reference data - confirm against the alibi-detect docs.
categories_per_feature = {
    position: None
    for position, name in enumerate(feature_names)
    if name in cat_cols
}

# LABEL ENCODING
# Each categorical column of x is overwritten with its integer codes, using
# a fresh encoder per column.
for cat_name in cat_cols:
    x[cat_name] = LabelEncoder().fit_transform(x[cat_name])

### Feature Drift (distribution of X) - Konstantin

In [3]:
# GET DATA
# Feature columns in the order they appear in x; 'year' only selects the
# yearly slices and is not tested for drift itself.
feature_names = x.columns.tolist()
feature_names.remove('year')

# YEARS TO ITERATE OVER
# Sort explicitly so that "previous year" does not depend on the row order of
# the input file; the earliest year has no predecessor and is skipped.
years = sorted(x.year.drop_duplicates().tolist())[1:]

# DRIFT
# One row per year: the year followed by one p-value per feature, comparing
# that year's sample against the previous year's sample.
results = []
for year in years:
    x_ref = x.query(f'year == {year-1}').drop(columns=['year'])
    x_test = x.query(f'year == {year}').drop(columns=['year'])

    # A gap in the year sequence leaves no reference sample; skip that year
    # instead of fitting the detector on an empty array.
    if x_ref.empty:
        continue

    cd = TabularDrift(x_ref.to_numpy(), p_val=0.05, categories_per_feature=categories_per_feature)
    preds = cd.predict(x_test.to_numpy(), drift_type='feature', return_p_val=True, return_distance=True)

    p_vals = preds['data']['p_val']
    res_sng = [int(year)]
    res_sng.extend(p_vals[col] for col in range(len(feature_names)))
    results.append(res_sng)

# PREPARE THE DATA
# Label the columns in exactly the order the rows were built (year first, then
# the features). Using x.columns here would silently mislabel every p-value
# whenever 'year' is not the first column of x.
results = pd.DataFrame(results, columns=['year'] + feature_names)
res_years = results.year.to_list()
res_vars = results.drop(columns=['year']).columns.tolist()
res_dat = results.drop(columns=['year']).T.to_numpy()

# VISUALIZATION
# Heatmap of p-values (features on the y-axis, years on the x-axis). Only the
# first colour stop is red, so only the lowest values (strongest evidence of
# drift) shade towards red.
# Note: a p-value is the probability of observing a difference at least this
# large if both years shared one distribution; small values indicate drift.
fig = go.Figure(data=go.Heatmap(
                   z=res_dat,
                   x=res_years,
                   y=res_vars,
                   hoverongaps = False,
                   colorscale=['red','green','green','green','green', 'green']))
fig.update_layout(title="Feature Drift - Probability the distribution is the same as last year",
                  xaxis_title="Year",
                  yaxis_title="Feature",)
fig.show(renderer='browser')

### Prior Drift (distribution of Y) - Konstantin

In [4]:
# YEARS TO ITERATE OVER
# Sort explicitly so that "previous year" does not depend on row order; the
# earliest year has no predecessor and is skipped.
years = sorted(y.year.drop_duplicates().tolist())[1:]

# CHECK DRIFT
# Kolmogorov-Smirnov drift test of each year's target values against the
# previous year's. Missing targets are dropped first so the test sees the same
# kind of data the plot below shows (which also drops them).
results = []
for year in years:
    y_ref = y.query(f'year == {year-1}').drop(columns=['year']).dropna()
    y_test = y.query(f'year == {year}').drop(columns=['year']).dropna()

    # No usable sample on one side (gap in the years, or all values missing):
    # there is nothing to compare, so leave this year without a verdict.
    if y_ref.empty or y_test.empty:
        continue

    cd = KSDrift(y_ref.to_numpy(), p_val=0.05)
    preds = cd.predict(y_test.to_numpy(), drift_type='feature', return_p_val=True, return_distance=True)

    # Single target column -> the first (only) per-feature flag.
    results.append({'year': int(year), 'is_drift': int(preds['data']['is_drift'][0])})

# CLEAN RESULTS
# Explicit columns and dtypes keep the merge below valid even when no year
# produced a verdict.
results = pd.DataFrame(results, columns=['year', 'is_drift']).astype({'year': int, 'is_drift': int})

# REMOVE OUTLIERS
# For plotting only: within each year, drop values whose absolute z-score
# exceeds z_limit so the boxes stay readable.
y_rm = y.copy()
y_rm['year'] = y_rm['year'].astype(int)
y_rm = y_rm.dropna(subset=['mean_market_value_in_eur'])
z_limit = 3.29
y_rm['z_score'] = y_rm.groupby('year')['mean_market_value_in_eur'].transform(lambda vals: stats.zscore(vals, ddof=1))
y_rm['z_score'] = np.abs(y_rm['z_score'])
y_rm = y_rm.query(f"z_score <= {z_limit}")
# Years without a verdict (e.g. the first year) get NaN from the left merge and
# are removed by dropna().
y_rm = y_rm.merge(results, on='year', how='left').dropna()

# Use explicit labels so each verdict can be bound to a fixed colour. With a
# bare colour sequence, colours are handed out in order of first appearance,
# so "red" could mean drift or no drift depending on the data.
y_rm['is_drift'] = y_rm['is_drift'].astype(int).map({1: 'drift', 0: 'no drift'})

# PLOT
fig = px.box(y_rm, x='year', color='is_drift',
             y='mean_market_value_in_eur',
             color_discrete_map={'drift': 'red', 'no drift': 'green'},
             points = False)
fig.update_layout(boxmode='overlay',
                  title="Prior Drift",
                  xaxis_title="Year",
                  yaxis_title="Mean Market Value",)
fig.show(renderer='browser')

### Concept Drift (P(Y|X) changes) - Konstantin

In [33]:
def _labelled_projection(components, market_values, year_label, pca_year_label):
    """Build one plotting frame from a 2-D PCA projection.

    components      -- array with two columns (the PCA scores)
    market_values   -- Series of target values, row-aligned with components
    year_label      -- string stored in 'year' (the year the rows belong to)
    pca_year_label  -- string stored in 'pca_year' (the year the PCA was fit on)

    Returns a DataFrame with columns pca1, pca2, market_val, year, pca_year,
    with rows containing missing values removed.
    """
    frame = pd.DataFrame(components, columns=['pca1', 'pca2'])
    # Positional assignment: both inputs come from the same rows in the same order.
    frame['market_val'] = market_values.to_numpy()
    frame['year'] = year_label
    frame['pca_year'] = pca_year_label
    return frame.dropna()


# YEARS TO ITERATE OVER
# Sort explicitly so that "previous year" does not depend on row order; the
# earliest year has no predecessor and is skipped.
years = sorted(x.year.drop_duplicates().tolist())[1:]

# PERFORM DISTRIBUTION COMPARISON
# For each year: fit scaler + PCA on the previous year (reference), project
# both the reference and the current year into that space, and collect the
# projections for plotting.
yearly_frames = []
for year in years:
    pca = PCA(n_components=2)
    scaled = StandardScaler()

    # GET DATA
    x_ref = x.query(f'year == {year-1}').drop(columns=['year'])
    x_test = x.query(f'year == {year}').drop(columns=['year'])

    # A gap in the year sequence leaves nothing to fit the reference on.
    if x_ref.empty:
        continue

    # SCALE DATA
    # The scaler is fit on the reference year only and then applied to the
    # test year, mirroring how the PCA is used below. Re-fitting on the test
    # year would standardise each year separately and erase the very shift in
    # location/scale this plot is meant to reveal.
    x_ref_scale = pd.DataFrame(scaled.fit_transform(x_ref), columns=x_ref.columns, index=x_ref.index)
    x_test_scale = pd.DataFrame(scaled.transform(x_test), columns=x_test.columns, index=x_test.index)

    # Attach the target by index and drop rows with any missing value.
    ref = x_ref_scale.merge(y['mean_market_value_in_eur'], left_index=True, right_index=True).dropna()
    test = x_test_scale.merge(y['mean_market_value_in_eur'], left_index=True, right_index=True).dropna()

    # A 2-component PCA needs at least two reference rows, and there must be
    # something to project.
    if len(ref) < 2 or test.empty:
        continue

    # CHECK GENERIC DISTRIBUTION
    x_ref_pca = pca.fit_transform(ref.drop(columns=['mean_market_value_in_eur']))
    x_test_pca = pca.transform(test.drop(columns=['mean_market_value_in_eur']))

    # COMBINE DATA
    # 'pca_year' is the reference year for both frames so they share a facet.
    ref_label = str(int(year - 1))
    yearly_frames.append(_labelled_projection(x_ref_pca, ref['mean_market_value_in_eur'], ref_label, ref_label))
    yearly_frames.append(_labelled_projection(x_test_pca, test['mean_market_value_in_eur'], str(int(year)), ref_label))

# Concatenate once instead of growing a DataFrame inside the loop.
if yearly_frames:
    results = pd.concat(yearly_frames)
else:
    results = pd.DataFrame(columns=['pca1', 'pca2', 'market_val', 'year', 'pca_year'])

# PLOT
# One facet per reference year; points are coloured by the year they belong
# to and sized by market value.
pca_years = ['2018','2019','2020','2021']
fig = px.scatter(results.query(f'pca_year in {pca_years}'), 
                x='pca1', y='pca2', color='year', size='market_val',
                trendline="lowess", trendline_options={'frac':0.1},
                facet_col='pca_year', facet_col_wrap=2)
fig.update_layout(title="Concept Drift - distribution of data over time",)
fig.show(renderer='browser')