In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import os
import numpy as np
import pickle
from tqdm import  tqdm
import shutil

pd.options.mode.chained_assignment = None
pd.options.display.min_rows = 50

In [2]:
data_dir = f'/home/samuel-linux/PycharmProjects/Personal/FantasyBasketball/Data'
pickle_dir = f'{data_dir}/pickles'
roto_cleaned_dir = f'{data_dir}/roto-files/cleaned'
roto_shifted_dir = f'{data_dir}/roto-files/shifted'

In [3]:
csv_dict = {i: pd.read_csv(f'{roto_cleaned_dir}/{i}') for i in os.listdir(roto_cleaned_dir) if 'Combined' not in i}


In [4]:
def fill_na(df_dict):
    new_df_dict = {}
    columns = ['Player', 'Date', 'FDP', 'FDSal']
    for key, df in df_dict.items():
#         df = df[[i for i in columns]]
        df_none = df[(df.FDP.isnull()) | (df.FDSal.isnull())]
        pbar = tqdm(df_none.iterrows(), total = len(df_none), desc = f'{key}: Filling in NA Values')
        for idx, row in pbar:
            player = row.Player
            player_df = df[df.Player == player]
            if str(row.FDP) == 'nan':
                mean_fdp = player_df.FDP.mean()
                if str(mean_fdp) == 'nan':
                    mean_fdp = None
                else:
                    mean_fdp = round(mean_fdp, 2)
                df.loc[idx, 'FDP'] = mean_fdp

            if str(row.FDSal) == 'nan':
                mean_fdsal = player_df.FDSal.mean()
                if str(mean_fdsal) == 'nan':
                    mean_fdsal = None
                else:
                    mean_fdsal = round(mean_fdsal, 2)
                df.loc[idx, 'FDSal'] = mean_fdsal          
        pbar.close()
#         df.dropna(inplace = True)
        df.dropna(subset = ['FDP', 'FDSal'])
        new_df_dict[key] = df
    return new_df_dict
cleaned_df_dict = fill_na(csv_dict)
        

2019-20.csv: Filling in NA Values: 100%|██████████| 956/956 [00:02<00:00, 424.68it/s]
2016-17.csv: Filling in NA Values: 100%|██████████| 9/9 [00:00<00:00, 939.26it/s]
2017-18.csv: Filling in NA Values: 100%|██████████| 77/77 [00:00<00:00, 1092.11it/s]
2018-19.csv: Filling in NA Values: 100%|██████████| 821/821 [00:02<00:00, 347.10it/s]
2020-21.csv: Filling in NA Values: 100%|██████████| 25/25 [00:00<00:00, 919.79it/s]


In [5]:
# cleaned_df_dict

In [6]:
def create_full_shift(df_dict, columns, timestep):
    df_pieces = []
    for key, df in df_dict.items():
        season = key[:key.find('.csv')]
        df.Date = pd.to_datetime(df.Date)
        pieces = []
        unique_players = df.Player.unique()
        pbar = tqdm(unique_players)
        player_pieces = []
        for player in pbar: 
            pbar.set_description(f'{key}: Shifting the DF w/ timestep {timestep}: {player}')
            player_df = df[df.Player == player]
            player_df = player_df.sort_values(by = 'Date', ascending = True)
            for column in columns:
#                 player_df[column] = StandardScaler().fit_transform(player_df[column].values.reshape(-1,1))
                for step in range(1, timestep + 1):
                    player_df[f'{column}_{step}'] = player_df[column].shift(step)
            player_pieces.append(player_df)
        final_df = pd.concat(player_pieces)
        final_df['Season'] = [season for i in range(len(final_df))]
        df_pieces.append(final_df)
    full_final_shifted = pd.concat(df_pieces)
    full_final_shifted.dropna(inplace = True)
    full_final_shifted.to_csv(f'{roto_shifted_dir}/Combined_{timestep}.csv', index = False)
      
    return full_final_shifted

# test_df = create_full_shift(cleaned_df_dict, ['FDP'], 5)
# test_df
shutil.rmtree(roto_shifted_dir)
os.mkdir(roto_shifted_dir)
df_dict = {}
for timestep in range(1, 21):       
    df_dict[timestep] = create_full_shift(cleaned_df_dict, ['FDP'], timestep) 

2019-20.csv: Shifting the DF w/ timestep 1: Willie Cauley-Stein: 100%|██████████| 641/641 [00:02<00:00, 266.46it/s]     
2016-17.csv: Shifting the DF w/ timestep 1: James Harden: 100%|██████████| 14/14 [00:00<00:00, 272.02it/s]
2017-18.csv: Shifting the DF w/ timestep 1: Cristiano Felicio: 100%|██████████| 14/14 [00:00<00:00, 306.00it/s]
2018-19.csv: Shifting the DF w/ timestep 1: Willie Cauley-Stein: 100%|██████████| 557/557 [00:02<00:00, 214.36it/s]     
2020-21.csv: Shifting the DF w/ timestep 1: Carmelo Anthony: 100%|██████████| 39/39 [00:00<00:00, 372.15it/s]
2019-20.csv: Shifting the DF w/ timestep 2: Willie Cauley-Stein: 100%|██████████| 641/641 [00:02<00:00, 238.28it/s]     
2016-17.csv: Shifting the DF w/ timestep 2: James Harden: 100%|██████████| 14/14 [00:00<00:00, 261.88it/s]
2017-18.csv: Shifting the DF w/ timestep 2: Cristiano Felicio: 100%|██████████| 14/14 [00:00<00:00, 415.43it/s]
2018-19.csv: Shifting the DF w/ timestep 2: Willie Cauley-Stein: 100%|██████████| 557/557

2019-20.csv: Shifting the DF w/ timestep 15: Willie Cauley-Stein: 100%|██████████| 641/641 [00:05<00:00, 109.68it/s]     
2016-17.csv: Shifting the DF w/ timestep 15: James Harden: 100%|██████████| 14/14 [00:00<00:00, 133.37it/s]
2017-18.csv: Shifting the DF w/ timestep 15: Cristiano Felicio: 100%|██████████| 14/14 [00:00<00:00, 121.24it/s]
2018-19.csv: Shifting the DF w/ timestep 15: Willie Cauley-Stein: 100%|██████████| 557/557 [00:05<00:00, 100.84it/s]     
2020-21.csv: Shifting the DF w/ timestep 15: Carmelo Anthony: 100%|██████████| 39/39 [00:00<00:00, 136.28it/s]         
2019-20.csv: Shifting the DF w/ timestep 16: Willie Cauley-Stein: 100%|██████████| 641/641 [00:06<00:00, 102.52it/s]    
2016-17.csv: Shifting the DF w/ timestep 16: James Harden: 100%|██████████| 14/14 [00:00<00:00, 118.94it/s]
2017-18.csv: Shifting the DF w/ timestep 16: Cristiano Felicio: 100%|██████████| 14/14 [00:00<00:00, 126.53it/s]
2018-19.csv: Shifting the DF w/ timestep 16: Willie Cauley-Stein: 100%|██

In [12]:
def create_shift_splits(df_dict):
    shifted_pickle_dir = f'{pickle_dir}/roto/shifted'
    shutil.rmtree(shifted_pickle_dir)
    os.mkdir(shifted_pickle_dir)
    pbar = tqdm(df_dict.items(), total = len(df_dict))
    for timestep, df in pbar:
        pbar.set_description(f'{timestep} | Getting Pickles...')
        sampled_df = df.sample(frac = 1).reset_index(drop = True)
#         feature_columns = [i for i in sampled_df.columns if '_' in i]
#         X = sampled_df[['Player']+feature_columns]
        ignore = ['FDP', 'YHSal', 'YHChange', 'FDPos', 'DKPos', 'DDPos', 'YHPos', 'Season', 'GID', 'Date']
        X = sampled_df[[i for i in sampled_df if i not in ignore]]
        Y = sampled_df[['FDP']]
        x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state = 10, 
                                                           train_size = .8)       
        x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, random_state = 10, train_size = .9)
        
#         sc = StandardScaler()
#         final_pickle_dict = { }
        pickle_dict = {'x_train': x_train, 'x_test': x_test, 'x_val': x_val, 
                      'y_train': y_train, 'y_test': y_test, 'y_val': y_val}
        
#         for key, df in pickle_dict.items():
#             unique_players = df.Player.unique()
#             df_pieces = []
#             for player in unique_players:
#                 player_df = df[df.Player == player]
#                 print(len(player_df))
#                 for c in feature_columns:
#                     player_df[c] = sc.fit_transform(player_df[c].values.reshape(-1,1))
#                     print(player_df)
#                 df_pieces.append(player_df)
#             final_df = pd.concat(df_pieces)
#             final_pickle_dict[key] = final_df
    
        pickle.dump(pickle_dict, open(f'{shifted_pickle_dir}/{timestep}-TrainTestSplit.p', 'wb'))
    pbar.close()
        
        
create_shift_splits(df_dict)

20 | Getting Pickles...: 100%|██████████| 20/20 [00:00<00:00, 164.23it/s]


In [13]:
assert False

AssertionError: 

In [None]:
# test = shifted_df[shifted_df.Player == 'Drew Eubanks'].sort_values(by = 'Date', ascending = False)
# test

In [None]:
assert False

In [None]:
original_df = pd.read_csv(f'{data_dir}/roto-files/cleaned/Combined.csv')
print(original_df.shape)
print(original_df.columns)

In [None]:
columns = ['Player', 'Date', 'FDP', 'FDSal']
df = original_df[[i for i in columns]]
# print(df.info())

df_none = df[(df.FDP.isnull()) | (df.FDSal.isnull())]

pbar = tqdm(df_none.iterrows(), total = len(df_none), desc = 'Filling in NA Values')

for idx, row in pbar:
    player = row.Player
    player_df = df[df.Player == player]
    if str(row.FDP) == 'nan':
        mean_fdp = player_df.FDP.mean()
        if str(mean_fdp) == 'nan':
            mean_fdp = None
        else:
            mean_fdp = round(mean_fdp, 2)
        df.loc[idx, 'FDP'] = mean_fdp
    
    if str(row.FDSal) == 'nan':
        mean_fdsal = player_df.FDSal.mean()
        if str(mean_fdsal) == 'nan':
            mean_fdsal = None
        else:
            mean_fdsal = round(mean_fdsal, 2)
        df.loc[idx, 'FDSal'] = mean_fdsal          
pbar.close()

df.dropna(inplace = True)

In [None]:
def create_timestep(df, columns, timestep):
    df.Date = pd.to_datetime(df.Date)
    df_copy = df.copy()
    pieces = []
    unique_players = df.Player.unique()
    pbar = tqdm(unique_players)
    for player in pbar: 
        pbar.set_description(f'Shifting the DF w/ timestep {timestep}: {player}')
        player_df = df[df.Player == player]
        player_df = player_df.sort_values(by = 'Date', ascending = True)
        unique_years = df.Date.dt.strftime('%y').unique()
        for year in unique_years:
            player_year_df = player_df[player_df.Date.dt.strftime('%y') == year]
            if len(player_year_df) == 0:
                continue           
            for column in columns:
                player_year_df[column] = StandardScaler().fit_transform(player_year_df[column].values.reshape(-1,1))
                for step in range(1, timestep + 1):
                    player_year_df[f'{column}_{step}'] = player_year_df[column].shift(-step)
            pieces.append(player_year_df)
        break
    pbar.close()
    full_df = pd.concat(pieces)
    full_df.sort_values(by = 'Date', inplace = True)
    full_df.to_csv(f'{data_dir}/roto-files/shifted/RotoShifted_{timestep}.csv', index = False)
    return full_df


df_dict = {}
for timestep in range(1, 21):       
    df_dict[timestep] = create_timestep(df, ['FDP'], timestep) 
    break

In [None]:
# df_dict[1]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


df_split_dict = {}
for timestep, df in df_dict.items():
    X_df = df[[i for i in df.columns if '_' in i]]
    Y_df = df[['FDP']]

In [None]:
shifted_df.info(), shifted_df.shape

In [None]:
len(shifted_df), len(df[df.Player == 'Drew Eubanks'])

In [None]:
# shifted_df

In [None]:
# shifted_df.sort_values(by = 'Date').head(25)

In [None]:
# test_df = df[df.Player =='Drew Eubanks'].sort_values(by = 'Date')
# test_df[test_df.FDP == 9.6]

In [None]:
assert False