In [42]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import os
import numpy as np
import pickle
from tqdm import  tqdm

pd.options.mode.chained_assignment = None
pd.options.display.min_rows = 50

In [43]:
data_dir = f'/home/samuel-linux/PycharmProjects/Personal/FantasyBasketball/Data'
pickle_dir = f'{data_dir}/pickles'

original_df = pd.read_csv(f'{data_dir}/roto-files/cleaned/Combined.csv')
print(original_df.shape)
print(original_df.columns)

(78023, 33)
Index(['GID', 'Player', 'Date', 'Team', 'Against', 'Home', 'GameID',
       'GameTime', 'TeamPoints', 'OppPoints', 'Starting', 'Minutes',
       'Appeared', 'Active', 'FDP', 'DKP', 'DDP', 'YHP', 'Stats', 'DoubleD',
       'TripleD', 'FDSal', 'FDChange', 'DKSal', 'DKChange', 'DDSal',
       'DDChange', 'YHSal', 'YHChange', 'FDPos', 'DKPos', 'DDPos', 'YHPos'],
      dtype='object')


In [44]:
columns = ['Player', 'Date', 'FDP', 'FDSal']
df = original_df[[i for i in columns]]
# print(df.info())

df_none = df[(df.FDP.isnull()) | (df.FDSal.isnull())]

pbar = tqdm(df_none.iterrows(), total = len(df_none), desc = 'Filling in NA Values')

for idx, row in pbar:
    player = row.Player
    player_df = df[df.Player == player]
    if str(row.FDP) == 'nan':
        mean_fdp = player_df.FDP.mean()
        if str(mean_fdp) == 'nan':
            mean_fdp = None
        else:
            mean_fdp = round(mean_fdp, 2)
        df.loc[idx, 'FDP'] = mean_fdp
    
    if str(row.FDSal) == 'nan':
        mean_fdsal = player_df.FDSal.mean()
        if str(mean_fdsal) == 'nan':
            mean_fdsal = None
        else:
            mean_fdsal = round(mean_fdsal, 2)
        df.loc[idx, 'FDSal'] = mean_fdsal          
pbar.close()

df.dropna(inplace = True)

Filling in NA Values: 100%|██████████| 1888/1888 [00:08<00:00, 232.21it/s]


In [49]:
def create_timestep(df, columns, timestep):
    df.Date = pd.to_datetime(df.Date)
    df_copy = df.copy()
    pieces = []
    unique_players = df.Player.unique()
    pbar = tqdm(unique_players)
    for player in pbar: 
        pbar.set_description(f'Shifting the DF w/ timestep {timestep}: {player}')
        player_df = df[df.Player == player]
        player_df = player_df.sort_values(by = 'Date', ascending = True)
        unique_years = df.Date.dt.strftime('%y').unique()
        for year in unique_years:
            player_year_df = player_df[player_df.Date.dt.strftime('%y') == year]
            if len(player_year_df) == 0:
                continue           
            for column in columns:
                player_year_df[column] = StandardScaler().fit_transform(player_year_df[column].values.reshape(-1,1))
                for step in range(1, timestep + 1):
                    player_year_df[f'{column}_{step}'] = player_year_df[column].shift(-step)
            pieces.append(player_year_df)
        break
    pbar.close()
    full_df = pd.concat(pieces)
    full_df.sort_values(by = 'Date', inplace = True)
    full_df.to_csv(f'shifted-data/RotoShifted_{timestep}.csv', index = False)
    return full_df


df_dict = {}
for timestep in range(1, 21):       
    df_dict[timestep] = create_timestep(df, ['FDP'], timestep) 
    break

Shifting the DF w/ timestep 1: Drew Eubanks:   0%|          | 0/689 [00:00<?, ?it/s]


In [52]:
# df_dict[1]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


df_split_dict = {}
for timestep, df in df_dict.items():
    X_df = df[[i for i in df.columns if '_' in i]]
    Y_df = df[['FDP']]

In [30]:
shifted_df.info(), shifted_df.shape

<class 'pandas.core.frame.DataFrame'>
Int64Index: 172 entries, 34221 to 77002
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Player  172 non-null    object        
 1   Date    172 non-null    datetime64[ns]
 2   FDP     172 non-null    float64       
 3   FDSal   172 non-null    float64       
 4   FDP_1   172 non-null    float64       
 5   FDP_2   172 non-null    float64       
 6   FDP_3   172 non-null    float64       
dtypes: datetime64[ns](1), float64(5), object(1)
memory usage: 10.8+ KB


(None, (172, 7))

In [31]:
len(shifted_df), len(df[df.Player == 'Drew Eubanks'])

(172, 172)

In [34]:
# shifted_df

In [None]:
# shifted_df.sort_values(by = 'Date').head(25)

In [None]:
# test_df = df[df.Player =='Drew Eubanks'].sort_values(by = 'Date')
# test_df[test_df.FDP == 9.6]

In [None]:
assert False

In [None]:


subset_df_copy = subset_df.copy()
pbar = tqdm(subset_df_copy.iterrows(), total = len(subset_df_copy), desc = 'Filling NA Values')
for idx, row in pbar:
    player = row.Player
    player_df = subset_df[subset_df.Player == player]
    
    fdp = row.FDP 
    fdsal = row.FDSal
    
    if fdp == None:
        mean = player_df.dropna(subset = 'FDP', axis = 1).FDP.mean()
        print(mean)
    
    if fdp == None: 
        pass
    
    pbar.update(1)
pbar.close()