In [1]:
import pandas as pd
import os
import gc
from tqdm import tqdm

import pickle
import numpy as np
from functools import reduce
from datetime import datetime, timedelta
from typing import Callable, Dict, List, Tuple

def load_obj(name):
    """Unpickle and return the object stored at ``../input/baseball/<name>.pkl``.

    NOTE(review): ``pickle.load`` can execute code embedded in the file, so
    this must only be pointed at trusted artifacts.
    """
    path = "../input/baseball/" + name + ".pkl"
    with open(path, "rb") as handle:
        obj = pickle.load(handle)
    return obj

In [2]:
def get_factorize(vitrine: pd.DataFrame, field: str) -> pd.DataFrame:
    """Overwrite column ``field`` with its integer factorization codes.

    The frame is modified in place and also returned. Codes are assigned in
    order of first appearance; missing values receive ``-1``
    (``pd.factorize`` default).
    """
    codes = pd.factorize(vitrine[field])[0]
    vitrine[field] = codes
    return vitrine


def decrease_mem_consuming(
    features: pd.DataFrame, excluding_fields: List[str] = ["id", "date"]
) -> pd.DataFrame:
    """Return a copy of ``features`` with 64-bit numeric columns narrowed.

    ``float64`` columns become ``float32`` and ``int64`` columns become
    ``int32``. Columns named in ``excluding_fields`` and columns of any other
    dtype are left untouched.
    """
    downcasts = (("float64", "float32"), ("int64", "int32"))

    new_types = {}
    for column in features.columns:
        if column in excluding_fields:
            continue
        column_dtype = features[column].dtype
        for wide, narrow in downcasts:
            if column_dtype == wide:
                new_types[column] = narrow
                break

    return features.astype(new_types)




# Pre-computed artifacts (pickles) used at inference time.
d = load_obj('prod4_1f')  # looked up per player id in get_features
models = load_obj('full_target_1_354')  # target name -> dict with 'model' and 'features'
names_nec_fields = load_obj('names_nec_fields')  # feature group -> required column names


# Static competition tables.
pls = pd.read_csv('../input/mlb-player-digital-engagement-forecasting/players.csv')

sns = decrease_mem_consuming(
    pd.read_csv('../input/mlb-player-digital-engagement-forecasting/seasons.csv')
    .rename(columns={"seasonId": "season"})
)


# Teams: drop descriptive columns, rename the key, and integer-encode the location.
tms = (
    pd.read_csv('../input/mlb-player-digital-engagement-forecasting/teams.csv')
    .drop(
        columns=[
            "name",
            "teamName",
            "teamCode",
            "shortName",
            "abbreviation",
            "venueId",
            "venueName",
            "leagueName",
            "divisionName",
            "divisionId",
        ]
    )
    .rename(columns={"id": "team_id"})
)
tms = decrease_mem_consuming(get_factorize(tms, "locationName"))

In [3]:
def extend_specific_field(df: pd.DataFrame, field_name: str) -> list:
    """Flatten a column of JSON-encoded lists into one Python list.

    Rows whose ``field_name`` value is NaN are skipped. Every remaining cell is
    evaluated to a list and the lists are concatenated in row order.

    Args:
        df: frame holding the nested column.
        field_name: name of the column whose cells are JSON list strings.

    Returns:
        A flat list with the elements of every non-missing cell.
    """
    # `x == x` is False only for NaN, so this keeps the rows that carry a payload.
    df = df.query(f"{field_name} == {field_name}")

    # JSON literals that are not Python names; `null` maps to NaN so that
    # missing values become NaN once the records are turned into a DataFrame.
    json_literals = {"null": np.nan, "true": True, "false": False}

    res = []
    for el in df[field_name]:
        # NOTE(review): eval() on file contents is unsafe for untrusted data.
        # It is kept (rather than json.loads) to preserve the null -> NaN
        # mapping; builtins are withheld so only literals can be evaluated.
        res += eval(el, {"__builtins__": {}}, json_literals)
    return res


def get_df_from_extend_field(
    train: pd.DataFrame, field_name: str, names: List[str], drop_list: List[str]
) -> pd.DataFrame:
    """Expand the nested column ``field_name`` of ``train`` into a flat frame.

    Columns listed in ``drop_list`` are removed first; afterwards, when
    ``names`` is not an empty list, the remaining columns are renamed
    positionally to ``names``.
    """
    records = extend_specific_field(train, field_name)
    frame = pd.DataFrame(records)
    if len(drop_list) > 0:
        frame = frame.drop(drop_list, axis=1)
    if names != []:
        frame.columns = names
    return frame


def get_object_types(df: pd.DataFrame) -> List[str]:
    """List the object-dtype columns of ``df``, excluding ``"date"``.

    The order of the returned names is unspecified (they pass through a set).
    """
    object_columns = {name for name in df.columns if df[name].dtype == object}
    return list(object_columns - {"date"})



def get_part_of_vitrine(
    df: pd.DataFrame,
    vitrine: pd.DataFrame,
    merge_fields: List[str],
    nec_fields: List[str],
) -> pd.DataFrame:
    """Left-join ``df`` onto ``vitrine`` and collapse to one row per (id, date).

    After the join, infinities and missing values are replaced by ``-1``, rows
    sharing the same ``id``/``date`` are summed, and the columns that came from
    ``df`` are cast back to their (memory-reduced) dtypes. Only ``nec_fields``
    are returned.
    """
    df = decrease_mem_consuming(df)
    types = df.dtypes.to_dict()

    merged = (
        pd.merge(vitrine, df, on=merge_fields, how="left")
        .replace([np.inf, -np.inf], np.nan)
        .fillna(-1)
        .groupby(["id", "date"])
        .sum()
        .reset_index()
        .astype(types)
    )

    return merged[nec_fields]


def generate_rosters(train: pd.DataFrame, vitrine: pd.DataFrame) -> pd.DataFrame:
    """Build the roster features (``team_id``, ``status``) for each (id, date).

    When the first ``rosters`` cell of ``train`` is missing, every roster field
    is created as NaN instead; the result always has missing values filled
    with ``-1``.
    """
    fields = names_nec_fields['rosters']

    first_payload = train['rosters'].iloc[0]
    if first_payload == first_payload:  # False only when the cell is NaN
        rosters = get_df_from_extend_field(
            train, "rosters", ["id", "date", "team_id", "status"], ["status"]
        )
        rosters = get_factorize(rosters, "status")
        # NOTE(review): these codes depend on the order in which statuses first
        # appear in this batch -- confirm they line up with the training encoding.
        status_remap = {
            4: 3,  # Injured 7-Day -> Injured 10-Day
            5: 0,
            6: 0,
            7: 0,
            8: 0,
            9: 0,
            10: 0,
        }
        rosters["status"] = rosters["status"].replace(status_remap)
    else:
        rosters = vitrine.copy()
        for col in fields:
            rosters[col] = np.nan

    rosters = rosters[['id', 'date'] + fields]
    features = get_part_of_vitrine(
        rosters, vitrine, ["id", "date"], ["id", "date", "team_id", "status"]
    )
    return features.fillna(-1)

def prepare_games(df: pd.DataFrame) -> pd.DataFrame:
    """Reshape a per-game frame into one row per team per game.

    Each input row (one game, with ``home*``/``away*`` columns) yields a home
    row (``is_home == 1``) and an away row (``is_home == 0``) with team-neutral
    column names. Games that have a ``resumeDate`` get an extra row dated at
    the resume date (``is_resume == 2``); their original rows keep the game
    date, are marked ``is_resume == 1`` and have their outcome columns set to
    ``-1``. All other rows get ``is_resume == -1``.

    Note: ``df`` is modified in place (casts and added columns) before the
    reshaped copy is built.
    """
    df["homeWinner"] = df["homeWinner"].astype(int)
    df["awayWinner"] = df["awayWinner"].astype(int)
    df["isTie"] = df["isTie"].fillna(False)
    df["isTie"] = df["isTie"].astype(int)

    # Home-relative ratios; a zero denominator produces inf/NaN here.
    df["rel_score"] = df["homeScore"] / df["awayScore"]
    df["rel_win_pct"] = df["homeWinPct"] / df["awayWinPct"]
    df["rel_wins"] = df["homeWins"] / df["awayWins"]
    df["rel_losses"] = df["homeLosses"] / df["awayLosses"]

    # Keep both team ids under stable names before home/away columns are renamed.
    df["home_team"] = df["homeId"]
    df["away_team"] = df["awayId"]

    # Home-team view: drop the away columns, rename home columns to neutral names.
    df_1 = df.drop(
        ["awayId", "awayWins", "awayLosses", "awayWinPct", "awayWinner", "awayScore"],
        axis=1,
    )
    df_1 = df_1.rename(
        columns={
            "homeId": "team_id",
            "homeWins": "count_wins_on_season",
            "homeLosses": "count_losses_on_season",
            "homeWinPct": "win_pct",
            "homeWinner": "win",
            "homeScore": "score",
            "gameDate": "date",
        }
    )
    df_1.loc[:, "is_home"] = 1

    # Away-team view: the mirror image of the block above.
    df_2 = df.drop(
        ["homeId", "homeWins", "homeLosses", "homeWinPct", "homeWinner", "homeScore"],
        axis=1,
    )
    df_2 = df_2.rename(
        columns={
            "awayId": "team_id",
            "awayWins": "count_wins_on_season",
            "awayLosses": "count_losses_on_season",
            "awayWinPct": "win_pct",
            "awayWinner": "win",
            "awayScore": "score",
            "gameDate": "date",
        }
    )
    df_2.loc[:, "is_home"] = 0

    # The ratios were computed home/away; invert them for the away rows.
    df_2["rel_score"] = 1 / df_2["rel_score"]
    df_2["rel_win_pct"] = 1 / df_2["rel_win_pct"]
    df_2["rel_wins"] = 1 / df_2["rel_wins"]
    df_2["rel_losses"] = 1 / df_2["rel_losses"]

    res = pd.concat([df_1, df_2]).reset_index(drop=True)

    # Rows with a non-NaN resumeDate (x == x is False only for NaN).
    index_resumed = res.query("resumeDate == resumeDate").index

    # Copy of the resumed rows, re-dated to the first 10 characters of resumeDate.
    df_3 = res.loc[index_resumed, :]
    df_3 = df_3.drop(["date"], axis=1)
    df_3 = df_3.rename(columns={"resumeDate": "date"})
    df_3["date"] = df_3["date"].apply(lambda x: x[:10])
    df_3["is_resume"] = 2

    # On the original game date the outcome columns of resumed games are blanked.
    for name in [
        "isTie",
        "count_wins_on_season",
        "count_losses_on_season",
        "win_pct",
        "win",
        "score",
        "rel_score",
        "rel_win_pct",
        "rel_wins",
        "rel_losses",
    ]:
        res.loc[index_resumed, name] = -1

    res.loc[index_resumed, "is_resume"] = 1
    res["is_resume"] = res["is_resume"].fillna(-1)

    # Append the resume-date rows after the originals have been blanked.
    res = pd.concat([res, df_3]).reset_index(drop=True)

    res.loc[:, "is_game"] = 1

    res = res.rename(columns={"gamePk": "game_id"})

    # Fix the output column set and order.
    res = res[
        [
            "team_id",
            "game_id",
            "home_team",
            "away_team",
            "date",
            "is_game",
            "is_resume",
            "win",
            "score",
            "is_home",
            "win_pct",
            "count_wins_on_season",
            "count_losses_on_season",
            "gameType",
            "codedGameState",
            "detailedGameState",
            "isTie",
            "gameNumber",
            "doubleHeader",
            "gamesInSeries",
            "rel_score",
            "rel_win_pct",
            "rel_wins",
            "rel_losses",
        ]
    ]
    return res

def prepare_games_field(vitrine: pd.DataFrame) -> pd.DataFrame:
    """Encode the categorical game columns and fill missing values with ``-1``.

    ``gameType`` values ``"F"`` and ``"W"`` are first merged into ``"D"`` and
    ``"L"``; the categorical columns are then integer-encoded, every listed
    column has its missing values set to ``-1``, and four count-like columns
    are cast to ``int``. The frame is modified in place and returned.
    """
    vitrine["gameType"] = vitrine["gameType"].replace({"F": "D", "W": "L"})
    for categorical in (
        "gameType",
        "codedGameState",
        "detailedGameState",
        "doubleHeader",
    ):
        vitrine = get_factorize(vitrine, categorical)

    columns_to_fill = (
        "is_game",
        "gameType",
        "codedGameState",
        "detailedGameState",
        "isTie",
        "doubleHeader",
        "is_resume",
        "win",
        "score",
        "is_home",
        "win_pct",
        "count_wins_on_season",
        "count_losses_on_season",
        "gameNumber",
        "gamesInSeries",
        "rel_score",
        "rel_win_pct",
        "rel_wins",
        "rel_losses",
        "home_team",
        "away_team",
    )
    for column in columns_to_fill:
        vitrine[column] = vitrine[column].fillna(-1)

    integer_columns = (
        "is_resume",
        "count_wins_on_season",
        "count_losses_on_season",
        "gamesInSeries",
    )
    for column in integer_columns:
        vitrine[column] = vitrine[column].astype(int)
    return vitrine


def generate_games(train: pd.DataFrame, vitrine: pd.DataFrame) -> pd.DataFrame:
    """Build the per-player game features for each (id, date).

    When ``train['games']`` carries a payload, the games are reshaped to one
    row per team, joined onto ``vitrine`` by ``team_id``/``date`` and the
    opponent is stored in ``enemy_team`` (``home_team``/``away_team`` are
    dropped). Otherwise every game feature, including ``game_id``, is created
    as missing.

    Returns:
        Frame with ``id``, ``date``, ``game_id``, ``team_id`` plus the required
        game fields, missing values filled with ``-1``.
    """
    fields = names_nec_fields['games']
    nec_fields = list(set(fields) - set(["home_team", "away_team"])) + ["enemy_team"]
    if train['games'].iloc[0] == train['games'].iloc[0]:  # False only for NaN
        games = get_df_from_extend_field(
            train,
            "games",
            [],
            [],
        )
        games = prepare_games(games)
        games = prepare_games_field(games)

        features = get_part_of_vitrine(
            games,
            vitrine,
            ["team_id", "date"],
            ['id', 'date', 'game_id', 'team_id'] + fields
        )

        # The opponent is the away team for home rows and the home team for away rows.
        features.loc[:, "enemy_team"] = -1
        is_home_index = features.query("is_home == 1").index
        not_is_home_index = features.query("is_home == 0").index
        features.loc[is_home_index, "enemy_team"] = features.loc[is_home_index, "away_team"]
        features.loc[not_is_home_index, "enemy_team"] = features.loc[
            not_is_home_index, "home_team"
        ]
        features = features.drop(["home_team", "away_team"], axis=1)
    else:
        features = vitrine.copy()
        for col in nec_fields:
            features[col] = np.nan
        # On this path nothing has produced a game_id yet, but the selection
        # below requires it; without this a day with no games raises KeyError.
        if "game_id" not in features.columns:
            features["game_id"] = np.nan
    features = features[['id', 'date', 'game_id', 'team_id'] + nec_fields]

    return features.fillna(-1)


def prepare_player_box_score(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise the player box-score frame.

    Renames the key columns to ``id``/``date``/``game_id``, evaluates
    ``positionCode``, parses ``jerseyNum`` (a fixed set of jersey numbers is
    collapsed to ``-1``), integer-encodes ``positionType`` and casts the stat
    columns to ``int`` with ``-1`` for missing values. The key columns come
    first in the result; the order of the remaining columns is unspecified.
    """
    key_columns = ["id", "date", "game_id"]
    df = df.rename(columns={"gameDate": "date", "playerId": "id", "gamePk": "game_id"})
    names = list(set(df.columns) - set(key_columns))

    # NOTE(review): eval() on data values -- acceptable only for trusted input.
    df["positionCode"] = df["positionCode"].apply(eval)
    df["jerseyNum"] = df["jerseyNum"].apply(eval_str)

    # Jersey numbers collapsed into the "unknown" bucket (-1).
    collapsed_jerseys = [69.0, 72.0, 73.0, 75.0, 76.0, 78.0, 79.0] + [
        float(number) for number in range(80, 97)
    ]
    df["jerseyNum"] = df["jerseyNum"].replace(collapsed_jerseys, -1)

    df = get_factorize(df, "positionType")

    integer_columns = (
        "flyOutsPitching", "gamesPlayedBatting", "sacFliesPitching", "blownSaves",
        "saveOpportunities", "assists", "putOuts", "sacBuntsPitching",
        "hits", "groundOutsPitching", "doubles", "leftOnBase",
        "gamesFinishedPitching", "balls", "strikes", "pickoffs",
        "hitByPitch", "lossesPitching", "gamesStartedPitching", "inheritedRunnersScored",
        "wildPitches", "atBatsPitching", "sacBunts", "strikeOutsPitching",
        "catchersInterference", "hitsPitching", "catchersInterferencePitching", "runsScored",
        "baseOnBalls", "gamesPlayedPitching", "errors", "rbi",
        "rbiPitching", "balks", "caughtStealing", "shutoutsPitching",
        "groundIntoTriplePlay", "plateAppearances", "hitBatsmen", "inningsPitched",
        "pickoffsPitching", "pitchesThrown", "groundIntoDoublePlay", "flyOuts",
        "homeRunsPitching", "homeRuns", "chances", "stolenBases",
        "airOutsPitching", "outsPitching", "caughtStealingPitching", "holds",
        "strikeOuts", "hitByPitchPitching", "runsPitching", "intentionalWalks",
        "jerseyNum", "totalBases", "stolenBasesPitching", "saves",
        "intentionalWalksPitching", "inheritedRunners", "battersFaced", "groundOuts",
        "triples", "earnedRuns", "battingOrder", "baseOnBallsPitching",
        "doublesPitching", "sacFlies", "triplesPitching", "winsPitching",
        "atBats", "completeGamesPitching",
    )
    for column in integer_columns:
        df[column] = df[column].fillna(-1).astype(int)

    return df.loc[:, key_columns + names]

def eval_str(x):
    """Parse a string into an ``int``; return NaN for ``""`` and non-strings.

    NOTE(review): the string is passed to ``eval``, so this is only safe for
    trusted input.
    """
    if isinstance(x, str) and x != "":
        return int(eval(x))
    return np.nan

def generate_player_box(train: pd.DataFrame, vitrine: pd.DataFrame) -> pd.DataFrame:
    """Attach the player box-score features to each (id, date, game_id).

    When the first ``playerBoxScores`` cell of ``train`` is missing, the
    required fields are created as NaN instead. Missing values in the result
    are filled with ``-1``.
    """
    nec_fields = names_nec_fields['player_box']
    key_columns = ['id', 'date', 'game_id', 'team_id']

    first_payload = train['playerBoxScores'].iloc[0]
    if first_payload == first_payload:  # False only when the cell is NaN
        raw_box = get_df_from_extend_field(
            train,
            "playerBoxScores",
            [],
            ["gameTimeUTC", "teamName", "playerName", "positionName", "teamId"],
        )
        player_box = prepare_player_box_score(raw_box)
        features = get_part_of_vitrine(
            player_box,
            vitrine,
            ["id", "date", "game_id"],
            key_columns + nec_fields,
        )
    else:
        features = vitrine.copy()
        for col in nec_fields:
            features[col] = np.nan

    return features[key_columns + nec_fields].fillna(-1)

def prepare_team_box_score(df: pd.DataFrame) -> pd.DataFrame:
    """Rename the keys to ``team_id``/``game_id`` and suffix stats with ``_team``.

    The two key columns come first; the order of the suffixed stat columns is
    unspecified (they pass through a set).
    """
    keys = ["team_id", "game_id"]
    renamed = df.rename(columns={"gamePk": "game_id", "teamId": "team_id"})
    stat_columns = list(set(renamed.columns) - set(keys))

    result = renamed.loc[:, keys + stat_columns]
    result.columns = keys + [column + "_team" for column in stat_columns]
    return result

def generate_team_box(train: pd.DataFrame, vitrine: pd.DataFrame) -> pd.DataFrame:
    """Attach the team box-score features (``*_team``) by team and game.

    When the first ``teamBoxScores`` cell of ``train`` is missing, the required
    fields are created as NaN instead. Missing values in the result are filled
    with ``-1``.
    """
    nec_fields = names_nec_fields['team_box']

    first_payload = train['teamBoxScores'].iloc[0]
    if first_payload == first_payload:  # False only when the cell is NaN
        raw_box = get_df_from_extend_field(
            train,
            "teamBoxScores",
            [],
            ["home", "gameTimeUTC", "gameDate"],
        )
        team_box = prepare_team_box_score(raw_box)
        features = get_part_of_vitrine(
            team_box,
            vitrine,
            ["team_id", "game_id"],
            ["date", "id", "team_id", "game_id"] + nec_fields,
        )
    else:
        features = vitrine.copy()
        for col in nec_fields:
            features[col] = np.nan

    selected = features[['id', 'date', "team_id", 'game_id'] + nec_fields]
    return selected.fillna(-1)


def get_types_dict_rename(types: List[str], sub_date: str, sub_field: str) -> Dict:
    """Map every name in ``types`` to ``"<name>_<sub_date>_<sub_field>"``."""
    return {type_code: f"{type_code}_{sub_date}_{sub_field}" for type_code in types}

def add_transaction_features(
    vitrine: pd.DataFrame,
    df_i: pd.DataFrame,
    old_name_date: str = "date",
    sub_date: str = "simple",
) -> pd.DataFrame:
    """Join per-day transaction counts onto ``vitrine``.

    For the date column selected by ``old_name_date`` the one-hot transaction
    type columns of ``df_i`` are summed three ways -- per (date, fromTeamId),
    per (date, toTeamId) and per (date, playerId) -- and each set is
    left-joined onto ``vitrine`` under the names
    ``<type>_<sub_date>_{from,to,player}``.

    Args:
        vitrine: frame with ``date``, ``team_id`` and ``id`` columns.
        df_i: transactions with one indicator column per transaction type.
            Types absent from the frame are treated as all-zero.
        old_name_date: date column of ``df_i`` to aggregate on. Both ``"date"``
            and ``"i_date"`` select the transaction's own ``date`` column.
        sub_date: tag placed in the generated column names.

    Returns:
        ``vitrine`` with the 3 x 16 count columns appended.
    """
    # The transaction's own date is parked under "i_date" so another date
    # column can take the name "date". Asking for "date" itself must select
    # that parked column; otherwise no "date" column would exist afterwards.
    source_date = "i_date" if old_name_date == "date" else old_name_date
    df = df_i.rename(columns={"date": "i_date"}).rename(columns={source_date: "date"})

    keys = [["date", "team_id"], ["date", "team_id"], ["date", "id"]]
    names = ["from", "to", "player"]
    types = [
        "SFA",
        "TR",
        "NUM",
        "ASG",
        "DES",
        "CLW",
        "OUT",
        "REL",
        "SC",
        "OPT",
        "RTN",
        "SGN",
        "SE",
        "CU",
        "DFA",
        "RET",
    ]
    for t in types:
        if t not in df.columns:
            df[t] = 0
    for ind, second_field in enumerate(["fromTeamId", "toTeamId", "playerId"]):
        # Aggregate only the indicator columns: summing the whole frame would
        # also try to add up the string date columns.
        df_curr = df.groupby(["date", second_field])[types].sum().reset_index()
        df_curr = df_curr.astype({second_field: "int64"})
        df_curr = df_curr.rename(
            columns=get_types_dict_rename(types, sub_date, names[ind])
        )
        vitrine = pd.merge(
            vitrine,
            df_curr,
            left_on=keys[ind],
            right_on=["date", second_field],
            how="left",
        )
        # The right-hand key is redundant with team_id / id after the join.
        vitrine = vitrine.drop([second_field], axis=1)

    return vitrine

def generate_trans(train: pd.DataFrame, vitrine_i: pd.DataFrame) -> pd.DataFrame:
    """Attach transaction-count features to a copy of ``vitrine_i``.

    Transaction types are one-hot encoded and aggregated for three date
    columns (the transaction date, ``effectiveDate`` and ``resolutionDate``).
    When the first ``transactions`` cell of ``train`` is missing, the required
    fields are created as NaN instead. Missing values end up as ``-1``.
    """
    nec_fields = names_nec_fields['trans']
    vitrine = vitrine_i.copy()

    first_payload = train['transactions'].iloc[0]
    if first_payload == first_payload:  # False only when the cell is NaN
        trans = get_df_from_extend_field(
            train,
            "transactions",
            [],
            [
                "transactionId",
                "playerName",
                "fromTeamName",
                "toTeamName",
                "description",
                "typeDesc",
            ],
        )
        type_dummies = pd.get_dummies(trans["typeCode"])
        trans = pd.concat([trans, type_dummies], axis=1).drop(["typeCode"], axis=1)

        date_variants = (
            ("i_date", "simple"),
            ("effectiveDate", "eff"),
            ("resolutionDate", "resol"),
        )
        for date_type, sub_date in date_variants:
            vitrine = add_transaction_features(vitrine, trans, date_type, sub_date)

        key_columns = set(["team_id", "game_id", "id", "date"])
        for name in set(vitrine.columns) - key_columns:
            vitrine[name] = vitrine[name].fillna(-1).astype(int)
    else:
        for col in nec_fields:
            vitrine[col] = np.nan

    features = vitrine[['id', 'date', "team_id", 'game_id'] + nec_fields]
    return features.fillna(-1)






def preapre_standings(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise the standings frame into numeric columns.

    Renames the keys to ``date``/``team_id``, integer-encodes ``streakCode``,
    evaluates every remaining object column (``"-"``, ``"E"`` and missing
    values become ``-1``) and casts the leader/league count columns to ``int``.
    """
    df = df.rename(columns={"gameDate": "date", "teamId": "team_id"})
    df = get_factorize(df, "streakCode")
    df["wildCardLeader"] = df["wildCardLeader"].fillna("False")

    for name in get_object_types(df):
        as_text = df[name].replace(["-", "E"], "-1").fillna("-1")
        # NOTE(review): eval() on data values -- acceptable only for trusted input.
        df[name] = as_text.apply(eval)

    integer_columns = (
        "wildCardLeader",
        "divisionLeader",
        "divisionChamp",
        "alWins",
        "alLosses",
        "nlWins",
        "nlLosses",
    )
    for name in integer_columns:
        df[name] = df[name].fillna(-1).astype(int)

    return df



def generate_standings(train: pd.DataFrame, vitrine: pd.DataFrame) -> pd.DataFrame:
    """Attach the team standings features by ``team_id`` and ``date``.

    When the first ``standings`` cell of ``train`` is missing, the required
    fields are created as NaN instead. Missing values in the result are filled
    with ``-1``.
    """
    nec_fields = names_nec_fields['standings']

    first_payload = train['standings'].iloc[0]
    if first_payload == first_payload:  # False only when the cell is NaN
        raw_standings = get_df_from_extend_field(
            train,
            "standings",
            [],
            ["season", "teamName"],
        )
        standings = preapre_standings(raw_standings)
        features = get_part_of_vitrine(
            standings,
            vitrine,
            ["team_id", "date"],
            ["date", "id", "team_id", "game_id"] + nec_fields,
        )
    else:
        features = vitrine.copy()
        for col in nec_fields:
            features[col] = np.nan

    selected = features[['id', 'date', "team_id", 'game_id'] + nec_fields]
    return selected.fillna(-1)


def preapre_awards(df: pd.DataFrame) -> pd.DataFrame:
    """Rename the award keys, integer-encode ``awardId`` and add ``is_award = 1``."""
    awards = df.rename(columns={"awardDate": "date", "playerId": "id"})
    awards = get_factorize(awards, "awardId")
    awards["is_award"] = 1
    return awards


def generate_awards(train: pd.DataFrame, vitrine: pd.DataFrame) -> pd.DataFrame:
    """Attach the award features by player ``id`` and ``date``.

    When the first ``awards`` cell of ``train`` is missing, the required fields
    are created as NaN instead. Missing values in the result are filled with
    ``-1``.
    """
    nec_fields = names_nec_fields['awards']

    first_payload = train['awards'].iloc[0]
    if first_payload == first_payload:  # False only when the cell is NaN
        raw_awards = get_df_from_extend_field(
            train,
            "awards",
            [],
            ["awardName", "awardSeason", "playerName", "awardPlayerTeamId"],
        )
        awards = preapre_awards(raw_awards)
        features = get_part_of_vitrine(
            awards,
            vitrine,
            ["id", "date"],
            ["date", "id", "team_id", "game_id"] + nec_fields,
        )
    else:
        features = vitrine.copy()
        for col in nec_fields:
            features[col] = np.nan

    selected = features[['id', 'date', "team_id", 'game_id'] + nec_fields]
    return selected.fillna(-1)


def preapre_player_twit(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy with ``playerId`` -> ``id`` and ``numberOfFollowers`` -> ``count_follow_player``."""
    column_map = {"playerId": "id", "numberOfFollowers": "count_follow_player"}
    return df.rename(columns=column_map)


def generate_player_twit(train: pd.DataFrame, vitrine: pd.DataFrame) -> pd.DataFrame:
    """Return the player follower counts keyed by player id.

    Despite the annotation, the result is a dict of the form
    ``{id: {"id": ..., "count_follow_player": ...}}``; it is empty when the
    first ``playerTwitterFollowers`` cell of ``train`` is missing. ``vitrine``
    is not used.
    """
    first_payload = train['playerTwitterFollowers'].iloc[0]
    if first_payload != first_payload:  # True only when the cell is NaN
        return dict()

    raw_followers = get_df_from_extend_field(
        train,
        "playerTwitterFollowers",
        [],
        ["playerName", "accountName", "twitterHandle"],
    )
    player_twit = preapre_player_twit(raw_followers)
    player_twit = player_twit[['id', "count_follow_player"]]
    player_twit.index = player_twit['id']
    return player_twit.to_dict('index')
    
    
def preapre_team_twit(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy with ``teamId`` -> ``team_id`` and ``numberOfFollowers`` -> ``count_follow_team``."""
    column_map = {"teamId": "team_id", "numberOfFollowers": "count_follow_team"}
    return df.rename(columns=column_map)


def generate_team_twit(train: pd.DataFrame, vitrine: pd.DataFrame) -> Dict:
    """Return the team follower counts keyed by team id.

    Args:
        train: frame whose ``teamTwitterFollowers`` column holds the nested
            follower records.
        vitrine: unused; kept for signature parity with the other generators.

    Returns:
        ``{team_id: {"team_id": ..., "count_follow_team": ...}}``, or an empty
        dict when the first ``teamTwitterFollowers`` cell is missing.
    """
    # Guard on the column that is actually parsed below. Checking the player
    # column instead would parse a missing team payload, or skip a present one.
    first_payload = train['teamTwitterFollowers'].iloc[0]
    if first_payload == first_payload:  # False only when the cell is NaN
        team_twit = get_df_from_extend_field(
            train,
            "teamTwitterFollowers",
            [],
            [
                "teamName",
                "accountName",
                "twitterHandle",
            ],
        )
        team_twit = preapre_team_twit(team_twit)
        team_twit = team_twit[['team_id', "count_follow_team"]]
        team_twit.index = team_twit['team_id']
        team_twit = team_twit.to_dict('index')
        return team_twit
    else:
        return dict()
    
    
def add_time_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add age-related features (in 365-day years) derived from the date columns.

    Adds ``age``, ``year_after_debut``, ``debut_age`` and ``rel_mlb_age`` to
    ``df`` in place, then returns a copy without ``DOB`` and ``mlbDebutDate``.
    """
    current = pd.to_datetime(df["date"])
    birth = pd.to_datetime(df["DOB"])
    df["age"] = (current - birth).dt.days / 365

    debut = pd.to_datetime(df["mlbDebutDate"])
    df["year_after_debut"] = (current - debut).dt.days / 365
    df["debut_age"] = (debut - birth).dt.days / 365

    # Share of the player's life spent since the MLB debut.
    df["rel_mlb_age"] = df["year_after_debut"] / df["age"]

    return df.drop(["DOB", "mlbDebutDate"], axis=1)


def generate_players(vitrine_i: pd.DataFrame) -> pd.DataFrame:
    """Attach the static player attributes and age features by player ``id``.

    Works on copies of ``vitrine_i`` and of the module-level ``pls`` table.
    Descriptive columns are dropped, categorical columns are integer-encoded,
    and missing values are filled with ``-1`` before the age features are
    derived.
    """
    vitrine = vitrine_i.copy()

    players = (
        pls.copy()
        .drop(
            [
                "playerName",
                "birthCity",
                "birthStateProvince",
                "playerForTestSetAndFuturePreds",
            ],
            axis=1,
        )
        .rename(columns={"playerId": "id"})
    )
    for categorical in ("birthCountry", "primaryPositionName"):
        players = get_factorize(players, categorical)

    position_codes = (
        players["primaryPositionCode"].replace({"I": "11", "O": "0"}).fillna("-1")
    )
    # NOTE(review): eval() on data values -- acceptable only for trusted input.
    players["primaryPositionCode"] = position_codes.apply(eval)

    players = decrease_mem_consuming(players)
    # Dtypes to restore after the join; the two date columns are consumed by
    # add_time_features and therefore excluded.
    types = players.dtypes.to_dict()
    types.pop("DOB")
    types.pop("mlbDebutDate")

    merged = pd.merge(
        vitrine[["id", "date", "game_id", "team_id"]], players, on=["id"], how="left"
    )
    merged = merged.replace([np.inf, -np.inf], np.nan).fillna(-1)
    merged = add_time_features(merged)
    return merged.astype(types)


def triple_date_cats(vitrine: pd.DataFrame) -> pd.DataFrame:
    """Add ``triple_dates_cats``: the season phase of each row's ``date``.

    Codes: 0 none, 1 pre-season, 2 regular season, 3 post-season, 4 all-star
    date. Later rules overwrite earlier ones. The frame is modified in place.
    """
    rules = (
        ("preSeasonStartDate <= date <= preSeasonEndDate", 1),
        ("regularSeasonStartDate <= date <= regularSeasonEndDate", 2),
        ("postSeasonStartDate <= date <= postSeasonEndDate", 3),
        ("date == allStarDate", 4),
    )
    # Resolve every rule before the column is written.
    matches = [(vitrine.query(expression).index, code) for expression, code in rules]

    vitrine.loc[:, "triple_dates_cats"] = 0
    for matched_index, code in matches:
        vitrine.loc[matched_index, "triple_dates_cats"] = code

    return vitrine


def triple_date_cats_2(vitrine: pd.DataFrame) -> pd.DataFrame:
    """Add ``triple_dates_cats_2``: season phase with the regular season split in halves.

    Codes: 0 none, 1 pre-season, 2 regular season first half, 3 regular season
    second half, 4 post-season, 5 all-star date. Later rules overwrite earlier
    ones. The frame is modified in place.
    """
    rules = (
        ("preSeasonStartDate <= date <= preSeasonEndDate", 1),
        ("regularSeasonStartDate <= date <= lastDate1stHalf", 2),
        ("firstDate2ndHalf <= date <= regularSeasonEndDate", 3),
        ("postSeasonStartDate <= date <= postSeasonEndDate", 4),
        ("date == allStarDate", 5),
    )
    # Resolve every rule before the column is written.
    matches = [(vitrine.query(expression).index, code) for expression, code in rules]

    vitrine.loc[:, "triple_dates_cats_2"] = 0
    for matched_index, code in matches:
        vitrine.loc[matched_index, "triple_dates_cats_2"] = code

    return vitrine


def double_date_cats(vitrine: pd.DataFrame) -> pd.DataFrame:
    """Add ``double_dates_cats``: coarse season phase of each row's ``date``.

    Codes: 0 none, 1 pre-season, 2 within the season start/end dates, 3
    all-star date. Later rules overwrite earlier ones. The frame is modified
    in place.
    """
    rules = (
        ("preSeasonStartDate <= date <= preSeasonEndDate", 1),
        ("seasonStartDate <= date <= seasonEndDate", 2),
        ("date == allStarDate", 3),
    )
    # Resolve every rule before the column is written.
    matches = [(vitrine.query(expression).index, code) for expression, code in rules]

    vitrine.loc[:, "double_dates_cats"] = 0
    for matched_index, code in matches:
        vitrine.loc[matched_index, "double_dates_cats"] = code

    return vitrine


def double_date_cats_2(vitrine: pd.DataFrame) -> pd.DataFrame:
    """Add ``double_dates_cats_2``: coarse season phase with the season split in halves.

    Codes: 0 none, 1 pre-season, 2 season start to end of first half, 3 start
    of second half to season end, 4 all-star date. Later rules overwrite
    earlier ones. The frame is modified in place.
    """
    rules = (
        ("preSeasonStartDate <= date <= preSeasonEndDate", 1),
        ("seasonStartDate <= date <= lastDate1stHalf", 2),
        ("firstDate2ndHalf <= date <= seasonEndDate", 3),
        ("date == allStarDate", 4),
    )
    # Resolve every rule before the column is written.
    matches = [(vitrine.query(expression).index, code) for expression, code in rules]

    vitrine.loc[:, "double_dates_cats_2"] = 0
    for matched_index, code in matches:
        vitrine.loc[matched_index, "double_dates_cats_2"] = code

    return vitrine


def add_seasons_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add the four season-phase category columns and drop the raw season columns."""
    categorizers = (
        triple_date_cats,
        triple_date_cats_2,
        double_date_cats,
        double_date_cats_2,
    )
    for categorize in categorizers:
        df = categorize(df)

    raw_season_columns = [
        "season",
        "seasonStartDate",
        "seasonEndDate",
        "preSeasonStartDate",
        "preSeasonEndDate",
        "regularSeasonStartDate",
        "regularSeasonEndDate",
        "lastDate1stHalf",
        "allStarDate",
        "firstDate2ndHalf",
        "postSeasonStartDate",
        "postSeasonEndDate",
    ]
    return df.drop(columns=raw_season_columns)


def generate_seasons(vitrine_i: pd.DataFrame) -> pd.DataFrame:
    """Attach the season-phase categories for each row's ``date``.

    The season is taken from the first four characters of ``date`` and joined
    against the module-level ``sns`` table. The join is an inner join, so rows
    whose season is absent from that table are not returned.
    """
    seasons = sns
    keyed = vitrine_i.copy().loc[:, ["id", "date", "game_id", "team_id"]]
    keyed["season"] = keyed["date"].apply(lambda x: int(x[:4]))

    merged = (
        pd.merge(keyed, seasons, on=["season"])
        .replace([np.inf, -np.inf], np.nan)
        .fillna(-1)
    )

    return add_seasons_features(merged)


def generate_teams(vitrine: pd.DataFrame) -> pd.DataFrame:
    """Attach the static team attributes from the module-level ``tms`` table by ``team_id``.

    Missing values are filled with ``-1`` and the team columns are cast back to
    the dtypes they have in ``tms`` before the frame is memory-reduced.
    """
    teams = tms.copy()
    # Dtypes to restore after the join; the join key keeps whatever dtype it gets.
    types = teams.dtypes.to_dict()
    types.pop("team_id")

    merged = pd.merge(
        vitrine[["id", "date", "game_id", "team_id"]], teams, on=["team_id"], how="left"
    )
    merged = merged.replace([np.inf, -np.inf], np.nan).fillna(-1).astype(types)

    return decrease_mem_consuming(merged)



def day_before(val):
    """Return the ``YYYY-MM-DD`` date string one day before ``val``."""
    date_format = '%Y-%m-%d'
    previous_day = datetime.strptime(val, date_format) - timedelta(days=1)
    return previous_day.strftime(date_format)

def get_base(df: pd.DataFrame) -> pd.DataFrame:
    """Split ``date_playerId`` (``YYYYMMDD_<playerId>``) into ``date`` and ``id``.

    ``date`` is set to the day *before* the encoded date, formatted
    ``YYYY-MM-DD``; ``id`` is the integer player id. The frame is modified in
    place and returned.
    """

    def _previous_day(key):
        raw_date = key.split("_")[0]
        iso_date = datetime.strftime(datetime.strptime(raw_date, '%Y%m%d'), '%Y-%m-%d')
        return day_before(iso_date)

    df["date"] = df["date_playerId"].apply(_previous_day)
    df["id"] = df["date_playerId"].apply(lambda key: int(key.split("_")[1]))
    return df


def get_features(df, train):
    """Build the feature frame for one test day and return the predictions.

    Args:
        df: the sample prediction frame (``date_playerId`` plus
            ``target1``..``target4``). It is extended in place with ``date``
            and ``id`` columns.
        train: the day's data frame with the nested columns (``rosters``,
            ``games``, ``playerBoxScores``, ...).

    Returns:
        Frame indexed by ``date`` with ``date_playerId`` and
        ``target1``..``target4``. All four targets start from the per-player
        values stored in ``d`` (0 for players not present there); ``target1``
        is then replaced by the model prediction.
    """
    df = get_base(df)
    df = df.drop(["target1", "target2", "target3", "target4"], axis=1).reset_index(
        drop=True
    )
    base = df[['id', 'date']]
    key_columns = ['id', 'date', 'team_id', 'game_id']

    # Each generator receives the keys produced by the previous one, so
    # team_id and game_id propagate down the chain.
    rosters = generate_rosters(train, base)
    games = generate_games(train, rosters[['id', 'date', 'team_id']])
    player_box = generate_player_box(train, games[key_columns])
    team_box = generate_team_box(train, player_box[key_columns])
    trans = generate_trans(train, team_box[key_columns])
    standings = generate_standings(train, trans[key_columns])
    awards = generate_awards(train, standings[key_columns])

    # The twitter follower tables are never joined into the feature frame,
    # so they are not extracted here.
    players = generate_players(awards[key_columns])
    seasons = generate_seasons(awards[key_columns])
    teams = generate_teams(awards[key_columns])

    # Per-player lagged target statistics; players missing from `d` keep 0.
    n_targets = 4
    lag_targets = np.zeros((n_targets, len(df)), dtype=np.float32)
    for i, player_id in enumerate(df['id'].values):
        lag_1 = d.get(player_id)
        if lag_1 is not None:
            for t in range(n_targets):
                lag_targets[t, i] = lag_1[f'target_{t + 1}_ma_med_100']
    for t in range(n_targets):
        df[f'target{t + 1}'] = lag_targets[t]

    for right_df in [rosters, games, player_box, team_box, trans, standings, awards, players, seasons, teams]:
        df = pd.merge(
            df,
            right_df,
            left_on=['id', 'date'],
            right_on=['id', 'date'],
            how='left',
            suffixes=("", "_xx"),
        )
        # Columns already present on the left win; the right-hand duplicates are discarded.
        drop_names = list(filter(lambda x: x.find("_xx") != -1, df.columns))
        df = df.drop(drop_names, axis=1)
    df = df.fillna(-1)

    # Only target1 is predicted by a model; the other targets keep the lag values.
    for target_name, val in models.items():
        if target_name in ['target1']:
            df[target_name] = val['model'].predict(df.loc[:, val['features']])
    df.index = df.date
    return df.loc[:, ['date_playerId'] + [f'target{i+1}' for i in range(4)]]

In [4]:
import mlb

# Competition API: the environment yields one (test data, sample prediction)
# pair per date in the test set and expects a prediction for each.
env = mlb.make_env()
iter_test = env.iter_test()

for test_df, sample_prediction_df in tqdm(iter_test):
    sample_prediction_df = get_features(sample_prediction_df, test_df)
    env.predict(sample_prediction_df)

0it [00:00, ?it/s]

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


5it [00:06,  1.37s/it]
