In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for file_name in names_in_folder:
        print(os.path.join(folder, file_name))
        
from sklearn.linear_model import LogisticRegression
# Raise the column display limit for wide frames. The full option name is used:
# partial names are resolved by pattern matching and can become ambiguous if
# pandas later adds an option with a similar name.
pd.set_option("display.max_columns", 999)

In [None]:
# Directory holding the Stage 2 data files. It is used as a plain string prefix
# (DATA_PATH + file name), so the trailing slash is required.
DATA_PATH = '../input/ncaaw-march-mania-2021/WDataFiles_Stage2/'

## Feature engineering

In [None]:
# Load the regular-season compact results and discard the 'WLoc' column.
season_results_csv = DATA_PATH + 'WRegularSeasonCompactResults.csv'
df_season_results = pd.read_csv(season_results_csv).drop(columns=['WLoc'])

In [None]:
# Winning margin of each game: winner's score minus loser's score.
winner_points = df_season_results['WScore']
loser_points = df_season_results['LScore']
df_season_results['ScoreGap'] = winner_points - loser_points
df_season_results.head()

In [None]:
# Build one grouper per perspective (winning team / losing team) and reuse it.
by_winner = df_season_results.groupby(['Season', 'WTeamID'])
by_loser = df_season_results.groupby(['Season', 'LTeamID'])

# Number of wins per (Season, TeamID): non-null 'DayNum' entries in each group.
win_counts = by_winner.count().reset_index()
num_win = win_counts[['Season', 'WTeamID', 'DayNum']].rename(
    columns={"DayNum": "NumWins", "WTeamID": "TeamID"})

# Number of losses per (Season, TeamID).
loss_counts = by_loser.count().reset_index()
num_loss = loss_counts[['Season', 'LTeamID', 'DayNum']].rename(
    columns={"DayNum": "NumLosses", "LTeamID": "TeamID"})

# Mean score gap over the games a team won.
win_means = by_winner.mean().reset_index()
gap_win = win_means[['Season', 'WTeamID', 'ScoreGap']].rename(
    columns={"ScoreGap": "GapWins", "WTeamID": "TeamID"})

# Mean score gap over the games a team lost (still winner minus loser).
loss_means = by_loser.mean().reset_index()
gap_loss = loss_means[['Season', 'LTeamID', 'ScoreGap']].rename(
    columns={"ScoreGap": "GapLosses", "LTeamID": "TeamID"})

In [None]:
# (Season, TeamID) keys of every team that appears as a winner ...
winner_keys = df_season_results.groupby(
    ['Season', 'WTeamID']).count().reset_index()
df_features_season_w = winner_keys[['Season', 'WTeamID']].rename(
    columns={"WTeamID": "TeamID"})

# ... and of every team that appears as a loser.
loser_keys = df_season_results.groupby(
    ['Season', 'LTeamID']).count().reset_index()
df_features_season_l = loser_keys[['Season', 'LTeamID']].rename(
    columns={"LTeamID": "TeamID"})

In [None]:
# Union of the winner and loser key sets: one row per (Season, TeamID).
# `axis` is passed by keyword because pandas 2.0 made every pd.concat argument
# after the list of objects keyword-only; a positional 0 raises TypeError there.
df_features_season = (
    pd.concat([df_features_season_w, df_features_season_l], axis=0)
    .drop_duplicates()
    .sort_values(['Season', 'TeamID'])
    .reset_index(drop=True)
)

In [None]:
# Left-join each per-team table onto the key frame, in the same order as before:
# wins, losses, gap in wins, gap in losses.
for team_table in (num_win, num_loss, gap_win, gap_loss):
    df_features_season = df_features_season.merge(
        team_table, on=['Season', 'TeamID'], how='left')

In [None]:
# Inspect the merged frame; the left joins leave NaN wherever a team is missing
# from one of the joined tables.
df_features_season

In [None]:
# Replace every missing value in the feature frame with 0.
df_features_season = df_features_season.fillna(0)

In [None]:
# Total games per team and season.
games_played = df_features_season['NumWins'] + df_features_season['NumLosses']

# Share of games won.
df_features_season['WinRatio'] = df_features_season['NumWins'] / games_played

# Signed score margin: gaps in wins count positively, gaps in losses negatively,
# averaged over all games played.
signed_gap_total = (
    df_features_season['NumWins'] * df_features_season['GapWins'] -
    df_features_season['NumLosses'] * df_features_season['GapLosses'])
df_features_season['GapAvg'] = signed_gap_total / games_played

In [None]:
# Note: this binds a second name to the same DataFrame object; it is not a copy.
df_teams = df_features_season
# Print the column dtypes and non-null counts of the team feature table.
df_teams.info()

In [None]:
# Load the tournament seeds table and display it.
df_seeds = pd.read_csv(DATA_PATH + "WNCAATourneySeeds.csv")
df_seeds

In [None]:
def _seed_number(seed):
    """Return the integer formed by characters 1-2 of the raw seed string."""
    return int(seed[1:3])


# Overwrite the raw seed strings with their numeric part. This cell cannot be
# re-run on its own afterwards, because the column then holds integers.
df_seeds['Seed'] = df_seeds['Seed'].apply(_seed_number)
df_seeds.head()

In [None]:
# Attach the numeric seed to each (Season, TeamID); rows without a match in the
# seeds table keep NaN because this is a left join.
df_teams = pd.merge(df_teams, df_seeds, on=['Season', 'TeamID'], how='left')
df_teams.tail()

In [None]:
# Exponential of the seed, stored under the column name 'SeedEsp'.
seed_exponential = np.exp(df_teams['Seed'])
df_teams['SeedEsp'] = seed_exponential
df_teams

## Create train dataset

In [None]:
# Load the tournament compact results and keep only the columns used later.
tourney_results = pd.read_csv(DATA_PATH + 'WNCAATourneyCompactResults.csv')
df = tourney_results.drop(
    columns=['DayNum', 'NumOT', 'WLoc']).reset_index(drop=True)

df

#### Now we have only winning matches. Let's add losing games!

In [None]:
def add_loosing_matches(win_df):
    """Return every game twice: from the winner's and from the loser's side.

    Every column whose name starts with 'W' or 'L' is renamed to an 'A'/'B'
    prefix. In the first copy W -> A and L -> B; in the second copy the roles
    are swapped (W -> B, L -> A). All other columns are left untouched.

    Parameters
    ----------
    win_df : pandas.DataFrame
        Frame with 'W...' / 'L...' columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The winner-perspective rows followed by the loser-perspective rows.
        Original index labels are kept, so each label appears twice.
    """
    win_names = {}
    lose_names = {}
    # Pair columns by their suffix rather than by position, so the mapping stays
    # correct even if the W and L columns differ in order or in number.
    for name in win_df.columns:
        prefix, suffix = name[:1], name[1:]
        if prefix == 'W':
            win_names[name] = 'A' + suffix
            lose_names[name] = 'B' + suffix
        elif prefix == 'L':
            win_names[name] = 'B' + suffix
            lose_names[name] = 'A' + suffix

    # rename() returns new frames, so the caller's frame is left as it was.
    winner_view = win_df.rename(columns=win_names)
    loser_view = win_df.rename(columns=lose_names)

    # `axis` must be a keyword: pandas 2.0 no longer accepts it positionally.
    return pd.concat([winner_view, loser_view], axis=0, sort=False)

# Rebind df to the doubled table: every game from the winner's and loser's side.
df = add_loosing_matches(df)

In [None]:
# Print the column dtypes and non-null counts of the doubled game table.
df.info()

In [None]:
def add_teams_stat(df, df_teams):
    """Attach per-team statistics for team A and team B and add difference columns.

    Every column of ``df_teams`` other than the keys 'Season' and 'TeamID' is
    joined onto ``df`` twice: prefixed with 'A' for the team in 'ATeamID' and
    prefixed with 'B' for the team in 'BTeamID'. Both joins are left joins, so
    games whose team is missing from ``df_teams`` get NaN statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Games with 'Season', 'ATeamID' and 'BTeamID' columns. Not modified.
    df_teams : pandas.DataFrame
        One row per ('Season', 'TeamID'). Must contain 'Seed', 'WinRatio' and
        'GapAvg', which feed the difference columns.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the 'A...' / 'B...' statistic columns, 'SeedDiff',
        'WinRatioDiff', 'GapAvgDiff' and one '<col>Diff' column for every
        statistic whose name contains 'Rank' (all computed as A minus B).
    """
    key_cols = ['Season', 'TeamID']
    # Identify statistic columns by name instead of by position, so the
    # function does not depend on the key columns coming first in df_teams.
    stat_cols = [col for col in df_teams.columns if col not in key_cols]

    for side in ('A', 'B'):
        prefixed = {col: side + col for col in stat_cols}
        df = pd.merge(
            df,
            df_teams,
            how='left',
            left_on=['Season', side + 'TeamID'],
            right_on=['Season', 'TeamID']
        ).drop(columns='TeamID').rename(columns=prefixed)

    df['SeedDiff'] = df['ASeed'] - df['BSeed']
    df['WinRatioDiff'] = df['AWinRatio'] - df['BWinRatio']
    df['GapAvgDiff'] = df['AGapAvg'] - df['BGapAvg']

    # Any ranking-style statistic also gets an A-minus-B difference column.
    for col in stat_cols:
        if 'Rank' in col:
            df[col + 'Diff'] = df['A' + col] - df['B' + col]

    return df

In [None]:
# Attach the season statistics of both teams to every tournament game row.
df = add_teams_stat(df, df_teams)

In [None]:
# Target: 1 when team A scored more points than team B, otherwise 0.
df['WinA'] = (df['AScore'] > df['BScore']).astype(int)

In [None]:
# Print the column dtypes and non-null counts of the training table.
df.info()

In [None]:
# Load the Stage 2 sample submission; its 'ID' column is parsed into the
# match-ups to predict.
df_test = pd.read_csv(DATA_PATH + "WSampleSubmissionStage2.csv")

In [None]:
def _id_field(game_id, position):
    """Return the integer at `position` of an underscore-separated ID string."""
    return int(game_id.split('_')[position])


# The ID fields are, in order: season, team A id, team B id.
for position, column in enumerate(['Season', 'ATeamID', 'BTeamID']):
    df_test[column] = df_test['ID'].apply(_id_field, args=(position,))

In [None]:
# Preview the test table with the parsed Season / ATeamID / BTeamID columns.
df_test.head()

In [None]:
# Build the same team-statistic columns for the test match-ups as for training.
df_test = add_teams_stat(df_test, df_teams)
df_test.head()

In [None]:
# Columns that identify a game or reveal its outcome must not be used as features.
train_non_features = ['Season', 'ATeamID', 'AScore', 'BTeamID',
                      'BScore', 'WinA']
test_non_features = ['ID', 'Pred', 'Season', 'ATeamID', 'BTeamID']

X = df.drop(columns=train_non_features)
y = df['WinA']
X_test = df_test.drop(columns=test_non_features)

In [None]:
# Use this for linear models: remove every column whose name contains 'Diff'
# from both feature matrices.
diff_cols = [name for name in X.columns.values if 'Diff' in name]
X, X_test = X.drop(columns=diff_cols), X_test.drop(columns=diff_cols)

In [None]:
# Print the columns of the training feature matrix.
X.info()

In [None]:
# Print the columns of the test feature matrix; they should match X so the
# scaler fitted on X can be applied to it.
X_test.info()

## Let's train some models!

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix,\
                            log_loss
from sklearn.model_selection import GridSearchCV, StratifiedKFold,\
                                    train_test_split
from sklearn.preprocessing import StandardScaler

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

import matplotlib.pyplot as plt

In [None]:
# Fit the scaler on the training features only, then apply the same
# transformation to both the training and the test features.
scaler = StandardScaler().fit(X)

train_values = scaler.transform(X)
test_values = scaler.transform(X_test)

X_scaled = pd.DataFrame(train_values, columns=X.columns)
X_test_scaled = pd.DataFrame(test_values, columns=X_test.columns)

In [None]:
# Regularised logistic regression. NOTE(review): the value of C is hard-coded
# and its origin is not shown in this notebook.
logit = LogisticRegression(C=0.03812083).fit(X_scaled, y)

# Second column of predict_proba, i.e. the probability of the WinA == 1 class
# when the labels are 0/1.
class_probabilities = logit.predict_proba(X_test_scaled)
preds = class_probabilities[:, 1]
pd.DataFrame(preds).describe()

## Make submission

In [None]:
# Re-read the sample submission and overwrite its 'Pred' column. This relies on
# `preds` being in the same row order as this file.
submission = pd.read_csv(DATA_PATH + "WSampleSubmissionStage2.csv")
submission['Pred'] = preds
submission.to_csv('logit_submit.csv', index=False)