In [1]:
from cric_info_scrapping import CricInfoScrape
import pandas as pd
import json
from tqdm import tqdm
import numpy as np

In [3]:
# Load year_matchnum_dict from disk. Its values are iterated further down as
# collections of match numbers (presumably keyed by IPL season -- confirm
# against the JSON file).
# The encoding is pinned because open() otherwise falls back to the platform
# locale, while JSON files are expected to be UTF-8.
with open('../data/ipl_year_match_num.json', encoding='utf-8') as json_file:
    year_matchnum_dict = json.load(json_file)

In [4]:
# Initialise the scraper; this single shared instance is the `s` that
# get_features_for_match calls for match summaries and player stats.
s = CricInfoScrape()

In [5]:
def get_features_for_match(match_num):
    """
    Build one long feature object plus the result label for a single match.

    The match summary is fetched through the module-level scraper ``s``.
    The label comes from comparing the two innings' ``total_runs``; the
    features are the playing stats of every entry under each innings'
    ``batsmen`` list, with every stat label suffixed by
    ``_<innings>-pos<index>`` so that positions stay distinguishable.

    Parameters
    ----------
    match_num
        Match identifier, forwarded unchanged to
        ``s.get_match_summary_info``.

    Returns
    -------
    tuple
        ``(features, winner)``: ``winner`` is 0 when innings 1 has the
        larger ``total_runs`` and 1 when innings 2 does; ``features`` is
        the ``pd.concat`` of all per-player stats objects.
        ``(None, None)`` when neither total is larger than the other
        (e.g. equal totals), since no result can be derived.
    """
    summary = s.get_match_summary_info(match_num)

    # run totals of both innings decide the label
    first_total = summary['innings1']['total_runs']
    second_total = summary['innings2']['total_runs']

    if first_total > second_total:
        winner = 0
    elif first_total < second_total:
        winner = 1
    else:
        # no noticeable result, so there is nothing to compute
        return None, None

    # Each batsmen entry is a dict; every value in it is handed to
    # s.get_playing_stats (presumably a player id or link -- confirm
    # against CricInfoScrape).
    stat_blocks = [
        s.get_playing_stats(player_ref).add_suffix(
            f'_{innings_key}-pos{position}'
        )
        for innings_key in ('innings1', 'innings2')
        for position, batsman in enumerate(summary[innings_key]['batsmen'])
        for player_ref in batsman.values()
    ]

    return pd.concat(stat_blocks), winner


In [7]:
# Flatten the values of year_matchnum_dict into one list of match numbers.
all_ipl_matches = [
    num
    for match_list in year_matchnum_dict.values()
    for num in match_list
]

X = []
y = []
# Matches whose scraping raised, stored as (match_num, repr(error)) so they
# can be inspected or retried later; the printed messages alone do not say
# which match failed.
skipped_matches = []
for match_num in tqdm(all_ipl_matches):
    # Only the scraping call is guarded, so an unrelated bug in the
    # bookkeeping below cannot be mistaken for a scraping failure.
    try:
        X_i, y_i = get_features_for_match(match_num)
    except KeyError as err:
        print('key error')
        skipped_matches.append((match_num, repr(err)))
        continue
    except IndexError as err:
        print('index error')
        skipped_matches.append((match_num, repr(err)))
        continue

    # get_features_for_match returns (None, None) when no winner could be
    # derived; such matches contribute neither a feature row nor a label.
    if y_i is not None:
        X.append(X_i)
        y.append(y_i)

    

  3%|▎         | 22/820 [03:19<1:17:23,  5.82s/it]

index error


  3%|▎         | 23/820 [03:20<57:46,  4.35s/it]  

key error


  3%|▎         | 24/820 [03:21<43:27,  3.28s/it]

key error


  3%|▎         | 25/820 [03:22<33:20,  2.52s/it]

key error


  3%|▎         | 26/820 [03:22<26:18,  1.99s/it]

key error


  3%|▎         | 27/820 [03:23<21:27,  1.62s/it]

key error


  3%|▎         | 28/820 [03:24<18:02,  1.37s/it]

key error


  4%|▎         | 29/820 [03:25<15:26,  1.17s/it]

key error


  4%|▎         | 30/820 [03:25<13:47,  1.05s/it]

key error


  4%|▍         | 31/820 [03:26<12:30,  1.05it/s]

key error


  4%|▍         | 32/820 [03:27<11:46,  1.12it/s]

key error


  4%|▍         | 33/820 [03:28<11:23,  1.15it/s]

key error


  4%|▍         | 34/820 [03:29<11:34,  1.13it/s]

key error


  4%|▍         | 35/820 [03:30<12:24,  1.05it/s]

key error


  4%|▍         | 36/820 [03:31<11:54,  1.10it/s]

key error


  5%|▍         | 37/820 [03:32<12:15,  1.07it/s]

key error


  5%|▍         | 38/820 [03:33<13:45,  1.06s/it]

key error


  5%|▍         | 39/820 [03:34<12:43,  1.02it/s]

key error


  5%|▍         | 40/820 [03:35<12:17,  1.06it/s]

key error


  5%|▌         | 41/820 [03:35<11:56,  1.09it/s]

key error


  5%|▌         | 42/820 [03:37<13:25,  1.04s/it]

key error


  5%|▌         | 43/820 [03:38<12:51,  1.01it/s]

key error


  5%|▌         | 44/820 [03:39<15:21,  1.19s/it]

key error


  5%|▌         | 45/820 [03:40<15:26,  1.20s/it]

key error


  6%|▌         | 46/820 [03:41<14:36,  1.13s/it]

key error


  6%|▌         | 47/820 [03:43<16:33,  1.29s/it]

key error


  6%|▌         | 48/820 [03:44<16:46,  1.30s/it]

key error


  6%|▌         | 49/820 [03:45<15:36,  1.21s/it]

key error


  6%|▌         | 50/820 [03:47<16:00,  1.25s/it]

key error


  6%|▌         | 51/820 [03:48<15:53,  1.24s/it]

key error


  6%|▋         | 52/820 [03:49<14:05,  1.10s/it]

key error


  6%|▋         | 53/820 [03:50<13:04,  1.02s/it]

key error


  7%|▋         | 54/820 [03:50<12:06,  1.05it/s]

key error


  7%|▋         | 55/820 [03:51<12:12,  1.04it/s]

key error


  7%|▋         | 56/820 [03:52<11:31,  1.10it/s]

key error


 25%|██▌       | 205/820 [30:00<1:20:26,  7.85s/it]

index error


 39%|███▉      | 321/820 [49:45<1:01:39,  7.41s/it]

index error


 40%|███▉      | 325/820 [50:19<1:01:00,  7.40s/it]

index error


 64%|██████▍   | 524/820 [1:25:01<32:29,  6.58s/it]  

index error


 64%|██████▍   | 526/820 [1:25:10<26:34,  5.42s/it]

index error


 72%|███████▏  | 588/820 [1:36:16<32:10,  8.32s/it]

index error


 78%|███████▊  | 636/820 [1:45:15<26:46,  8.73s/it]

index error


 86%|████████▋ | 709/820 [1:57:23<14:26,  7.81s/it]

index error


 87%|████████▋ | 715/820 [1:58:02<09:25,  5.39s/it]

index error


 99%|█████████▊| 808/820 [2:13:57<01:29,  7.48s/it]

index error


100%|██████████| 820/820 [2:15:54<00:00,  9.94s/it]


In [18]:
# Place the per-match feature objects side by side as columns, then flip the
# result so that each match becomes one row before saving it.
feat = pd.concat(X, axis=1).T
feat.to_pickle("../data/match_features.pkl")

In [22]:
# Save the winner labels collected in y (0 = innings 1 scored more,
# 1 = innings 2 scored more). y was appended to in step with X, so label i
# belongs to feature row i.
pd.Series(y).to_pickle("../data/match_results.pkl")