In [3]:
import pandas as pd
import numpy as np
import pickle
import os
from lxml import etree
from logos_tools import *
from logos_opta import *

In [4]:
# Show at most 20 rows when a DataFrame/Series is rendered.
pd.set_option('display.max_rows', 20)

In [5]:
# File locations used by the notebook (relative to the working directory).
# Event-type lookup table, read below as a '|'-separated file.
EVENT_FILE = '../resources/dim/event.txt'
# Qualifier lookup table, read below as a '|'-separated file.
QUALIFIER_FILE = '../resources/dim/qualifier.txt'
# Tab-separated table listing comma-separated qualifier ids per Type_id.
ASSOCIATE_FILE = '../resources/dim/associate_use.tsv'
# Match XML that is parsed with lxml further down.
GAME_FILE = '../resources/games_friendly/3_Metz_vs_Angers.xml'
# Directory the resulting sample .tsv file is written to.
SAVE_DIR = '../resources/samples/team'

In [6]:
# Load the dimension (lookup) tables.

# Event types: string id -> type name.
event_df = pd.read_csv(EVENT_FILE, sep='|')
event_s = pd.Series(data=event_df['type'].values, index=list(map(str, event_df['id'])))
event_dict = event_s.to_dict()

# Qualifiers: string id -> type name.
qualifier_df = pd.read_csv(QUALIFIER_FILE, sep='|')
qualifier_s = pd.Series(data=qualifier_df['type'].values, index=list(map(str, qualifier_df['id'])))
qualifier_dict = qualifier_s.to_dict()

# Associations: string Type_id -> list of qualifier ids (split on ',').
associate_df = pd.read_csv(ASSOCIATE_FILE, sep='\t')
associate_dict = {
    str(type_id): qualifier_ids.split(',')
    for type_id, qualifier_ids in zip(associate_df['Type_id'], associate_df['qualifier_id'])
}

In [8]:
# Read the match: parse GAME_FILE into an lxml element tree.
xml = etree.parse(GAME_FILE)

In [9]:
# Get home/away team information from the attributes of the first
# 'Game' element returned by the XPath query.
game = xml.xpath('Game')[0]
away_team_id = game.get('away_team_id')
away_team_name = game.get('away_team_name')
home_team_id = game.get('home_team_id')
home_team_name = game.get('home_team_name')

In [10]:
# Process the events: turn every child of the Game element into pandas
# rows via parse_event (defined outside this notebook) and stack them.
game_df = pd.concat([parse_event(x, event_dict, qualifier_dict, associate_dict) for x in game], axis=0)
game_df = game_df.fillna(value=UNK)
# Deleted events carry no usable detail: blank out the descriptive fields.
game_df.loc[game_df.event_type=='Deleted event',
            ['length', 'direction', 'position', 'qualifier']] = UNK
# Clearances: drop direction and position information.
game_df.loc[game_df.event_type=='Clearance',
            ['direction', 'position']] = UNK
# Remove set-up / start / end bookkeeping events.
# .copy() makes the filtered frame independent, so the column assignments
# below do not write into a flagged slice (SettingWithCopyWarning).
game_df = game_df.loc[~game_df.event_type.isin(['Team set up', 'Start', 'End', 'Collection End'])].copy()
# Mark home/away: keep the raw id in team_id_real, then relabel team_id
# as '1' (default, home) or '0' (rows whose raw id equals the away team's).
game_df['team_id_real'] = game_df['team_id']
game_df['team_id'] = '1'
game_df.loc[game_df.team_id_real==away_team_id, 'team_id'] = '0'
# Match clock in seconds, convenient for window arithmetic.
game_df['time'] = game_df['min'].astype('int')*60 + game_df['sec'].astype('int')
# Renumber rows 0..n-1; later cells look up "the next event" as label + 1.
game_df = game_df.reset_index(drop=True)

In [143]:
# 整理df
# columns = ['period', 'min', 'sec', 'player_id', 'team_id', 'event_type', 'outcome', 'length', 'direction', 
#            'x', 'y', 'position', 'qualifier']
# game_df = game_df.reindex(columns=columns)

In [None]:
# Build the samples for period '1' with use_control=False.
result1 = make_half_sample('1', game_df, False)

1
4
6
7
9
10
13
14
15
16
19
21
24
27
30
32
33
34
41
44
45
48
50
53
54
55
56
make_one_sample error
58
59
72
75
77
78
80
81
84
87
90
91
92
95
101
103
105
108
115
116
160
166
167
169
175
177
198
200
202
207
209
210
212
214
216
218
221
223
227
231
242
243
245
248
249
252
272
275
276
280
281
283
314
316
317
321
323
327
328
330
336
340
346
349
353
356
359
365
368
370
372
373
374
401


In [620]:
# Window length in seconds (15 minutes); read as a global by make_half_sample.
SECONDS_15MINUTES = 15*60
# Process the first and the second half separately.
result1 = make_half_sample('1', game_df, False)
result2 = make_half_sample('2', game_df, False)

In [563]:
# Save the data: stack both halves and renumber the rows.
result = pd.concat([result1, result2], axis=0, sort=False)
result = result.reset_index(drop=True)
# Attach the real team id and name to each side flag ('0' away, '1' home).
team_info_df = pd.DataFrame({'team_id':['0', '1'], 'team_id_real':[away_team_id, home_team_id], 'team_name':[away_team_name, home_team_name]})
result = pd.merge(result, team_info_df, on='team_id')
# Output name = game file name with its extension replaced by '.tsv'.
# splitext only strips the final extension, so names containing extra dots
# are no longer truncated at the first dot.
save_name = os.path.splitext(os.path.basename(GAME_FILE))[0] + '.tsv'
# to_csv does not create missing directories; make sure the target exists.
os.makedirs(SAVE_DIR, exist_ok=True)
result.to_csv(os.path.join(SAVE_DIR, save_name), sep='\t', index=False)

In [132]:
# Scratch copy of the set-up done inside make_half_sample, for inspection.
# `use_half` is not assigned in any cell of this notebook, so a fresh kernel
# would raise NameError here; '2' matches the second-half times (>= 2700 s)
# shown in the outputs below.
use_half = '2'
half_df = game_df.loc[game_df.period==use_half]
time_unique_s = half_df.groupby('time').apply(lambda df: list(df.index))
time_unique = pd.Series(time_unique_s.index)
# Restrict the start times that may be looped over: a full 15-minute window
# must still fit before the second-to-last event of the half.
end_max = half_df.time.iloc[-2]
start_max = end_max - SECONDS_15MINUTES
result = []
time_unique_loop = time_unique.loc[time_unique<=start_max]
ct = len(time_unique_loop)

In [133]:
time_unique_loop

0      2700
1      2704
2      2714
3      2723
4      2725
5      2727
6      2729
7      2735
8      2738
9      2740
       ... 
471    4649
472    4650
473    4651
474    4652
475    4655
476    4657
477    4659
478    4663
479    4668
480    4672
Name: time, Length: 481, dtype: int64

In [134]:
# Reproduce a single loop iteration of make_half_sample by hand.
i = 10
# Window start: the i-th admissible start time.
start = time_unique_loop.iloc[i]
# Window end: the latest event time not later than start + 15 minutes.
end = time_unique[time_unique <= (start+SECONDS_15MINUTES)].iloc[-1]
# All events of the half whose time lies in the closed interval [start, end].
use_df = half_df.loc[(half_df.time>=start) & (half_df.time<=end)]

In [135]:
use_df

Unnamed: 0,direction,event_type,length,min,outcome,period,player_id,position,qualifier,sec,team_id,x,y,team_id_real,time
787,,Deleted event,,45,1,2,18574,,,41,0,0.0,0.0,2128,2741
788,right,Pass,middle,45,1,2,168985,Center,,42,0,51.4,73.8,2128,2742
789,right,Pass,middle,45,1,2,168109,Right,,44,0,53.8,45.8,2128,2744
790,back,Pass,middle,45,1,2,58309,Back,,47,0,54.6,5.6,2128,2747
791,forward,Pass,middle,45,1,2,42416,Right,,49,0,43.5,14.5,2128,2749
792,back,Pass,middle,45,1,2,66589,Center,Lay-off,52,0,72.1,15.4,2128,2752
793,forward,Pass,long,45,1,2,168109,Right,,53,0,62.5,28.0,2128,2753
794,left,Pass,long,45,1,2,58309,Left,Chipped|Cross,55,0,96.7,14.1,2128,2755
795,right,Pass,middle,46,0,2,79870,Center,Cross,2,0,88.1,92.2,2128,2762
796,,Clearance,middle,46,1,2,133633,,,5,1,7.6,47.0,145,2765


In [None]:
# Row whose index label is one greater than the last label in the window.
nextone = half_df.loc[use_df.index[-1]+1]
# The last ten events are incomplete, so they are left out of the reference.
use_df = use_df.iloc[:-10]
# Move the window end to the last remaining event and recompute the
# distinct event times inside [start, end].
end = use_df.time.iloc[-1]
use_time_unique = time_unique.loc[(time_unique>=start) & (time_unique<=end)]

In [115]:
use_df.iloc[-1]

direction         null
event_type         Out
length            null
min                 60
outcome              0
period               2
player_id        86281
position        Center
qualifier         null
sec                 19
team_id              1
x                102.0
y                 39.1
team_id_real      1028
time              3619
Name: 1132, dtype: object

In [121]:
nextone.time

3643

In [123]:
len(use_df)

323

3590

In [604]:
def make_half_sample(use_half, game_df, use_control=False):
    """Build all sliding-window samples for one half of a match.

    Every distinct event time of the half that still leaves a full
    ``SECONDS_15MINUTES`` window before the half's second-to-last event is
    used as a window start. For each start, the events inside the window,
    the distinct event times inside it, and the event directly following the
    window are passed to ``make_one_sample`` (defined outside this cell).
    Windows whose following event is missing or is a 'Deleted event' are
    skipped.

    Parameters
    ----------
    use_half
        Value compared against ``game_df.period`` to select the half
        (the notebook passes '1' or '2').
    game_df : pandas.DataFrame
        Needs 'period', 'time' and 'event_type' columns. The event after a
        window is looked up by label as ``last_label + 1``, so the index is
        expected to hold consecutive integers.
    use_control
        Forwarded unchanged to ``make_one_sample``.

    Returns
    -------
    pandas.DataFrame
        Concatenation of everything ``make_one_sample`` returned; an empty
        DataFrame when no sample could be built.
    """
    half_df = game_df.loc[game_df.period == use_half]
    # The window bound below reads the second-to-last event of the half.
    if len(half_df) < 2:
        return pd.DataFrame()

    # Sorted distinct event times of the half, indexed 0..n-1.
    time_unique = half_df['time'].drop_duplicates().sort_values().reset_index(drop=True)

    # Only start times that leave a complete window inside the half are used.
    end_max = half_df.time.iloc[-2]
    start_max = end_max - SECONDS_15MINUTES
    time_unique_loop = time_unique.loc[time_unique <= start_max]
    ct = len(time_unique_loop)

    result = []
    for i in range(ct):
        # Progress as a fraction of processed start times.
        print(round(i*1.0/ct, 4))
        start = time_unique_loop.iloc[i]
        # Latest event time that is at most one window length after the start.
        end = time_unique[time_unique <= (start + SECONDS_15MINUTES)].iloc[-1]
        use_df = half_df.loc[(half_df.time >= start) & (half_df.time <= end)]
        use_time_unique = time_unique.loc[(time_unique >= start) & (time_unique <= end)]

        # The event following the window; without one there is nothing to
        # pair the window with, so skip instead of raising KeyError.
        next_label = use_df.index[-1] + 1
        if next_label not in half_df.index:
            continue
        nextone = half_df.loc[next_label]
        # Skip windows whose following event is a deleted one.
        if nextone.event_type == 'Deleted event':
            continue

        try:
            result.append(make_one_sample(use_half, start, end, use_time_unique, use_df, nextone, use_control))
        except Exception as err:
            # Best effort: keep going, but say which window failed and why.
            # (Exception, not a bare except, so Ctrl-C still interrupts.)
            print('make_one_sample error', start, repr(err))

    if not result:
        # pd.concat raises on an empty list.
        return pd.DataFrame()
    return pd.concat(result, axis=0, sort=False)

[34m__pycache__[m[m                        make_sample_team.py
data_summary.ipynb                 print_associate_event_qualifier.py
logos_opta.py                      test.ipynb
logos_tools.py                     transform_xml.py


In [582]:
# 解析球员球队
xml = etree.parse('../resources/Players and IDs - F40 - L1 20162017.xml')
team_id = []
team_name = []
player_id = []
player_name = []
position = []
real_position = []
real_position_side = []
jersey_num = []
for element in xml.iter(tag=etree.Element):
    if (element.tag=='Team'):
        team_id.append(element.get('uID')[1:])
        team_name.append(element.xpath('Name')[0].text)
    if (element.tag=='Player'):
        player_id.append(element.get('uID')[1:])
        player_name.append(element.xpath('Name')[0].text)
        position.append(element.xpath('Position')[0].text)
        real_position.append(element.xpath("Stat[@Type='real_position']")[0].text)
        real_position_side.append(element.xpath("Stat[@Type='real_position_side']")[0].text)
        jersey_num.append(element.xpath("Stat[@Type='jersey_num']")[0].text)
team = dict()
player = dict()
rm_Unknown = lambda x : '' if x == 'Unknown' else x
for i in range(len(team_id)):
    team[team_id[i]] = team_name[i]
for i in range(len(player_id)):
    player[player_id[i]] = '%s, %s, %s, %s, %s' % (rm_Unknown(player_name[i]), rm_Unknown(jersey_num[i]), rm_Unknown(position[i]), rm_Unknown(real_position[i]), rm_Unknown(real_position_side[i]))

In [452]:
pd.options.display.max_rows=30
use_df

Unnamed: 0,direction,event_type,length,min,outcome,period,player_id,position,qualifier,sec,team_id,x,y,team_id_real,time
2,forward,Pass,short,0,1,1,46497,Right,,6,1,44.0,9.8,149,6
3,,Deleted event,,0,1,1,61170,,,8,1,0.0,0.0,149,8
4,,Deleted event,,0,1,1,204727,,,8,0,0.0,0.0,1395,8
5,,Ball touch,,0,0,1,61170,Right,,10,1,59.2,3.1,149,10
6,,Out,,0,1,1,204727,Back,,11,0,38.0,101.2,1395,11
7,,Out,,0,0,1,61170,Right,,11,1,62.0,-1.2,149,11
8,back,Pass,middle,0,1,1,212752,Back,Throw-in,16,0,35.7,100.0,1395,16
9,right,Pass,middle,0,1,1,204727,Back,,19,0,22.3,85.3,1395,19
10,forward,Pass,long,0,0,1,115851,Back,Launch,23,0,3.9,49.4,1395,23
11,forward,Pass,middle,0,0,1,80226,Right,,26,1,63.3,3.4,149,26


In [79]:
def handle_diff(df, field):
    """Return a copy of ``df`` with the row-to-row difference of ``field``.

    Two columns are added (or overwritten) on the copy:

    * ``t_1``: the value of ``field`` in the previous row, 0 for the first row;
    * ``<field>_diff``: ``field`` minus ``t_1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified. An empty frame is accepted.
    field : str
        Name of the numeric column to difference.

    Returns
    -------
    pandas.DataFrame
        The augmented copy.
    """
    df = df.copy()
    # shift() keeps the length equal to the frame's, so an empty frame works
    # (building the column from a Python list raised a length mismatch).
    df['t_1'] = df[field].shift(1, fill_value=0)
    df[field + '_diff'] = df[field] - df['t_1']
    return df

In [80]:
# Home-team events only. The preprocessing cell relabels `team_id` to the
# side flag ('0'/'1') and keeps the raw id in `team_id_real`, so comparing
# `team_id` with a raw id such as '149' selects no rows. Filter on the raw
# id of the home team read from the Game element instead.
home_df = game_df.loc[game_df.team_id_real==home_team_id]
# Seconds elapsed between consecutive home-team events.
home_df = handle_diff(home_df, 'time')

In [81]:
# Copy the row labels into an 'id' column so handle_diff can compute the
# gap between the labels of consecutive rows of home_df ('id_diff').
home_df['id'] = home_df.index
home_df = handle_diff(home_df, 'id')

In [84]:
home_df.id_diff.quantile([0.9, 0.95, 0.99])

0.90     3.00
0.95     5.00
0.99    11.28
Name: id_diff, dtype: float64

In [24]:
# game_df.to_csv('../resources/test.tsv', sep='\t', index=False)

In [28]:
# 遍历树添加属性
for element in xml.iter(tag=etree.Element):
    if (element.tag=='Event'):
        element.set('type', event[element.get('type_id')])
    if (element.tag=='Q'):
        element.set('type', qualifier[element.get('qualifier_id')])