In [1]:
from mobfot import MobFot
from datetime import date, timedelta
from tqdm import tqdm
from joblib import Parallel, delayed
import pandas as pd

In [2]:
def generar_fechas(anio_inicio, anio_fin):
  """
  Build every calendar day between two years (both inclusive),
  formatted as YYYYMMDD strings.

  Args:
    anio_inicio (int): First year of the range (e.g. 2017).
    anio_fin (int): Last year of the range (e.g. 2024).

  Returns:
    list: One string per day, from January 1st of ``anio_inicio`` to
      December 31st of ``anio_fin`` in chronological order. Empty when
      ``anio_inicio`` is greater than ``anio_fin``.
  """
  primer_dia = date(anio_inicio, 1, 1)
  ultimo_dia = date(anio_fin, 12, 31)
  # Number of days in the closed interval; zero or negative yields an empty range.
  total_dias = (ultimo_dia - primer_dia).days + 1
  return [
    (primer_dia + timedelta(days=desplazamiento)).strftime("%Y%m%d")
    for desplazamiento in range(total_dias)
  ]

# Example usage: build the list of days that the cells below iterate over.
anio_inicio, anio_fin = 2015, 2023
lista_fechas = generar_fechas(anio_inicio=anio_inicio, anio_fin=anio_fin)

In [3]:
# Module-level FotMob client. Note that the helper functions visible in this
# notebook (day_matches, get_info) each construct their own MobFot() instead
# of using this instance.
client = MobFot()

In [4]:
def day_matches(date):
    """Return the ids of every match listed for one day.

    A new MobFot client is created on each call.

    Args:
        date: Day to query, forwarded unchanged to
            ``MobFot.get_matches_by_date`` (this notebook passes the
            YYYYMMDD strings produced by ``generar_fechas``).

    Returns:
        list: The ``"id"`` of each entry under ``"matches"`` of each entry
        under ``"leagues"`` in the response, in response order.
    """
    payload = MobFot().get_matches_by_date(date)
    return [
        match["id"]
        for league in payload["leagues"]
        for match in league["matches"]
    ]

In [5]:
# Query every day in parallel; each element of `results` is the list of
# match ids that day_matches returned for one date.
workers = 12
fetch_day = delayed(day_matches)
results = Parallel(n_jobs=workers)(fetch_day(fecha) for fecha in tqdm(lista_fechas))

100%|██████████| 3287/3287 [02:34<00:00, 21.24it/s]


In [6]:
# Flatten the per-day lists into a single list of match ids.
ids = [match_id for day_ids in results for match_id in day_ids]
len(ids)

471436

In [8]:
def get_info(id):
    """Download one match and flatten it into a single-row DataFrame.

    A new MobFot client is created on each call. The row holds the match id,
    the first 10 characters of ``matchTimeUTCDate``, league name, team names,
    scores and team ids, plus one ``<key>_home`` / ``<key>_away`` column pair
    for every stat found under ``content.stats.Periods.All``.

    Args:
        id: Match id, forwarded to ``MobFot.get_match_details``.

    Returns:
        pandas.DataFrame with one row on success. On any exception (for
        example a missing key in the response) the error is printed and
        ``False`` is returned instead.
    """
    try:
        info = MobFot().get_match_details(id)
        general = info["general"]
        data = {
            "id": [id],
            'date': [general["matchTimeUTCDate"][:10]],
            'leagueName': [general["leagueName"]],
            'homeTeam': [general["homeTeam"]["name"]],
            'awayTeam': [general["awayTeam"]["name"]],
            'homeTeam_score': [info["header"]["teams"][0]["score"]],
            'awayTeam_score': [info["header"]["teams"][1]["score"]],
            'homeidTeam': [general["homeTeam"]["id"]],
            'awayidTeam': [general["awayTeam"]["id"]]
        }
        # Stats come grouped; every entry carries a key and a pair of values
        # where index 0 is stored as the home value and index 1 as the away value.
        for group in info["content"]["stats"]["Periods"]["All"]["stats"]:
            for stat in group["stats"]:
                key = stat["key"]
                data[key + "_home"] = stat["stats"][0]
                data[key + "_away"] = stat["stats"][1]

        return pd.DataFrame(data)
    except Exception as error:
        # Best-effort scraping: report the problem and signal failure with False.
        print(error)
        return False

In [9]:
# First scraping pass: download the details of every id in `ids`, in batches,
# checkpointing the accumulated frame to CSV after each batch.
#
# Continue from the CSV written by earlier runs. On the very first run the
# file does not exist yet (or is empty), so start from an empty frame instead
# of having to edit this cell by hand.
try:
    df = pd.read_csv('datos_fotmob_primer_filtro.csv')
except (FileNotFoundError, pd.errors.EmptyDataError):
    df = pd.DataFrame()

workers = 10
BATCH_SIZE = 1000
# Ceiling division, so the final partially filled batch is scraped as well
# (flooring silently dropped the last len(ids) % BATCH_SIZE ids).
trabajo_por_division = -(-len(ids) // BATCH_SIZE)
for div in tqdm(range(1, trabajo_por_division + 1)):
    lote = ids[BATCH_SIZE * (div - 1):BATCH_SIZE * div]
    results = Parallel(n_jobs=workers)(delayed(get_info)(match_id)
                                       for match_id in lote)
    # get_info returns False when a match could not be parsed; keep only frames.
    frames = [res for res in results if isinstance(res, pd.DataFrame)]
    if frames:
        # A single concat per batch instead of one per match avoids repeatedly
        # copying the ever-growing accumulated frame.
        df = pd.concat([df, *frames], ignore_index=True)
    # Checkpoint after every batch so an interruption loses at most one batch.
    df.to_csv('datos_fotmob_primer_filtro.csv', header=True, index=False)
    print(len(df))

  0%|          | 0/129 [00:00<?, ?it/s]

343


  1%|          | 1/129 [03:58<8:27:56, 238.10s/it]

89234
344


  2%|▏         | 2/129 [08:11<8:43:04, 247.12s/it]

89686
345


  2%|▏         | 3/129 [12:41<9:00:48, 257.52s/it]

90168
346


  3%|▎         | 4/129 [16:43<8:43:28, 251.27s/it]

90584
347


  4%|▍         | 5/129 [20:28<8:19:38, 241.76s/it]

90965
348


  5%|▍         | 6/129 [24:51<8:31:00, 249.27s/it]

91438
349


  5%|▌         | 7/129 [28:24<8:02:13, 237.16s/it]

91778
350


  6%|▌         | 8/129 [32:03<7:47:00, 231.58s/it]

92128
351


  7%|▋         | 9/129 [36:14<7:55:15, 237.63s/it]

92569
352


  8%|▊         | 10/129 [40:12<7:51:31, 237.74s/it]

92969
353


  9%|▊         | 11/129 [44:39<8:05:10, 246.70s/it]

93423
354


  9%|▉         | 12/129 [48:51<8:04:16, 248.35s/it]

93847
355


 10%|█         | 13/129 [53:01<8:01:01, 248.81s/it]

94258
356


  df = pd.concat([df, i], ignore_index=True)
 11%|█         | 14/129 [57:22<8:03:41, 252.36s/it]

94631
357


 12%|█▏        | 15/129 [1:01:52<8:09:59, 257.89s/it]

95037
358


 12%|█▏        | 16/129 [1:06:01<8:00:16, 255.02s/it]

95454
359


 13%|█▎        | 17/129 [1:09:54<7:43:57, 248.55s/it]

95820
360


 14%|█▍        | 18/129 [1:13:22<7:16:56, 236.19s/it]

96130
361


 15%|█▍        | 19/129 [1:17:36<7:22:57, 241.62s/it]

96547
362


 16%|█▌        | 20/129 [1:21:52<7:26:43, 245.90s/it]

96971
363


 16%|█▋        | 21/129 [1:25:36<7:11:07, 239.51s/it]

97329
364


 17%|█▋        | 22/129 [1:29:32<7:05:11, 238.42s/it]

97696
365


 18%|█▊        | 23/129 [1:33:56<7:14:41, 246.05s/it]

98115
366


 19%|█▊        | 24/129 [1:38:06<7:12:40, 247.24s/it]

98513
367


 19%|█▉        | 25/129 [1:42:50<7:27:39, 258.26s/it]

98933
368


 20%|██        | 26/129 [1:46:45<7:11:08, 251.15s/it]

99300
369


 21%|██        | 27/129 [1:50:56<7:07:09, 251.27s/it]

99715
370


 22%|██▏       | 28/129 [1:55:04<7:01:08, 250.19s/it]

100121
371


 22%|██▏       | 29/129 [1:59:11<6:55:30, 249.30s/it]

100495
372


 23%|██▎       | 30/129 [2:03:46<7:03:45, 256.82s/it]

100903
373


 24%|██▍       | 31/129 [2:07:58<6:57:28, 255.60s/it]

101287
374


 25%|██▍       | 32/129 [2:12:08<6:50:17, 253.79s/it]

101688
375


 26%|██▌       | 33/129 [2:16:10<6:40:32, 250.34s/it]

102073
376


 26%|██▋       | 34/129 [2:19:55<6:24:13, 242.67s/it]

102418
377


 27%|██▋       | 35/129 [2:24:34<6:37:24, 253.67s/it]

102865
378


 28%|██▊       | 36/129 [2:29:13<6:44:48, 261.17s/it]

103298
379


 29%|██▊       | 37/129 [2:34:05<6:54:47, 270.51s/it]

103757
380


 29%|██▉       | 38/129 [2:39:16<7:08:47, 282.72s/it]

104261
381


 30%|███       | 39/129 [2:44:23<7:14:57, 289.97s/it]

104744
382


 31%|███       | 40/129 [2:49:56<7:29:05, 302.76s/it]

105256
383


 32%|███▏      | 41/129 [2:54:26<7:09:41, 292.97s/it]

105680
384


 33%|███▎      | 42/129 [2:58:56<6:54:48, 286.08s/it]

106097
385


 33%|███▎      | 43/129 [3:03:47<6:52:20, 287.69s/it]

106556
386


 34%|███▍      | 44/129 [3:08:02<6:33:37, 277.85s/it]

106937
387


 35%|███▍      | 45/129 [3:12:35<6:26:37, 276.16s/it]

107353
388


 36%|███▌      | 46/129 [3:17:24<6:27:27, 280.09s/it]

107787
389


 36%|███▋      | 47/129 [3:21:53<6:18:06, 276.66s/it]

108195
390


 37%|███▋      | 48/129 [3:26:39<6:17:16, 279.46s/it]

108626
391


 38%|███▊      | 49/129 [3:31:11<6:09:47, 277.34s/it]

109030
392


 39%|███▉      | 50/129 [3:34:46<5:40:40, 258.74s/it]

109318
393


 40%|███▉      | 51/129 [3:39:02<5:35:13, 257.87s/it]

109685
394


 40%|████      | 52/129 [3:43:45<5:40:40, 265.46s/it]

110114
395


 41%|████      | 53/129 [3:48:43<5:48:35, 275.20s/it]

110565
396


 42%|████▏     | 54/129 [3:53:21<5:44:59, 275.99s/it]

110967
397


 43%|████▎     | 55/129 [3:57:47<5:36:50, 273.11s/it]

111357
398


 43%|████▎     | 56/129 [4:02:05<5:26:44, 268.56s/it]

111721
399


 44%|████▍     | 57/129 [4:07:00<5:31:50, 276.53s/it]

112155
400


 45%|████▍     | 58/129 [4:11:51<5:32:05, 280.64s/it]

112578
401


 46%|████▌     | 59/129 [4:16:57<5:36:22, 288.33s/it]

113030
402


 47%|████▋     | 60/129 [4:21:42<5:30:32, 287.43s/it]

113435
403


 47%|████▋     | 61/129 [4:26:06<5:17:49, 280.43s/it]

113793
404


 48%|████▊     | 62/129 [4:29:30<4:47:14, 257.23s/it]

114035
405


 49%|████▉     | 63/129 [4:33:01<4:28:00, 243.65s/it]

114278
406


 50%|████▉     | 64/129 [4:36:59<4:21:55, 241.77s/it]

114585
407


 50%|█████     | 65/129 [4:41:18<4:23:20, 246.89s/it]

114936
408


 51%|█████     | 66/129 [4:46:55<4:47:51, 274.15s/it]

115436
409


 52%|█████▏    | 67/129 [4:51:45<4:47:56, 278.65s/it]

115824
410


 53%|█████▎    | 68/129 [4:56:33<4:46:13, 281.54s/it]

116217
411


 53%|█████▎    | 69/129 [5:01:25<4:44:47, 284.78s/it]

116621
412


 54%|█████▍    | 70/129 [5:06:06<4:38:52, 283.61s/it]

117016
413


 55%|█████▌    | 71/129 [5:10:34<4:29:42, 279.01s/it]

117380
414


 56%|█████▌    | 72/129 [5:16:34<4:48:01, 303.18s/it]

117780
415


 57%|█████▋    | 73/129 [5:22:23<4:55:53, 317.03s/it]

118158
416


 57%|█████▋    | 74/129 [5:28:23<5:02:24, 329.89s/it]

118588
417


 58%|█████▊    | 75/129 [5:33:39<4:52:59, 325.55s/it]

118973
418


 59%|█████▉    | 76/129 [5:38:52<4:44:19, 321.87s/it]

119374
419


 60%|█████▉    | 77/129 [5:44:00<4:35:24, 317.78s/it]

119780
420


 60%|██████    | 78/129 [5:49:09<4:27:50, 315.10s/it]

120191
421


 61%|██████    | 79/129 [5:54:27<4:23:20, 316.02s/it]

120619
422


 62%|██████▏   | 80/129 [5:58:26<3:59:05, 292.76s/it]

120906
423


 63%|██████▎   | 81/129 [6:03:26<3:56:02, 295.05s/it]

121315
424


 64%|██████▎   | 82/129 [6:08:25<3:51:56, 296.09s/it]

121708
425


 64%|██████▍   | 83/129 [6:13:48<3:53:19, 304.34s/it]

122134
426


 65%|██████▌   | 84/129 [6:19:00<3:49:58, 306.64s/it]

122562
427


 66%|██████▌   | 85/129 [6:24:22<3:48:15, 311.27s/it]

123004
428


 67%|██████▋   | 86/129 [6:29:13<3:38:45, 305.25s/it]

123390
429


 67%|██████▋   | 87/129 [6:35:05<3:43:28, 319.26s/it]

123875
430


 68%|██████▊   | 88/129 [6:41:00<3:45:21, 329.80s/it]

124358
431


 69%|██████▉   | 89/129 [6:46:12<3:36:16, 324.41s/it]

124788
432


 70%|██████▉   | 90/129 [6:51:38<3:31:12, 324.94s/it]

125239
433


 71%|███████   | 91/129 [6:56:53<3:23:59, 322.10s/it]

125675
434


 71%|███████▏  | 92/129 [7:02:19<3:19:14, 323.09s/it]

126124
435


 72%|███████▏  | 93/129 [7:07:40<3:13:32, 322.57s/it]

126565
436


 73%|███████▎  | 94/129 [7:13:09<3:09:19, 324.55s/it]

127013
437


 74%|███████▎  | 95/129 [7:18:04<2:58:53, 315.68s/it]

127395
438


 74%|███████▍  | 96/129 [7:23:29<2:55:04, 318.31s/it]

127827
439


 75%|███████▌  | 97/129 [7:29:31<2:56:44, 331.38s/it]

128309
440


 76%|███████▌  | 98/129 [7:35:30<2:55:30, 339.70s/it]

128777
441


 77%|███████▋  | 99/129 [7:40:57<2:48:03, 336.12s/it]

129176
442


 78%|███████▊  | 100/129 [7:46:48<2:44:29, 340.33s/it]

129603
443


 78%|███████▊  | 101/129 [7:53:14<2:45:19, 354.28s/it]

130059
444


  df = pd.concat([df, i], ignore_index=True)
 79%|███████▉  | 102/129 [7:59:53<2:45:27, 367.69s/it]

130557
445


 80%|███████▉  | 103/129 [8:06:12<2:40:48, 371.10s/it]

131021
446


 81%|████████  | 104/129 [8:12:09<2:32:46, 366.67s/it]

131450
447


 81%|████████▏ | 105/129 [8:18:22<2:27:29, 368.72s/it]

131905
448


 82%|████████▏ | 106/129 [8:24:11<2:19:02, 362.72s/it]

132320
449


 83%|████████▎ | 107/129 [8:30:46<2:16:30, 372.30s/it]

132800
450


 84%|████████▎ | 108/129 [8:35:23<2:00:23, 343.95s/it]

133084
451


 84%|████████▍ | 109/129 [8:41:00<1:53:53, 341.66s/it]

133467
452


 85%|████████▌ | 110/129 [8:46:54<1:49:25, 345.58s/it]

133886
453


 86%|████████▌ | 111/129 [8:52:57<1:45:09, 350.54s/it]

134307
454


 87%|████████▋ | 112/129 [8:58:38<1:38:31, 347.75s/it]

134702
455


 88%|████████▊ | 113/129 [9:05:00<1:35:28, 358.02s/it]

135159
456


 88%|████████▊ | 114/129 [9:11:19<1:31:05, 364.36s/it]

135605
457


 89%|████████▉ | 115/129 [9:17:17<1:24:34, 362.46s/it]

136022
458


 90%|████████▉ | 116/129 [9:22:17<1:14:26, 343.61s/it]

136311
459


 91%|█████████ | 117/129 [9:27:44<1:07:46, 338.86s/it]

136672
460


 91%|█████████▏| 118/129 [9:33:50<1:03:36, 346.99s/it]

137090
461


 92%|█████████▏| 119/129 [9:40:01<59:01, 354.16s/it]  

137506
462


 93%|█████████▎| 120/129 [9:45:46<52:42, 351.40s/it]

137884
463


 94%|█████████▍| 121/129 [9:51:42<47:02, 352.81s/it]

138278
464


 95%|█████████▍| 122/129 [9:57:49<41:39, 357.07s/it]

138681
465


  df = pd.concat([df, i], ignore_index=True)
 95%|█████████▌| 123/129 [10:02:25<33:15, 332.65s/it]

138957
466


  df = pd.concat([df, i], ignore_index=True)
 96%|█████████▌| 124/129 [10:07:07<26:27, 317.43s/it]

139234
467


 97%|█████████▋| 125/129 [10:13:40<22:40, 340.13s/it]

139671
468


 98%|█████████▊| 126/129 [10:20:21<17:55, 358.45s/it]

140120
469


 98%|█████████▊| 127/129 [10:26:28<12:02, 361.11s/it]

140519
470


 99%|█████████▉| 128/129 [10:33:27<06:18, 378.39s/it]

140988
471


100%|██████████| 129/129 [10:40:32<00:00, 297.93s/it]

141468





In [13]:
# Remove fully duplicated rows and order chronologically, then save the
# result of the first scraping pass.
df_sin_duplicados = df.drop_duplicates().sort_values(by="date", ignore_index=True)
df_sin_duplicados.to_csv('datos_fotmob_primer_filtro.csv', header=True, index=False)

In [10]:
# Second scraping pass: retry every id that is not present in the CSV saved
# by the first pass.
df_primer_filtro = pd.read_csv('datos_fotmob_primer_filtro.csv')
segundo_filtro = list(set(ids) - set(df_primer_filtro["id"]))

# Continue from the CSV written by earlier runs of this cell. On the very
# first run the file does not exist yet (or is empty), so start from an empty
# frame instead of having to edit this cell by hand.
try:
    df_segudo_filtro = pd.read_csv('datos_fotmob_segundo_filtro.csv')
except (FileNotFoundError, pd.errors.EmptyDataError):
    df_segudo_filtro = pd.DataFrame()

workers = 6
BATCH_SIZE = 1000
# 1-based index of the first batch to process. NOTE(review): 16 looks like a
# manual resume point from an interrupted run; set it to 1 for a fresh run.
PRIMER_LOTE = 16
# Ceiling division, so the final partially filled batch is scraped as well.
trabajo_por_division = -(-len(segundo_filtro) // BATCH_SIZE)
for div in tqdm(range(PRIMER_LOTE, trabajo_por_division + 1)):
    lote = segundo_filtro[BATCH_SIZE * (div - 1):BATCH_SIZE * div]
    results = Parallel(n_jobs=workers)(delayed(get_info)(match_id)
                                       for match_id in lote)
    # get_info returns False when a match could not be parsed; keep only frames.
    frames = [res for res in results if isinstance(res, pd.DataFrame)]
    if frames:
        # Accumulate onto the second-pass frame itself. The previous code
        # concatenated onto `df` (the first-pass frame), so each result
        # overwrote the accumulator and all but the last frame were lost.
        df_segudo_filtro = pd.concat([df_segudo_filtro, *frames], ignore_index=True)
    # Checkpoint after every batch so an interruption loses at most one batch.
    df_segudo_filtro.to_csv('datos_fotmob_segundo_filtro.csv', header=True, index=False)
    print(len(df_segudo_filtro))

  0%|          | 1/303 [01:42<8:37:38, 102.84s/it]

125


  1%|          | 2/303 [26:26<76:31:32, 915.26s/it]

125


  1%|          | 2/303 [29:08<73:05:45, 874.24s/it]


KeyboardInterrupt: 