In [144]:
import re
import pandas as pd
from datetime import datetime

In [145]:
file_path = "full_datasets/pluribus_30.txt"


def parse_poker_hands(file_path):
    with open(file_path, 'r') as file:
        data = file.read()

    hands = data.split('PokerStars Hand #')
    parsed_hands = []

    for hand in hands[1:]:  # Saltar el primer split vacío
        try:
            hand_data = {}

            # Hand ID
            hand_id_match = re.search(r'^(\d+):', hand)
            hand_data['hand_id'] = int(hand_id_match.group(1)) if hand_id_match else None

            # Timestamp
            timestamp_match = re.search(r'- (\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}) ET', hand)
            hand_data['timestamp'] = datetime.strptime(timestamp_match.group(1), '%Y/%m/%d %H:%M:%S') if timestamp_match else None

            # Blinds como lista de enteros
            blinds_match = re.search(r'Hold\'em No Limit \((\d+)/(\d+)\)', hand)
            hand_data['blinds'] = [int(blinds_match.group(1)), int(blinds_match.group(2))] if blinds_match else None

            # Posiciones de los jugadores y stacks
            player_positions = {}
            player_matches = re.findall(r'Seat \d+: (\w+) \((\d+) in chips\)', hand)
            for player, stack in player_matches:
                player_positions[player] = int(stack)
            hand_data['player_positions'] = player_positions

            # Hole cards como lista
            hole_cards = {}
            hole_card_matches = re.findall(r'Dealt to (\w+) \[(\w\w \w\w)\]', hand)
            for player, cards in hole_card_matches:
                hole_cards[player] = cards.split()
            hand_data['hole_cards'] = hole_cards

            # Acciones en pre-flop, flop, turn y river
            phases = ['pre_flop', 'flop', 'turn', 'river']
            phase_actions = {phase: [] for phase in phases}

            for phase in phases:
                if phase == 'pre_flop':
                    phase_match = re.search(r'\*\*\* HOLE CARDS \*\*\*(.*?)(\*\*\*|$)', hand, re.DOTALL)
                else:
                    phase_match = re.search(fr'\*\*\* {phase.upper()} \*\*\*(.*?)(\*\*\*|$)', hand, re.DOTALL)

                if phase_match and phase_match.group(1):
                    actions = phase_match.group(1).strip().split('\n')
                    if phase == 'pre_flop':
                        # Filtrar las acciones, eliminando las líneas de cartas repartidas
                        actions = [action for action in actions if not action.startswith('Dealt to')]
                    phase_actions[phase] = [action.strip() for action in actions if action.strip()]

            hand_data.update(phase_actions)

            # Resumen
            summary_match = re.search(r'\*\*\* SUMMARY \*\*\*(.*)', hand, re.DOTALL)
            hand_data['summary'] = summary_match.group(1).strip() if summary_match else None

            parsed_hands.append(hand_data)

        except Exception as e:
            print(f"Error parsing hand: {e}\nHand data:\n{hand[:200]}")  # Imprime parte de la mano para depuración

    return parsed_hands

In [146]:
output_path = "prueba.csv"

def save_to_csv(hands, output_path):
    df = pd.DataFrame(hands)
    df.to_csv(output_path, index=False)


hands = parse_poker_hands(file_path)

In [147]:
save_to_csv(hands, output_path)

print(f"Procesamiento completo. Datos guardados en {output_path}.")

Procesamiento completo. Datos guardados en prueba.csv.


In [148]:
import pandas as pd

manos = pd.read_csv(output_path)
manos.head()

Unnamed: 0,hand_id,timestamp,blinds,player_positions,hole_cards,pre_flop,flop,turn,river,summary
0,30000,2019-07-11 08:20:00,"[50, 100]","{'MrWhite': 10000, 'Gogo': 10000, 'Budd': 1000...","{'MrWhite': ['3c', '9s'], 'Gogo': ['6d', '5s']...","['Budd: folds', 'Eddie: folds', 'Bill: raises ...",[],[],[],Total pot 250 | Rake 0
1,30001,2019-07-11 08:20:01,"[50, 100]","{'Gogo': 10000, 'Budd': 10000, 'Eddie': 10000,...","{'Gogo': ['8s', 'Qc'], 'Budd': ['2s', '8d'], '...","['Eddie: folds', 'Bill: folds', 'Pluribus: fol...",[],[],[],Total pot 200 | Rake 0
2,30002,2019-07-11 08:20:02,"[50, 100]","{'Budd': 10000, 'Eddie': 10000, 'Bill': 10000,...","{'Budd': ['2h', '3c'], 'Eddie': ['Jc', '4c'], ...","['Bill: folds', 'Pluribus: folds', 'MrWhite: f...","['[5c 9s 7c]', 'Eddie: checks', 'Gogo: checks']","['[5c 9s 7c] [3h]', 'Eddie: bets 675', 'Gogo: ...",[],Total pot 450 | Rake 0\nBoard [5c 9s 7c 3h]
3,30003,2019-07-11 08:20:03,"[50, 100]","{'Eddie': 10000, 'Bill': 10000, 'Pluribus': 10...","{'Eddie': ['Jh', 'Kh'], 'Bill': ['Jd', 'Ts'], ...","['Pluribus: folds', 'MrWhite: folds', 'Gogo: r...","['[9s Jc 5d]', 'Gogo: checks', 'Budd: checks']","['[9s Jc 5d] [8h]', 'Gogo: checks', 'Budd: bet...",[],Total pot 4250 | Rake 0\nBoard [9s Jc 5d 8h]
4,30004,2019-07-11 08:20:04,"[50, 100]","{'Bill': 10000, 'Pluribus': 10000, 'MrWhite': ...","{'Bill': ['Tc', '2c'], 'Pluribus': ['3c', '6c'...","['MrWhite: folds', 'Gogo: folds', 'Budd: raise...",[],[],[],Total pot 600 | Rake 0


In [149]:
import os

def process_all_files_in_folder(folder_path, output_path):
    all_hands = []
    
    files = [f for f in os.listdir(folder_path) if f.endswith('.txt')]
    
    for file_name in files:
        file_path = os.path.join(folder_path, file_name)
        hands = parse_poker_hands(file_path)
        all_hands.extend(hands)  

    save_to_csv(all_hands, output_path)


folder_path = "full_datasets" 
output_path = "processed_hands.csv"

process_all_files_in_folder(folder_path, output_path)

In [150]:
processed_hands = pd.read_csv(output_path)
processed_hands.head()

Unnamed: 0,hand_id,timestamp,blinds,player_positions,hole_cards,pre_flop,flop,turn,river,summary
0,87000,2019-07-12 00:10:00,"[50, 100]","{'MrPink': 10000, 'Eddie': 10000, 'MrOrange': ...","{'MrPink': ['Jd', 'Jh'], 'Eddie': ['5c', '7d']...","['MrOrange: folds', 'Bill: folds', 'MrBlue: ra...",[],[],[],Total pot 550 | Rake 0
1,87001,2019-07-12 00:10:01,"[50, 100]","{'Eddie': 10000, 'MrOrange': 10000, 'Bill': 10...","{'Eddie': ['Ts', '9s'], 'MrOrange': ['8s', 'Ah...","['Bill: folds', 'MrBlue: raises 125 to 225', '...","['[6d Qd 6c]', 'Eddie: checks', 'MrBlue: bets ...",[],[],Total pot 550 | Rake 0\nBoard [6d Qd 6c]
2,87002,2019-07-12 00:10:02,"[50, 100]","{'MrOrange': 10000, 'Bill': 10000, 'MrBlue': 1...","{'MrOrange': ['As', 'Kd'], 'Bill': ['Jh', '5h'...","['MrBlue: folds', 'Pluribus: folds', 'MrPink: ...",['[7h Ad 8c]'],['[7h Ad 8c] [Th]'],['[7h Ad 8c] [Th] [Ts]'],Total pot 20100 | Rake 0\nBoard [7h Ad 8c Th T...
3,87003,2019-07-12 00:10:03,"[50, 100]","{'Bill': 10000, 'MrBlue': 10000, 'Pluribus': 1...","{'Bill': ['Ah', 'Ac'], 'MrBlue': ['9h', 'Jh'],...","['Pluribus: folds', 'MrPink: folds', 'Eddie: f...","['[Ts 8c 7d]', 'Bill: checks', 'MrBlue: bets 1...","['[Ts 8c 7d] [Td]', 'Bill: checks', 'MrBlue: b...","['[Ts 8c 7d] [Td] [Ad]', 'Bill: checks', 'MrBl...",Total pot 20000 | Rake 0\nBoard [Ts 8c 7d Td A...
4,87004,2019-07-12 00:10:04,"[50, 100]","{'MrBlue': 10000, 'Pluribus': 10000, 'MrPink':...","{'MrBlue': ['As', '7d'], 'Pluribus': ['9s', 'A...","['MrPink: raises 110 to 210', 'Eddie: folds', ...","['[8c 7h Kh]', 'Pluribus: checks', 'MrPink: be...",[],[],Total pot 470 | Rake 0\nBoard [8c 7h Kh]


In [151]:
processed_hands.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   hand_id           10000 non-null  int64 
 1   timestamp         10000 non-null  object
 2   blinds            10000 non-null  object
 3   player_positions  10000 non-null  object
 4   hole_cards        10000 non-null  object
 5   pre_flop          10000 non-null  object
 6   flop              10000 non-null  object
 7   turn              10000 non-null  object
 8   river             10000 non-null  object
 9   summary           10000 non-null  object
dtypes: int64(1), object(9)
memory usage: 781.4+ KB


In [152]:
data = pd.read_csv("processed_hands.csv")
data.head()

Unnamed: 0,hand_id,timestamp,blinds,player_positions,hole_cards,pre_flop,flop,turn,river,summary
0,87000,2019-07-12 00:10:00,"[50, 100]","{'MrPink': 10000, 'Eddie': 10000, 'MrOrange': ...","{'MrPink': ['Jd', 'Jh'], 'Eddie': ['5c', '7d']...","['MrOrange: folds', 'Bill: folds', 'MrBlue: ra...",[],[],[],Total pot 550 | Rake 0
1,87001,2019-07-12 00:10:01,"[50, 100]","{'Eddie': 10000, 'MrOrange': 10000, 'Bill': 10...","{'Eddie': ['Ts', '9s'], 'MrOrange': ['8s', 'Ah...","['Bill: folds', 'MrBlue: raises 125 to 225', '...","['[6d Qd 6c]', 'Eddie: checks', 'MrBlue: bets ...",[],[],Total pot 550 | Rake 0\nBoard [6d Qd 6c]
2,87002,2019-07-12 00:10:02,"[50, 100]","{'MrOrange': 10000, 'Bill': 10000, 'MrBlue': 1...","{'MrOrange': ['As', 'Kd'], 'Bill': ['Jh', '5h'...","['MrBlue: folds', 'Pluribus: folds', 'MrPink: ...",['[7h Ad 8c]'],['[7h Ad 8c] [Th]'],['[7h Ad 8c] [Th] [Ts]'],Total pot 20100 | Rake 0\nBoard [7h Ad 8c Th T...
3,87003,2019-07-12 00:10:03,"[50, 100]","{'Bill': 10000, 'MrBlue': 10000, 'Pluribus': 1...","{'Bill': ['Ah', 'Ac'], 'MrBlue': ['9h', 'Jh'],...","['Pluribus: folds', 'MrPink: folds', 'Eddie: f...","['[Ts 8c 7d]', 'Bill: checks', 'MrBlue: bets 1...","['[Ts 8c 7d] [Td]', 'Bill: checks', 'MrBlue: b...","['[Ts 8c 7d] [Td] [Ad]', 'Bill: checks', 'MrBl...",Total pot 20000 | Rake 0\nBoard [Ts 8c 7d Td A...
4,87004,2019-07-12 00:10:04,"[50, 100]","{'MrBlue': 10000, 'Pluribus': 10000, 'MrPink':...","{'MrBlue': ['As', '7d'], 'Pluribus': ['9s', 'A...","['MrPink: raises 110 to 210', 'Eddie: folds', ...","['[8c 7h Kh]', 'Pluribus: checks', 'MrPink: be...",[],[],Total pot 470 | Rake 0\nBoard [8c 7h Kh]


In [153]:
prueba = data.loc[2, :]
print(prueba)
print()
print(prueba['player_positions'])
print(prueba['hole_cards'])
print(prueba['pre_flop'])
print(prueba['flop'])
print(prueba['turn'])
print(prueba['river'])

hand_id                                                         87002
timestamp                                         2019-07-12 00:10:02
blinds                                                      [50, 100]
player_positions    {'MrOrange': 10000, 'Bill': 10000, 'MrBlue': 1...
hole_cards          {'MrOrange': ['As', 'Kd'], 'Bill': ['Jh', '5h'...
pre_flop            ['MrBlue: folds', 'Pluribus: folds', 'MrPink: ...
flop                                                   ['[7h Ad 8c]']
turn                                              ['[7h Ad 8c] [Th]']
river                                        ['[7h Ad 8c] [Th] [Ts]']
summary             Total pot 20100 | Rake 0\nBoard [7h Ad 8c Th T...
Name: 2, dtype: object

{'MrOrange': 10000, 'Bill': 10000, 'MrBlue': 10000, 'Pluribus': 10000, 'MrPink': 10000, 'Eddie': 10000}
{'MrOrange': ['As', 'Kd'], 'Bill': ['Jh', '5h'], 'MrBlue': ['6c', '5d'], 'Pluribus': ['6s', '5s'], 'MrPink': ['Kh', 'Ac'], 'Eddie': ['6d', '4s']}
['MrBlue: folds', 'Pluribu

In [154]:
import ast

# Filtrar partidas donde solo queden dos jugadores desde el flop
def is_heads_up_flop(pre_flop_actions):
    try:
        actions = ast.literal_eval(pre_flop_actions)
        active_players = set()
        for action in actions:
            if ":" in action:
                player = action.split(":")[0].strip()
                if "folds" not in action:
                    active_players.add(player)
        return len(active_players) == 2
    except Exception:
        return False

def filter_heads_up_from_flop(df):
    return df[df['pre_flop'].apply(is_heads_up_flop)]

In [155]:
import ast

def preprocess_data(df):
    processed_data = []

    for _, row in df.iterrows():
        hand_data = {}

        # Blinds
        hand_data['blinds'] = ast.literal_eval(row['blinds'])
        # Stacks iniciales
        hand_data['stacks'] = [10000, 10000] # No cambia 

        # Historial de apuestas
        phases = [row['pre_flop'], row['flop'], row['turn'], row['river']]
        for i, phase in enumerate(['pre_flop', 'flop', 'turn', 'river']):
            actions = ast.literal_eval(phases[i]) if phases[i] else []
            if phase == 'pre_flop':
                actions = [action for action in actions if "folds" not in action]
            hand_data[f'{phase}_actions'] = actions

        processed_data.append(hand_data)

    return pd.DataFrame(processed_data)

heads_up_data = filter_heads_up_from_flop(data)
processed_data = preprocess_data(heads_up_data)
processed_data


Unnamed: 0,blinds,stacks,pre_flop_actions,flop_actions,turn_actions,river_actions
0,"[50, 100]","[10000, 10000]","[MrBlue: raises 125 to 225, MrPink: raises 825...",[],[],[]
1,"[50, 100]","[10000, 10000]","[MrBlue: raises 125 to 225, Eddie: calls 175]","[[6d Qd 6c], Eddie: checks, MrBlue: bets 275, ...",[],[]
2,"[50, 100]","[10000, 10000]","[MrPink: raises 110 to 210, MrOrange: raises 8...",[[7h Ad 8c]],[[7h Ad 8c] [Th]],[[7h Ad 8c] [Th] [Ts]]
3,"[50, 100]","[10000, 10000]","[Bill: calls 50, MrBlue: raises 250 to 350, Bi...","[[Ts 8c 7d], Bill: checks, MrBlue: bets 1150, ...","[[Ts 8c 7d] [Td], Bill: checks, MrBlue: bets 1...","[[Ts 8c 7d] [Td] [Ad], Bill: checks, MrBlue: b..."
4,"[50, 100]","[10000, 10000]","[MrPink: raises 110 to 210, Pluribus: calls 110]","[[8c 7h Kh], Pluribus: checks, MrPink: bets 47...",[],[]
...,...,...,...,...,...,...
5634,"[50, 100]","[10000, 10000]","[Hattori: raises 125 to 225, MrBlue: calls 175]","[[6c 2d 8s], MrBlue: checks, Hattori: bets 459...",[],[]
5635,"[50, 100]","[10000, 10000]","[Pluribus: raises 175 to 275, Budd: raises 550...","[[8d 6d 5c], Pluribus: checks, Budd: bets 575,...","[[8d 6d 5c] [Kd], Pluribus: checks, Budd: checks]","[[8d 6d 5c] [Kd] [5h], Pluribus: checks, Budd:..."
5636,"[50, 100]","[10000, 10000]","[MrBlue: raises 125 to 225, Pluribus: raises 5...","[[Ts 8h 9h], MrBlue: checks, Pluribus: bets 41...","[[Ts 8h 9h] [8c], MrBlue: checks, Pluribus: ch...","[[Ts 8h 9h] [8c] [Tc], MrBlue: checks, Pluribu..."
5637,"[50, 100]","[10000, 10000]","[Hattori: calls 50, MrBlue: raises 250 to 350,...","[[4c Js 7s], Hattori: checks, MrBlue: bets 400...",[],[]


In [156]:
processed_data.loc[5635, 'pre_flop_actions']

['Pluribus: raises 175 to 275',
 'Budd: raises 550 to 825',
 'Pluribus: calls 550']

In [157]:
processed_data.loc[5635, 'river_actions']

['[8d 6d 5c] [Kd] [5h]', 'Pluribus: checks', 'Budd: checks']

In [158]:
processed_data.loc[0, 'pre_flop_actions']

['MrBlue: raises 125 to 225',
 'MrPink: raises 825 to 1050',
 'Uncalled bet (825) returned to MrPink',
 'MrPink collected 550.0 from pot']

In [159]:
import re
import json

def process_bets(actions):
    bets = {}
    current_bet = 0  # La apuesta actual (más alta) en la ronda
    
    for action in actions:
        # Ignorar acciones que son cartas o resultados del pot
        if '[' in action or 'collected' in action or 'returned' in action:
            continue
            
        if ":" in action:
            player = action.split(":")[0].strip()
            action_details = action.split(":")[1].strip()
            
            # Procesar raises
            if 'raises' in action_details:
                match = re.search(r'raises (\d+) to (\d+)', action_details)
                if match:
                    raise_amount = int(match.group(1))
                    final_amount = int(match.group(2))
                    current_bet = final_amount  # Actualizamos la apuesta actual a la nueva apuesta
                    bets[player] = current_bet
            
            # Procesar calls
            elif 'calls' in action_details:
                match = re.search(r'calls (\d+)', action_details)
                if match:
                    call_amount = int(match.group(1))
                    bets[player] = current_bet  # El jugador iguala la apuesta actual (raise o bet)
            
            # Procesar bets
            elif 'bets' in action_details:
                match = re.search(r'bets (\d+)', action_details)
                if match:
                    bet_amount = int(match.group(1))
                    current_bet = bet_amount  # Actualizamos la apuesta actual a la nueva apuesta
                    bets[player] = current_bet
            
            # Procesar checks o folds
            elif 'checks' in action_details or 'folds' in action_details:
                if player not in bets:
                    bets[player] = 0
    
    return bets

In [160]:
def second_preprocessing_data(df):
    """
    Procesa el DataFrame para extraer las apuestas por jugador en cada fase del juego (pre-flop, flop, turn, river).
    """
    processed_data = []
    
    for _, row in df.iterrows():
        hand_data = {}
        
        # Blinds y stack
        hand_data['blinds'] = row['blinds']
        hand_data['stacks'] = row['stacks']
      
        # Historial de apuestas
        phases = [row['pre_flop_actions'], row['flop_actions'], row['turn_actions'], row['river_actions']]
        
        for i, phase in enumerate(['pre_flop', 'flop', 'turn', 'river']):
            # Asegurarnos de que la fase tenga una lista de acciones
            if phases[i]:
                actions = json.loads(json.dumps(phases[i]))  # Convertir la lista a cadena JSON y luego cargarla
            else:
                actions = []
                
            hand_data[f'{phase}_bets'] = process_bets(actions)
        
        processed_data.append(hand_data)
    
    return pd.DataFrame(processed_data)


second_processed_data = second_preprocessing_data(processed_data)
second_processed_data

Unnamed: 0,blinds,stacks,pre_flop_bets,flop_bets,turn_bets,river_bets
0,"[50, 100]","[10000, 10000]","{'MrBlue': 225, 'MrPink': 1050}",{},{},{}
1,"[50, 100]","[10000, 10000]","{'MrBlue': 225, 'Eddie': 225}","{'Eddie': 0, 'MrBlue': 275}",{},{}
2,"[50, 100]","[10000, 10000]","{'MrPink': 10000, 'MrOrange': 10000}",{},{},{}
3,"[50, 100]","[10000, 10000]","{'Bill': 1400, 'MrBlue': 1400}","{'Bill': 1150, 'MrBlue': 1150}","{'Bill': 1950, 'MrBlue': 1950}","{'Bill': 5500, 'MrBlue': 5500}"
4,"[50, 100]","[10000, 10000]","{'MrPink': 210, 'Pluribus': 210}","{'Pluribus': 0, 'MrPink': 470}",{},{}
...,...,...,...,...,...,...
5634,"[50, 100]","[10000, 10000]","{'Hattori': 225, 'MrBlue': 225}","{'MrBlue': 0, 'Hattori': 459}",{},{}
5635,"[50, 100]","[10000, 10000]","{'Pluribus': 825, 'Budd': 825}","{'Pluribus': 575, 'Budd': 575}","{'Pluribus': 0, 'Budd': 0}","{'Pluribus': 0, 'Budd': 0}"
5636,"[50, 100]","[10000, 10000]","{'MrBlue': 750, 'Pluribus': 750}","{'MrBlue': 412, 'Pluribus': 412}","{'MrBlue': 0, 'Pluribus': 0}","{'MrBlue': 0, 'Pluribus': 2474}"
5637,"[50, 100]","[10000, 10000]","{'Hattori': 350, 'MrBlue': 350}","{'Hattori': 1500, 'MrBlue': 400}",{},{}


In [161]:
second_processed_data.to_csv("processed_data.csv", index=False)

In [162]:
processed_data = pd.read_csv("processed_data.csv")
processed_data

Unnamed: 0,blinds,stacks,pre_flop_bets,flop_bets,turn_bets,river_bets
0,"[50, 100]","[10000, 10000]","{'MrBlue': 225, 'MrPink': 1050}",{},{},{}
1,"[50, 100]","[10000, 10000]","{'MrBlue': 225, 'Eddie': 225}","{'Eddie': 0, 'MrBlue': 275}",{},{}
2,"[50, 100]","[10000, 10000]","{'MrPink': 10000, 'MrOrange': 10000}",{},{},{}
3,"[50, 100]","[10000, 10000]","{'Bill': 1400, 'MrBlue': 1400}","{'Bill': 1150, 'MrBlue': 1150}","{'Bill': 1950, 'MrBlue': 1950}","{'Bill': 5500, 'MrBlue': 5500}"
4,"[50, 100]","[10000, 10000]","{'MrPink': 210, 'Pluribus': 210}","{'Pluribus': 0, 'MrPink': 470}",{},{}
...,...,...,...,...,...,...
5634,"[50, 100]","[10000, 10000]","{'Hattori': 225, 'MrBlue': 225}","{'MrBlue': 0, 'Hattori': 459}",{},{}
5635,"[50, 100]","[10000, 10000]","{'Pluribus': 825, 'Budd': 825}","{'Pluribus': 575, 'Budd': 575}","{'Pluribus': 0, 'Budd': 0}","{'Pluribus': 0, 'Budd': 0}"
5636,"[50, 100]","[10000, 10000]","{'MrBlue': 750, 'Pluribus': 750}","{'MrBlue': 412, 'Pluribus': 412}","{'MrBlue': 0, 'Pluribus': 0}","{'MrBlue': 0, 'Pluribus': 2474}"
5637,"[50, 100]","[10000, 10000]","{'Hattori': 350, 'MrBlue': 350}","{'Hattori': 1500, 'MrBlue': 400}",{},{}


In [221]:
import pandas as pd
import ast

def clean_data(data):
    # Eliminamos las columnas 'blinds' y 'stacks'
    data = data.drop(['blinds', 'stacks'], axis=1)
    data = data.dropna()  # Eliminamos las filas con valores NaN
    stack = 10000  # Suponemos que el stack inicial es 10000
    final = []  # Lista para almacenar los resultados finales
    
    # Iteramos sobre cada fila del dataset
    for i in range(data.shape[0]):
        datos = data.loc[i, :]
        jugadas = {}  # Diccionario para almacenar las jugadas por jugador
        fracciones = {}  # Diccionario para almacenar las fracciones acumuladas por jugador
       
        # Iteramos sobre las fases del juego (pre_flop, flop, turn, river)
        for fase in ['pre_flop_bets', 'flop_bets', 'turn_bets', 'river_bets']:
            if len(datos[fase]) != 0:
                # Si el valor en esta fase es una cadena (string), lo convertimos a diccionario
                if isinstance(datos[fase], str):
                    try:
                        apuestas = ast.literal_eval(datos[fase])  # Convertimos de cadena a diccionario
                    except:
                        apuestas = {}
                else:
                    apuestas = datos[fase]  # Ya es un diccionario, lo usamos directamente
                
                # Si hay apuestas en esta fase
                for jugador, apuesta in apuestas.items():
                    apostado = round(apuesta / stack, 4)  # Calculamos la fracción apostada
                    fracciones.setdefault(jugador, 0)  # Inicializamos la fracción acumulada si no existe
                    fracciones[jugador] += apostado  # Actualizamos la fracción acumulada
                    jugadas.setdefault(jugador, [])  # Inicializamos la lista de jugadas si no existe
                    jugadas[jugador].append([apostado, fracciones[jugador]])  # Añadimos la jugada
            else:
                # Si no hay apuestas en esta fase, añadimos 0 a la jugada
                for jugador in jugadas:
                    jugadas[jugador].append([0, fracciones[jugador]])

        # Añadimos las jugadas de cada jugador al resultado final
        for jugador in jugadas:
            datos_jugador = jugadas[jugador]
            # Verificamos si el jugador apostó en cada fase y usamos la fracción acumulada de las fases previas
            preflop = datos_jugador[0] if len(datos_jugador) > 0 else [0, 0]
            flop = datos_jugador[1] if len(datos_jugador) > 1 else [0, fracciones[jugador]]
            turn = datos_jugador[2] if len(datos_jugador) > 2 else [0, fracciones[jugador]]
            river = datos_jugador[3] if len(datos_jugador) > 3 else [0, fracciones[jugador]]
            total = fracciones[jugador]  # Total es la fracción acumulada

            final.append({
                'preflop': preflop,
                'flop': flop,
                'turn': turn,
                'river': river,
                'total': total
            })
        
    # Convertimos la lista de diccionarios en un DataFrame
    return pd.DataFrame(final)


# Llamamos a la función para limpiar los datos
cleaned_data = clean_data(processed_data)
cleaned_data


Unnamed: 0,preflop,flop,turn,river,total
0,"[0.0225, 0.0225]","[0, 0.0225]","[0, 0.0225]","[0, 0.0225]",0.0225
1,"[0.105, 0.105]","[0, 0.105]","[0, 0.105]","[0, 0.105]",0.1050
2,"[0.0225, 0.0225]","[0.0275, 0.05]","[0, 0.05]","[0, 0.05]",0.0500
3,"[0.0225, 0.0225]","[0.0, 0.0225]","[0, 0.0225]","[0, 0.0225]",0.0225
4,"[1.0, 1.0]","[0, 1.0]","[0, 1.0]","[0, 1.0]",1.0000
...,...,...,...,...,...
11273,"[0.075, 0.075]","[0.0412, 0.1162]","[0.0, 0.1162]","[0.2474, 0.36360000000000003]",0.3636
11274,"[0.035, 0.035]","[0.15, 0.185]","[0, 0.185]","[0, 0.185]",0.1850
11275,"[0.035, 0.035]","[0.04, 0.07500000000000001]","[0, 0.07500000000000001]","[0, 0.07500000000000001]",0.0750
11276,"[0.022, 0.022]","[0.0255, 0.0475]","[0, 0.0475]","[0, 0.0475]",0.0475


In [222]:
# Convert lists to tuples to make them hashable
cleaned_data = cleaned_data.map(lambda x: tuple(x) if isinstance(x, list) else x)
cleaned_data.drop_duplicates(inplace=True)
cleaned_data

Unnamed: 0,preflop,flop,turn,river,total
0,"(0.0225, 0.0225)","(0, 0.0225)","(0, 0.0225)","(0, 0.0225)",0.0225
1,"(0.105, 0.105)","(0, 0.105)","(0, 0.105)","(0, 0.105)",0.1050
2,"(0.0225, 0.0225)","(0.0275, 0.05)","(0, 0.05)","(0, 0.05)",0.0500
4,"(1.0, 1.0)","(0, 1.0)","(0, 1.0)","(0, 1.0)",1.0000
6,"(0.14, 0.14)","(0.115, 0.255)","(0.195, 0.45)","(0.55, 1.0)",1.0000
...,...,...,...,...,...
11268,"(0.0225, 0.0225)","(0.0459, 0.0684)","(0, 0.0684)","(0, 0.0684)",0.0684
11270,"(0.0825, 0.0825)","(0.0575, 0.14)","(0.0, 0.14)","(0.0, 0.14)",0.1400
11273,"(0.075, 0.075)","(0.0412, 0.1162)","(0.0, 0.1162)","(0.2474, 0.36360000000000003)",0.3636
11274,"(0.035, 0.035)","(0.15, 0.185)","(0, 0.185)","(0, 0.185)",0.1850


In [223]:
cleaned_data.to_csv("cleaned_data.csv", index=False)

En cleaned_data, se han hecho 4 columnas que corresponden a las 4 fases de juego más una llamada total. 

Cada una de las líneas de este dataset representa como ha jugado la mano un jugador en cuanto apuestas. En las columnas, podemos encontrar la fracción del dinero inicial que el jugador ha apostado en esa fase, y la fracción total de dinero que lleva apostado en esa mano (respecto a su dinero inicial que siempre es 10000). Como el stack inicial y los blinds no cambian, hay muchas manos que se repiten, por lo que jugadas únicas solo tenemos 3000.

En total, podemos ver la fracción total de dinero que ha apostado el jugador en esa mano (coincide con el segundo valor que aparece en el river).
Es importante recalcar que solo han sido usadas aquellas manos en las que, tras el preflop, solo han quedado dos oponentes. 