In [1]:
import re
import pandas as pd
from datetime import datetime

In [12]:
file_path = "full_datasets/pluribus_30.txt"


def parse_poker_hands(file_path):
    with open(file_path, 'r') as file:
        data = file.read()

    hands = data.split('PokerStars Hand #')
    parsed_hands = []

    for hand in hands[1:]:  # Saltar el primer split vacío
        try:
            hand_data = {}

            # Hand ID
            hand_id_match = re.search(r'^(\d+):', hand)
            hand_data['hand_id'] = int(hand_id_match.group(1)) if hand_id_match else None

            # Timestamp
            timestamp_match = re.search(r'- (\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}) ET', hand)
            hand_data['timestamp'] = datetime.strptime(timestamp_match.group(1), '%Y/%m/%d %H:%M:%S') if timestamp_match else None

            # Blinds como lista de enteros
            blinds_match = re.search(r'Hold\'em No Limit \((\d+)/(\d+)\)', hand)
            hand_data['blinds'] = [int(blinds_match.group(1)), int(blinds_match.group(2))] if blinds_match else None

            # Posiciones de los jugadores y stacks
            player_positions = {}
            player_matches = re.findall(r'Seat \d+: (\w+) \((\d+) in chips\)', hand)
            for player, stack in player_matches:
                player_positions[player] = int(stack)
            hand_data['player_positions'] = player_positions

            # Hole cards como lista
            hole_cards = {}
            hole_card_matches = re.findall(r'Dealt to (\w+) \[(\w\w \w\w)\]', hand)
            for player, cards in hole_card_matches:
                hole_cards[player] = cards.split()
            hand_data['hole_cards'] = hole_cards

            # Acciones en pre-flop, flop, turn y river
            phases = ['pre_flop', 'flop', 'turn', 'river']
            phase_actions = {phase: [] for phase in phases}

            for phase in phases:
                if phase == 'pre_flop':
                    phase_match = re.search(r'\*\*\* HOLE CARDS \*\*\*(.*?)(\*\*\*|$)', hand, re.DOTALL)
                else:
                    phase_match = re.search(fr'\*\*\* {phase.upper()} \*\*\*(.*?)(\*\*\*|$)', hand, re.DOTALL)

                if phase_match and phase_match.group(1):
                    actions = phase_match.group(1).strip().split('\n')
                    if phase == 'pre_flop':
                        # Filtrar las acciones, eliminando las líneas de cartas repartidas
                        actions = [action for action in actions if not action.startswith('Dealt to')]
                    phase_actions[phase] = [action.strip() for action in actions if action.strip()]

            hand_data.update(phase_actions)

            # Resumen
            summary_match = re.search(r'\*\*\* SUMMARY \*\*\*(.*)', hand, re.DOTALL)
            hand_data['summary'] = summary_match.group(1).strip() if summary_match else None

            parsed_hands.append(hand_data)

        except Exception as e:
            print(f"Error parsing hand: {e}\nHand data:\n{hand[:200]}")  # Imprime parte de la mano para depuración

    return parsed_hands

In [13]:
output_path = "prueba.csv"

def save_to_csv(hands, output_path):
    df = pd.DataFrame(hands)
    df.to_csv(output_path, index=False)


hands = parse_poker_hands(file_path)

In [14]:
save_to_csv(hands, output_path)

print(f"Procesamiento completo. Datos guardados en {output_path}.")

Procesamiento completo. Datos guardados en prueba.csv.


In [None]:
import pandas as pd

manos = pd.read_csv(output_path)
manos.head()

Unnamed: 0,hand_id,timestamp,blinds,player_positions,hole_cards,pre_flop,flop,turn,river,summary
0,30000,2019-07-11 08:20:00,"[50, 100]","{'MrWhite': 10000, 'Gogo': 10000, 'Budd': 1000...","{'MrWhite': ['3c', '9s'], 'Gogo': ['6d', '5s']...","['Budd: folds', 'Eddie: folds', 'Bill: raises ...",[],[],[],Total pot 250 | Rake 0
1,30001,2019-07-11 08:20:01,"[50, 100]","{'Gogo': 10000, 'Budd': 10000, 'Eddie': 10000,...","{'Gogo': ['8s', 'Qc'], 'Budd': ['2s', '8d'], '...","['Eddie: folds', 'Bill: folds', 'Pluribus: fol...",[],[],[],Total pot 200 | Rake 0
2,30002,2019-07-11 08:20:02,"[50, 100]","{'Budd': 10000, 'Eddie': 10000, 'Bill': 10000,...","{'Budd': ['2h', '3c'], 'Eddie': ['Jc', '4c'], ...","['Bill: folds', 'Pluribus: folds', 'MrWhite: f...","['[5c 9s 7c]', 'Eddie: checks', 'Gogo: checks']","['[5c 9s 7c] [3h]', 'Eddie: bets 675', 'Gogo: ...",[],Total pot 450 | Rake 0\nBoard [5c 9s 7c 3h]
3,30003,2019-07-11 08:20:03,"[50, 100]","{'Eddie': 10000, 'Bill': 10000, 'Pluribus': 10...","{'Eddie': ['Jh', 'Kh'], 'Bill': ['Jd', 'Ts'], ...","['Pluribus: folds', 'MrWhite: folds', 'Gogo: r...","['[9s Jc 5d]', 'Gogo: checks', 'Budd: checks']","['[9s Jc 5d] [8h]', 'Gogo: checks', 'Budd: bet...",[],Total pot 4250 | Rake 0\nBoard [9s Jc 5d 8h]
4,30004,2019-07-11 08:20:04,"[50, 100]","{'Bill': 10000, 'Pluribus': 10000, 'MrWhite': ...","{'Bill': ['Tc', '2c'], 'Pluribus': ['3c', '6c'...","['MrWhite: folds', 'Gogo: folds', 'Budd: raise...",[],[],[],Total pot 600 | Rake 0


In [16]:
import os

def process_all_files_in_folder(folder_path, output_path):
    all_hands = []
    
    files = [f for f in os.listdir(folder_path) if f.endswith('.txt')]
    
    for file_name in files:
        file_path = os.path.join(folder_path, file_name)
        hands = parse_poker_hands(file_path)
        all_hands.extend(hands)  

    save_to_csv(all_hands, output_path)


folder_path = "full_datasets" 
output_path = "processed_hands.csv"

process_all_files_in_folder(folder_path, output_path)

In [17]:
processed_hands = pd.read_csv(output_path)
processed_hands.head()

Unnamed: 0,hand_id,timestamp,blinds,player_positions,hole_cards,pre_flop,flop,turn,river,summary
0,87000,2019-07-12 00:10:00,"[50, 100]","{'MrPink': 10000, 'Eddie': 10000, 'MrOrange': ...","{'MrPink': ['Jd', 'Jh'], 'Eddie': ['5c', '7d']...","['MrOrange: folds', 'Bill: folds', 'MrBlue: ra...",[],[],[],Total pot 550 | Rake 0
1,87001,2019-07-12 00:10:01,"[50, 100]","{'Eddie': 10000, 'MrOrange': 10000, 'Bill': 10...","{'Eddie': ['Ts', '9s'], 'MrOrange': ['8s', 'Ah...","['Bill: folds', 'MrBlue: raises 125 to 225', '...","['[6d Qd 6c]', 'Eddie: checks', 'MrBlue: bets ...",[],[],Total pot 550 | Rake 0\nBoard [6d Qd 6c]
2,87002,2019-07-12 00:10:02,"[50, 100]","{'MrOrange': 10000, 'Bill': 10000, 'MrBlue': 1...","{'MrOrange': ['As', 'Kd'], 'Bill': ['Jh', '5h'...","['MrBlue: folds', 'Pluribus: folds', 'MrPink: ...",['[7h Ad 8c]'],['[7h Ad 8c] [Th]'],['[7h Ad 8c] [Th] [Ts]'],Total pot 20100 | Rake 0\nBoard [7h Ad 8c Th T...
3,87003,2019-07-12 00:10:03,"[50, 100]","{'Bill': 10000, 'MrBlue': 10000, 'Pluribus': 1...","{'Bill': ['Ah', 'Ac'], 'MrBlue': ['9h', 'Jh'],...","['Pluribus: folds', 'MrPink: folds', 'Eddie: f...","['[Ts 8c 7d]', 'Bill: checks', 'MrBlue: bets 1...","['[Ts 8c 7d] [Td]', 'Bill: checks', 'MrBlue: b...","['[Ts 8c 7d] [Td] [Ad]', 'Bill: checks', 'MrBl...",Total pot 20000 | Rake 0\nBoard [Ts 8c 7d Td A...
4,87004,2019-07-12 00:10:04,"[50, 100]","{'MrBlue': 10000, 'Pluribus': 10000, 'MrPink':...","{'MrBlue': ['As', '7d'], 'Pluribus': ['9s', 'A...","['MrPink: raises 110 to 210', 'Eddie: folds', ...","['[8c 7h Kh]', 'Pluribus: checks', 'MrPink: be...",[],[],Total pot 470 | Rake 0\nBoard [8c 7h Kh]


In [18]:
processed_hands.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   hand_id           10000 non-null  int64 
 1   timestamp         10000 non-null  object
 2   blinds            10000 non-null  object
 3   player_positions  10000 non-null  object
 4   hole_cards        10000 non-null  object
 5   pre_flop          10000 non-null  object
 6   flop              10000 non-null  object
 7   turn              10000 non-null  object
 8   river             10000 non-null  object
 9   summary           10000 non-null  object
dtypes: int64(1), object(9)
memory usage: 781.4+ KB
