In [46]:
import os
import re
import random

def separate_games(file_content):
    games = file_content.split('Game started')
    return ['Game started' + game for game in games if game.strip()]

def remove_game_started_at(game):
    return re.sub(r'Game started at: .*', 'Game started', game)

def remove_muck(game):
    game_lines = game.split('\n')
    return '\n'.join([line for line in game_lines if 'received a card' not in line])

def check_IlxxxlI_result(game):
    for line in game.split('\n'):
        if line.startswith('Player IlxxxlI'):
            if 'Wins:' in line : # Si IlxxxlI n'a rien fait c'est Wins:0
                return 'win'
            elif 'Loses:' in line and float(line[:-1].split('Loses:')[1]) >= 0:
                return 'lose'
      

def remove_received_card_and_muck_lines(game):
    game_lines = game.split('\n')
    return '\n'.join([line for line in game_lines if 'received a card' not in line and 'mucks' not in line and 'timed' not in line])

def remove_summary(game):
    lines = game.split('\n')
    summary_start = None
    for i, line in enumerate(lines):
        if line.strip() == '------ Summary ------':
            summary_start = i
            break
    
    if summary_start is not None:
        return '\n'.join(lines[:summary_start])

def process_file(input_file, output_file):
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        file_content = infile.read()
        games = separate_games(file_content)
        
        for game in games:
            game = remove_game_started_at(game)
            game = remove_received_card_and_muck_lines(game)
            
            result = check_IlxxxlI_result(game)
            
            if result == 'win' or (result == 'lose' and random.random() < 0.5):
                game = remove_summary(game)
                outfile.write(game + '\n\n')
            

def process_directory(input_dir, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for filename in os.listdir(input_dir):
        if filename.endswith('.txt'):
            input_path = os.path.join(input_dir, filename)
            output_path = os.path.join(output_dir, filename)
            process_file(input_path, output_path)
            print(f"Processed {filename}")

input_directory = "archive"
output_directory = "archive_treated"
process_directory(input_directory, output_directory)

Processed Export Holdem Manager 2.0 12292016131233.txt
Processed Export Holdem Manager 2.0 12302016144830.txt


In [26]:
import pandas as pd
import re

def process_game_for_training(game):
    lines = game.split('\n')
    data = []
    context = []

    for line in lines:
        line = line.strip()  
        if not line:  # Skip lines vide
            continue
        
        if line.startswith('Player IlxxxlI'):
            if ('received card:' in line) or ('has small blind' in line) or ('has big blind' in line):
                context.append(line)
            else:
                action = line.split('Player IlxxxlI ')[1]
                if context:  # Only add data point if there's context
                    data.append({'instruction': '\n'.join(context), 'output': action})
                context.append(line)  # Add this action to context for future actions
        else:
            context.append(line)

    return data

def create_training_dataset(file_path):
    with open(file_path, 'r') as file:
        content = file.read()

    games = re.split(r'Game started\n', content)
    all_data = []

    for game in games:
        if game.strip():  # Ignore empty games
            all_data.extend(process_game_for_training(game))

    df = pd.DataFrame(all_data)
    return df


In [27]:
file_path1 = '/home/avakili/Mistralbluff/archive_treated/Export Holdem Manager 2.0 12292016131233.txt'
file_path2 = '/home/avakili/Mistralbluff/archive_treated/Export Holdem Manager 2.0 12302016144830.txt'

# Create datasets from both files
training_df1 = create_training_dataset(file_path1)
training_df2 = create_training_dataset(file_path2)

# Merge the two datasets
merged_df = pd.concat([training_df1, training_df2], ignore_index=True)
merged_df.instruction = merged_df.instruction + "\nPlayer IlxxxlI"
merged_df['input']=""
merged_df = merged_df[['instruction', 'input', 'output']]

In [39]:
import numpy as np
merged_df.head()
folds_row = merged_df[merged_df['output']=='folds']
print('proportion de folds:', len(folds_row)/len(merged_df))
nb_remove= len(folds_row)//5
indicies_to_remove=np.random.choice(folds_row.index, size=nb_remove, replace=False)
new_df = merged_df.drop(indicies_to_remove) #dataframe avec nombre de fold réduit
folds_row = new_df[new_df['output']=='folds']
print('proportion de folds nouveaux dataset:', len(folds_row)/len(merged_df))

proportion de folds: 0.8786018360313921
proportion de folds: 0.7029046364136573


In [10]:
set([re.sub(r'\d','',action) for action in list(merged_df.output.value_counts().index)])
#Compte le nbr d'action unique

{'allin ()',
 'allin (.)',
 'bets ()',
 'bets (.)',
 'calls ()',
 'calls (.)',
 'caps ()',
 'caps (.)',
 'checks',
 'folds',
 'raises ()',
 'raises (.)'}

In [40]:
merged_df.head()

Unnamed: 0,instruction,input,output
0,Game ID: 787027613 0.50/1 (PRR) Karkadann (Hol...,,folds
1,Game ID: 787027929 0.50/1 (PRR) Karkadann (Hol...,,folds
2,Game ID: 787027464 0.50/1 (PRR) Karkadann (Hol...,,folds
3,Game ID: 787027410 0.50/1 (PRR) Kraken - 10 (H...,,folds
4,Game ID: 787027157 0.50/1 (PRR) Kraken - 10 (H...,,folds


In [41]:
from sklearn.model_selection import train_test_split
import pandas as pd

train, test = train_test_split(merged_df, test_size=0.1, random_state=0)
train.to_json('data/anatole_data_train.json', orient='records')
test.to_json('data/anatole_data_test.json', orient='records')

In [47]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('mistralai/Mistral-7b-v0.1')
with open('/home/avakili/Mistralbluff/data/anatole_data_train.json', 'r') as file:
    text = file.read()
tokens = tokenizer.tokenize(text)


