In [56]:
# Import necessary libraries
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import seaborn as sns

In [6]:
# Data Loading
train_files = ['tournament_1.json', 'tournament_10.json', 'tournament_100.json' , 'tournament_101.json' , 'tournament_102.json']  # list of train JSON files
#test_files = ['test_file1.json', 'test_file2.json', ...]  # list of test JSON files

train_data = []
#test_data = []

for file in train_files:
    with open(file, 'rb') as f:
        train_data.extend(json.load(f))

#for file in test_files:
    #with open(file, 'r') as f:
        #test_data.extend(json.load(f))

train_df = pd.DataFrame(train_data)
#test_df = pd.DataFrame(test_data)

In [10]:
train_df

Unnamed: 0,0
0,name
1,start_date
2,end_date
3,games
4,tours
5,time_control
6,name
7,start_date
8,end_date
9,games


In [8]:
# Combine data into a single dataset
full_df = pd.concat([train_df])
# for test use test_df

In [11]:
# Exploratory Data Analysis (EDA)
print(full_df.head())  # inspect the dataset structure
print(full_df.info())  # check for missing values
print(full_df.describe())  # summary statistics

            0
0        name
1  start_date
2    end_date
3       games
4       tours
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       30 non-null     object
dtypes: object(1)
memory usage: 372.0+ bytes
None
           0
count     30
unique     6
top     name
freq       5


In [13]:
print(full_df.columns)

RangeIndex(start=0, stop=1, step=1)


In [15]:
if 'ELO_score' in full_df.columns:
    sns.histplot(full_df['ELO_score'])
    plt.show()
else:
    print("The column 'ELO_score' does not exist in the DataFrame.")

The column 'ELO_score' does not exist in the DataFrame.


In [19]:
if 'game_duration' in full_df.columns and 'move_sequences' in full_df.columns:
    sns.scatterplot(x='game_duration', y='move_sequences', data=full_df)
    plt.show()
else:
    print("One or both of the columns 'game_duration' and 'move_sequences' do not exist in the DataFrame.")

One or both of the columns 'game_duration' and 'move_sequences' do not exist in the DataFrame.


In [22]:
# Select only numeric columns
numeric_df = full_df.select_dtypes(include=[int, float])

# Calculate correlation matrix
corr_matrix = numeric_df.corr()

# Check if 'outcome' is a column in the correlation matrix
if 'outcome' in corr_matrix.columns:
    print(corr_matrix['outcome'].sort_values(ascending=False))
else:
    print("The column 'outcome' does not exist in the correlation matrix.")

The column 'outcome' does not exist in the correlation matrix.


In [42]:
# Read the JSON file
data = pd.read_json('tournament_1.json')
data1 = pd.read_json('tournament_102.json')
# Write the CSV file
data.to_csv('output.csv', index=False)
data1.to_csv('output.csv', index=False)
data.head()
data1.head()

Unnamed: 0,name,start_date,end_date,games,tours,time_control
tour_1,tournament_102,2014-09-04,2014-09-04,"[{'white': 'Zhang, Jia', 'black': 'Bi, Luhui',...",8,classic
tour_2,tournament_102,2014-09-04,2014-09-04,"[{'white': 'Bi, Luhui', 'black': 'Zhao, Suiron...",8,classic
tour_3,tournament_102,2014-09-04,2014-09-04,"[{'white': 'Zhao, Suirong', 'black': 'Wang, Xu...",8,classic
tour_4,tournament_102,2014-09-04,2014-09-04,"[{'white': 'Ma, Jinwen', 'black': 'Zhao, Suiro...",8,classic
tour_5,tournament_102,2014-09-04,2014-09-04,"[{'white': 'Chen, Sen', 'black': 'Ma, Jinwen',...",8,classic


In [45]:

json_files = ['tournament_1.json','tournament_10.json','tournament_100.json','tournament_101.json','tournament_102.json']
# Read the JSON files into a list of DataFrames
dfs = [pd.read_json(file) for file in json_files]

# Concatenate the DataFrames into a single DataFrame
data = pd.concat(dfs, ignore_index=True)

# Write the DataFrame to a CSV file
data.to_csv('output.csv', index=False)

# Extract the games data
games_data = []
for tour, games in data['games'].items():
    for game in games:
        game['tour'] = tour
        games_data.append(game)

# Convert the games data to a Pandas DataFrame
df = pd.DataFrame(games_data)

# Write the DataFrame to a CSV file
df.to_csv('tournament.csv', index=False)
df.head()

Unnamed: 0,white,black,date,result,id,tour
0,贾叶珍,范辰妮,2014-01-08,0.5,tournament_1_1,0
1,吕亚光,李嘉爵,2014-01-08,0.5,tournament_1_2,0
2,刘奇喜,刘晓鹏,2014-01-08,0.5,tournament_1_3,0
3,陆桂姐,郑新聪,2014-01-08,1.0,tournament_1_4,0
4,李汶玲,叶天英,2014-01-08,0.5,tournament_1_5,0


In [46]:
df.shape

(1469, 6)

In [48]:
df.tail()


Unnamed: 0,white,black,date,result,id,tour
1464,"Li, Minkang","Hong, Xueyou",2014-09-04,0.0,tournament_102_148,52
1465,"Zhang, Zhengnan","Shi, Yufa",2014-09-04,0.5,tournament_102_149,52
1466,"Wu, Dongping","Zhang, Jia",2014-09-04,0.5,tournament_102_150,52
1467,"Qin, Huifei","Chen, Chunqian",2014-09-04,1.0,tournament_102_151,52
1468,"Lv, Yinwen","Chen, Si",2014-09-04,1.0,tournament_102_152,52


In [50]:
df.tail

<bound method NDFrame.tail of                 white           black        date  result                  id  \
0                 贾叶珍             范辰妮  2014-01-08     0.5      tournament_1_1   
1                 吕亚光             李嘉爵  2014-01-08     0.5      tournament_1_2   
2                 刘奇喜             刘晓鹏  2014-01-08     0.5      tournament_1_3   
3                 陆桂姐             郑新聪  2014-01-08     1.0      tournament_1_4   
4                 李汶玲             叶天英  2014-01-08     0.5      tournament_1_5   
...               ...             ...         ...     ...                 ...   
1464      Li, Minkang    Hong, Xueyou  2014-09-04     0.0  tournament_102_148   
1465  Zhang, Zhengnan       Shi, Yufa  2014-09-04     0.5  tournament_102_149   
1466     Wu, Dongping      Zhang, Jia  2014-09-04     0.5  tournament_102_150   
1467      Qin, Huifei  Chen, Chunqian  2014-09-04     1.0  tournament_102_151   
1468       Lv, Yinwen        Chen, Si  2014-09-04     1.0  tournament_102_152  

In [49]:
# ELO SCORES()
# Load the games data
games_data = pd.read_csv('tournament.csv')

# Initialize the Elo ratings for each player
elo_ratings = {}

# Set the initial Elo rating and the K-factor
initial_elo = 1200
k_factor = 32

# Iterate over each game
for index, row in games_data.iterrows():
    # Get the player IDs and the outcome of the game
    white = row['white']
    black = row['black']
    outcome = row['result']

    # Initialize the Elo ratings for the players if they don't exist
    if white not in elo_ratings:
        elo_ratings[white] = initial_elo
    if black not in elo_ratings:
        elo_ratings[black] = initial_elo

    # Calculate the expected scores
    rating1 = elo_ratings[white]
    rating2 = elo_ratings[black]
    expected_score1 = 1 / (1 + 10 ** ((rating2 - rating1) / 400))
    expected_score2 = 1 / (1 + 10 ** ((rating1 - rating2) / 400))

    # Calculate the actual scores
    if outcome == 'white won ':
        score1 = 1
        score2 = 0
    elif outcome == 'black won':
        score1 = 0
        score2 = 1
    else:
        score1 = 0.5
        score2 = 0.5

    # Update the Elo ratings
    elo_ratings[white] = rating1 + k_factor * (score1 - expected_score1)
    elo_ratings[black] = rating2 + k_factor * (score2 - expected_score2)

# Convert the Elo ratings to a Pandas DataFrame
elo_data = pd.DataFrame(list(elo_ratings.items()), columns=['white_player , black_player', 'elo_rating'])

# Write the Elo ratings to a CSV file
elo_data.to_csv('elo_ratings.csv', index=False)
print(elo_data)
elo_data.head()

    white_player , black_player  elo_rating
0                           贾叶珍      1200.0
1                           范辰妮      1200.0
2                           吕亚光      1200.0
3                           李嘉爵      1200.0
4                           刘奇喜      1200.0
..                          ...         ...
237              Zhang, Xiucong      1200.0
238                   Shi, Yufa      1200.0
239                   Xing, Xin      1200.0
240                   Yang, Yue      1200.0
241                  Lv, Yinwen      1200.0

[242 rows x 2 columns]


Unnamed: 0,"white_player , black_player",elo_rating
0,贾叶珍,1200.0
1,范辰妮,1200.0
2,吕亚光,1200.0
3,李嘉爵,1200.0
4,刘奇喜,1200.0


In [51]:
elo_data.head

<bound method NDFrame.head of     white_player , black_player  elo_rating
0                           贾叶珍      1200.0
1                           范辰妮      1200.0
2                           吕亚光      1200.0
3                           李嘉爵      1200.0
4                           刘奇喜      1200.0
..                          ...         ...
237              Zhang, Xiucong      1200.0
238                   Shi, Yufa      1200.0
239                   Xing, Xin      1200.0
240                   Yang, Yue      1200.0
241                  Lv, Yinwen      1200.0

[242 rows x 2 columns]>

In [52]:
elo_data.tail()

Unnamed: 0,"white_player , black_player",elo_rating
237,"Zhang, Xiucong",1200.0
238,"Shi, Yufa",1200.0
239,"Xing, Xin",1200.0
240,"Yang, Yue",1200.0
241,"Lv, Yinwen",1200.0


In [None]:

# Analyze correlation between features and outcome
#from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
full_df['tournament'] = le.fit_transform(full_df['tournament'])
corr_matrix = full_df.corr()
print(corr_matrix['tournament'].sort_values(ascending=False))

# Data Transformation
# Handle missing data and outliers
full_df.fillna(full_df.mean(), inplace=True)  # replace missing values with mean

# Create new features
full_df['ELO_diff'] = full_df['ELO_score_player1'] - full_df['ELO_score_player2']

# Encode categorical variables
full_df['outcome'] = pd.get_dummies(full_df['outcome'])

In [None]:
#import pandas as pd
#from sklearn.model_selection import train_test_split
#from sklearn.tree import DecisionTreeClassifier

# Load your data into a pandas DataFrame
df = pd.read_csv('tournament_1.json')

# Define X (features) and y (target)
X = df.drop('white_player', axis=1)  # features
y = df['black_player']  # target

# Split your data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train a Decision Tree Classifier model
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

In [None]:
# Decision Tree
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)


In [84]:

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load your data into a pandas DataFrame
df = pd.read_json('tournament_102.json')

# Print the column names to identify the correct target variable
print(df.columns)

# Identify the correct target variable (e.g., 'winner', 'game_result', etc.)
target_variable = 'games'  # Replace with the actual column name

# Define X (features) and y (target)
X = df.drop([target_variable], axis=1)  # features
y = df[target_variable]  # target

# Split your data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train a Decision Tree Classifier model
dt_model = DecisionTreeClassifier(random_state=42)

le = LabelEncoder()
X['tours'] = le.fit_transform(X['tours'])
#dt_model.fit(X_train, y_train)

# Make predictions on the test set
#y_pred = dt_model.predict(X_test)

# Evaluate the model
#print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
#print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
#print(confusion_matrix(y_test, y_pred))

Index(['name', 'start_date', 'end_date', 'games', 'tours', 'time_control'], dtype='object')
Classification Report:
Confusion Matrix:


In [None]:

# Load the data
#train_data = pd.read_json('train.json')
#test_data = pd.read_json('test.json')

# Combine the data into a single dataset
#data = pd.concat([train_data, test_data])

# Perform exploratory data analysis and data transformation
# ...

# Encode categorical variables
le = LabelEncoder()
data1['player1'] = le.fit_transform(data1['player1'])
data1['player2'] = le.fit_transform(data1['player2'])

# Normalize or scale data as needed
# ...

# Split the data into training and testing sets
X = data1.drop(['outcome'], axis=1)
y = data1['outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train a Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate the model using cross-validation and accuracy metrics
y_pred = rf_model.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred))
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred))
print("Random Forest Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Tune hyperparameters to enhance accuracy
# ...

# Compare the performance with the Decision Tree model
# ...