In [1]:
import pandas as pd
import tensorflow as tf




In [2]:
import findspark
findspark.init()

In [25]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles
import sklearn as skl

In [4]:
spark = SparkSession.builder.appName('SparkSQL').getOrCreate()

In [5]:
spark.sparkContext.addFile('games_metadata_profile_2024_01.csv')
df = spark.read.csv(SparkFiles.get('games_metadata_profile_2024_01.csv'), header=True, inferSchema= True)

In [6]:
df.createOrReplaceTempView('chess')

In [249]:
data = spark.sql(
    '''
SELECT WhiteElo, White_playTime_total, White_count_all, BlackElo, Black_playTime_total, Black_count_all, TotalMoves,
Result from chess
WHERE Result != '*'
'''
).toPandas()

In [250]:
data.dropna(inplace=True)

In [251]:
y = data['Result']
X = pd.get_dummies(data.drop('Result', axis=1))

In [257]:
y.replace('1-0', 1,inplace=True)
y.replace('0-1',0,inplace=True)
y.replace('1/2-1/2',0.5,inplace=True)

In [259]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [260]:
from sklearn.preprocessing import StandardScaler
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [267]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=512, random_state=78)

In [268]:
rf_model = rf_model.fit(X_train_scaled, y_train)

ValueError: Unknown label type: 'continuous'

In [247]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [248]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf_model.feature_importances_
# We can sort the features by their importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.18612424190490773, 'TotalMoves'),
 (0.1414724659967678, 'Black_playTime_total'),
 (0.14000356113742343, 'White_playTime_total'),
 (0.1350775059175494, 'White_count_all'),
 (0.13439302626965818, 'Black_count_all'),
 (0.125711006036769, 'BlackElo'),
 (0.12502714164295187, 'WhiteElo'),
 (0.006110311084131381, 'Termination_Normal'),
 (0.006074456814402773, 'Termination_Time forfeit'),
 (6.2831954384373324e-06, 'Termination_Rules infraction')]

In [261]:
# Define the model - deep neural net
number_input_features = len(X_train.columns)

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=256, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=64, activation="relu"))

nn.add(tf.keras.layers.Dense(units=64, activation="relu"))

nn.add(tf.keras.layers.Dense(units=64, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_17 (Dense)            (None, 256)               2048      
                                                                 
 dense_18 (Dense)            (None, 64)                16448     
                                                                 
 dense_19 (Dense)            (None, 64)                4160      
                                                                 
 dense_20 (Dense)            (None, 64)                4160      
                                                                 
 dense_21 (Dense)            (None, 1)                 65        
                                                                 
Total params: 26881 (105.00 KB)
Trainable params: 26881 (105.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [262]:
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [263]:
fit_model = nn.fit(X_train_scaled,y_train,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [264]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','tanh','sigmoid','softmax','swish'])
    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=512,
        step=5), activation=activation, input_dim=len(X_train.columns)))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 8)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=512,
            step=5),
            activation=activation))
    
    nn_model.add(tf.keras.layers.Dense(units=1, activation='softmax'))

    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])
    
    return nn_model

In [265]:
# Import the kerastuner library
import keras_tuner as kt

tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2,
    overwrite=True)

In [266]:
tuner.search(X_train_scaled,y_train,epochs=20,validation_data=(X_test_scaled,y_test))

Trial 1 Complete [00h 00m 16s]
val_accuracy: 0.5041532516479492

Best val_accuracy So Far: 0.5041532516479492
Total elapsed time: 00h 00m 16s

Search: Running Trial #2

Value             |Best Value So Far |Hyperparameter
relu              |softmax           |activation
361               |471               |first_units
4                 |6                 |num_layers
411               |46                |units_0
221               |1                 |units_1
251               |1                 |units_2
216               |1                 |units_3
266               |1                 |units_4
271               |1                 |units_5
3                 |3                 |tuner/epochs
0                 |0                 |tuner/initial_epoch
2                 |2                 |tuner/bracket
0                 |0                 |tuner/round

Epoch 1/3
Epoch 2/3
 195/2946 [>.............................] - ETA: 4s - loss: 0.6749 - accuracy: 0.4957

KeyboardInterrupt: 