<a href="https://colab.research.google.com/github/Jesteban247/ML-College/blob/main/Projects/Project_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Clash Royale Battle Outcome Prediction: Setup and Data Preparation

In [None]:
# Cell 1: Install required packages and libraries
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py
!pip install -q dagshub mlflow

In [None]:
# Cell 2: Import necessary libraries
import cudf
import zipfile
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout
from sklearn.model_selection import train_test_split
import time
from tensorflow.keras.utils import plot_model
import mlflow
import mlflow.tensorflow
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import dagshub

In [None]:
# Cell 3: Check GPU Availability
def check_gpu():
    """Print every GPU device TensorFlow reports, or a notice when there is none."""
    detected = tf.config.list_physical_devices('GPU')
    if not detected:
        print("No GPUs detected.")
        return

    print("GPUs available:")
    for device in detected:
        print(f"- {device}")

check_gpu()

GPUs available:
- PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


In [None]:
# Cell 4: GPU details
# Run nvidia-smi in the shell to show the driver/CUDA versions and the current
# memory and utilisation figures for the attached GPU.
!nvidia-smi

Thu Sep 12 05:59:24 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P8               9W /  70W |      3MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# Cell 5: Download and Extract Dataset
!kaggle datasets download -d nonrice/clash-royale-battles-upper-ladder-december-2021
zip_file_path = '/content/clash-royale-battles-upper-ladder-december-2021.zip'
extraction_folder = '/content/clash_royale_dataset/'
os.makedirs(extraction_folder, exist_ok=True)
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extraction_folder)
print("Dataset successfully extracted!")

Dataset URL: https://www.kaggle.com/datasets/nonrice/clash-royale-battles-upper-ladder-december-2021
License(s): CC-BY-SA-4.0
Downloading clash-royale-battles-upper-ladder-december-2021.zip to /content
 81% 9.00M/11.1M [00:01<00:00, 11.2MB/s]
100% 11.1M/11.1M [00:01<00:00, 8.23MB/s]
Dataset successfully extracted!


In [None]:
# Cell 6: Load Dataset using RAPIDS cuDF
# The first column of each CSV is dropped right after loading
# (presumably a saved row index -- TODO confirm against the raw files).
data_ord = cudf.read_csv('/content/clash_royale_dataset/data_ord.csv')
data_ord.drop(data_ord.columns[0], axis=1, inplace=True)
cardlist = cudf.read_csv('/content/clash_royale_dataset/cardlist.csv')
cardlist.drop(cardlist.columns[0], axis=1, inplace=True)

# Create Binary Features
# One indicator column per (card, player): "<card>_p1" and "<card>_p2".
cardlist['p_1'] = cardlist['card'] + '_p1'
cardlist['p_2'] = cardlist['card'] + '_p2'
# Copy the column names to host lists once; indexing the GPU Series element by
# element inside the comprehensions below would cost one transfer per card.
p1_names = cardlist['p_1'].to_arrow().to_pylist()
p2_names = cardlist['p_2'].to_arrow().to_pylist()
columns = p1_names + p2_names
n_cards = len(cardlist)

# Columns 0-7 of data_ord feed the "_p1" features, columns 8-15 the "_p2" features.
p1_indices = list(range(8))
p2_indices = list(range(8, 16))
positions_p1 = data_ord.iloc[:, p1_indices].astype(int)
positions_p2 = data_ord.iloc[:, p2_indices].astype(int)

# For every card index i, flag the rows where any of the player's 8 slots equals i.
# The frames are built directly from these indicator Series; no pre-allocated
# zero-filled frame is needed.
binary_df_p1 = cudf.DataFrame({p1_names[i]: (positions_p1 == i).any(axis=1).astype(int) for i in range(n_cards)})
binary_df_p2 = cudf.DataFrame({p2_names[i]: (positions_p2 == i).any(axis=1).astype(int) for i in range(n_cards)})
df = cudf.concat([binary_df_p1, binary_df_p2], axis=1)

# Prepare Features and Target
X = df
y = data_ord['outcome']
# Fixed random_state keeps the 80/20 split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Show Data Types
print(f"Type of X: {type(X)}")
print(f"Type of y: {type(y)}")

Type of X: <class 'cudf.core.dataframe.DataFrame'>
Type of y: <class 'cudf.core.series.Series'>


# Neural Network Training with TensorFlow, MLflow and TensorBoard

In [None]:
# Cell 7: Convert to TensorFlow Dataset
def to_tf_dataset(X, y, batch_size=32, shuffle=True):
    """Build a batched, prefetched ``tf.data.Dataset`` from cuDF features and labels.

    Args:
        X: cuDF DataFrame of features; copied to host memory via ``to_pandas()``.
        y: cuDF Series of labels, row-aligned with ``X``.
        batch_size: Number of samples per batch.
        shuffle: When True (the default, matching the previous behaviour) the
            samples are shuffled with a buffer that covers the whole dataset.
            Pass False for evaluation data, where shuffling only costs memory
            and breaks the alignment between batch order and the original rows.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features, labels)`` batches.
    """
    dataset = tf.data.Dataset.from_tensor_slices((X.to_pandas().values, y.to_pandas().values))
    # A shuffle buffer of size 0 is invalid, so skip shuffling for empty input.
    if shuffle and len(X) > 0:
        dataset = dataset.shuffle(buffer_size=len(X))
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
    return dataset

train_dataset = to_tf_dataset(X_train, y_train)
# Evaluation data keeps its original order, so predictions line up with y_test.
test_dataset = to_tf_dataset(X_test, y_test, shuffle=False)

# Show TensorFlow Dataset Types
print(f"Type of train_dataset: {type(train_dataset)}")
print(f"Type of test_dataset: {type(test_dataset)}")


In [None]:
# Cell 8: Build the TensorFlow Model
def build_tf_model(input_dim, num_hidden_layers, num_neurons_per_layer, learning_rate):
    """Assemble and compile a fully connected binary classifier.

    Architecture: an input of width ``input_dim``, followed by repeated
    (Dense ReLU + Dropout 0.2) blocks of ``num_neurons_per_layer`` units, and a
    single sigmoid output unit. At least one hidden block is always created,
    even when ``num_hidden_layers`` is below 1.

    The model is compiled with Adam at ``learning_rate``, binary cross-entropy
    loss and the accuracy metric, and then returned.
    """
    net = Sequential()
    net.add(Input(shape=(input_dim,)))

    # One block is unconditional; further blocks appear only when more are requested.
    hidden_blocks = max(1, num_hidden_layers)
    for _ in range(hidden_blocks):
        net.add(Dense(num_neurons_per_layer, activation='relu'))
        net.add(Dropout(0.2))

    net.add(Dense(1, activation='sigmoid'))

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    net.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return net

In [None]:
# Cell 9: Initialize DagsHub with MLflow tracking
# Connects this notebook to the 'Clash-Royale-Experiment' repository owned by
# 'Jesteban247'; mlflow=True requests the MLflow integration.
dagshub.init(repo_owner='Jesteban247', repo_name='Clash-Royale-Experiment', mlflow=True)

# Set up MLflow experiment in the DagsHub repo
# All runs started afterwards are grouped under this experiment name.
mlflow.set_experiment("Clash Royale Experiment")

# Enable MLflow Autologging
# Turned on before any training call so that later model fits are captured
# without explicit logging code.
mlflow.autolog()

In [None]:
# Cell 10: Function to train and evaluate the model
def train_and_evaluate_model(input_dim, num_hidden_layers, num_neurons_per_layer, epochs, learning_rate):
    """Train one network configuration inside its own MLflow run and report the results.

    Builds the model with ``build_tf_model``, fits it on the notebook-level
    ``train_dataset`` (validating on ``test_dataset``), evaluates it on
    ``test_dataset``, and logs hyperparameters plus training/evaluation time to
    MLflow. TensorBoard logs go to a per-run directory under ``results/runs``.

    Args:
        input_dim: Number of input features.
        num_hidden_layers: Number of hidden Dense/Dropout blocks requested.
        num_neurons_per_layer: Units in each hidden Dense layer.
        epochs: Maximum number of epochs; early stopping may end training sooner.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        None. Results are printed and logged. Any exception is caught and
        printed rather than propagated, so the notebook keeps running.
    """
    # End any active MLflow run before starting a new one
    if mlflow.active_run() is not None:
        mlflow.end_run()

    try:
        # Start a new MLflow run
        with mlflow.start_run() as run:
            # Capture run-scoped values while the run is still active. Calling
            # mlflow.get_artifact_uri() after this block exits would implicitly
            # start a new, empty run and report that run's URI instead.
            artifact_uri = mlflow.get_artifact_uri()

            # Build the TensorFlow model
            tf_model = build_tf_model(input_dim, num_hidden_layers, num_neurons_per_layer, learning_rate)

            # Setup TensorBoard logging. Each run gets its own sub-directory so
            # successive runs do not overwrite or mix their event files;
            # `--logdir results/runs` still picks all of them up.
            log_dir = f"results/runs/{run.info.run_id}"
            tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

            # Log only specific parameters
            mlflow.log_param("num_hidden_layers", num_hidden_layers)
            mlflow.log_param("num_neurons_per_layer", num_neurons_per_layer)
            mlflow.log_param("epochs", epochs)
            mlflow.log_param("learning_rate", learning_rate)

            print(f"Starting training with parameters: "
                  f"num_hidden_layers={num_hidden_layers}, "
                  f"num_neurons_per_layer={num_neurons_per_layer}, "
                  f"epochs={epochs}, "
                  f"learning_rate={learning_rate}")

            # Train the model
            # NOTE: test_dataset doubles as the validation set, so early stopping
            # is driven by the same data that is used for the final evaluation below.
            start_time = time.time()
            early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
            history = tf_model.fit(
                train_dataset,
                epochs=epochs,
                validation_data=test_dataset,
                callbacks=[early_stopping, tensorboard_callback]
            )
            tf_training_time = time.time() - start_time

            # Print training details (values from the last completed epoch)
            print(f"Training completed in {tf_training_time:.2f} seconds.")
            print(f"Final training accuracy: {history.history['accuracy'][-1]:.4f}")
            print(f"Final training loss: {history.history['loss'][-1]:.4f}")
            print(f"Final validation accuracy: {history.history['val_accuracy'][-1]:.4f}")
            print(f"Final validation loss: {history.history['val_loss'][-1]:.4f}")

            # Evaluate the model
            start_time = time.time()
            loss, accuracy = tf_model.evaluate(test_dataset)
            tf_evaluation_time = time.time() - start_time

            # Print evaluation details
            print(f"Evaluation completed in {tf_evaluation_time:.2f} seconds.")
            print(f"Test accuracy: {accuracy:.4f}")
            print(f"Test loss: {loss:.4f}")

            # Log metrics in MLflow (autologging will handle other details)
            mlflow.log_metric("training_time", tf_training_time)
            mlflow.log_metric("evaluation_time", tf_evaluation_time)

        print(f"Training done! TensorBoard logs saved at: {log_dir}")
        print(f"View run details at: {artifact_uri}")

    except Exception as e:
        # Deliberately best-effort: report the failure and keep the notebook alive.
        print(f"An error occurred: {e}")
        # Make sure no run is left open for the next call.
        if mlflow.active_run() is not None:
            mlflow.end_run()


In [None]:
# Cell 11: Example Run
# Configuration: 4 hidden layers of 128 units, at most 2 epochs, learning rate 5e-4.
# The input width is taken from the number of feature columns in X_train.
train_and_evaluate_model(input_dim=X_train.shape[1],
                         num_hidden_layers=4,
                         num_neurons_per_layer=128,
                         epochs=2,
                         learning_rate=0.0005)

In [None]:
# Test 2: smaller network (3 hidden layers of 64 units), at most 5 epochs,
# learning rate 1e-3. Logged as a separate MLflow run.
train_and_evaluate_model(input_dim=X_train.shape[1],
                         num_hidden_layers=3,
                         num_neurons_per_layer=64,
                         epochs=5,
                         learning_rate=0.001)

In [None]:
# Cell 12: Launch TensorBoard for Visualization (after MLflow logging)
# Loads the TensorBoard notebook extension and points it at the directory the
# training function writes its logs to.
%load_ext tensorboard
%tensorboard --logdir results/runs

# Baseline Models: cuML (GPU) and scikit-learn (CPU)

In [None]:
# Baseline 1: logistic regression on the GPU with cuML.
import cuml
from cuml.linear_model import LogisticRegression
import time

# Initialize and fit the Logistic Regression model
log_reg = LogisticRegression()

# NOTE: X_train / X_test are rebound to float32 copies here, so every later cell
# that reads them sees the float32 version.
# (presumably required by cuML's estimators -- TODO confirm)
# Convert X_train to float32
X_train = X_train.astype('float32')
# Convert X_test to float32
X_test = X_test.astype('float32')

# Train the model and measure time
# Only the fit call is inside the timed region.
start_time = time.time()
log_reg.fit(X_train, y_train)
training_time = time.time() - start_time

# Evaluate the model
# score() is evaluated on the held-out test split.
accuracy = log_reg.score(X_test, y_test)

# Print results
print(f"Training time: {training_time:.2f} seconds")
print(f"Test accuracy: {accuracy:.4f}")

Training time: 0.49 seconds
Test accuracy: 0.5588


In [None]:
# Baseline 2: Gaussian Naive Bayes on the GPU with cuML.
# Uses X_train / X_test as left by the previous cell (cast to float32 there).
import cuml
from cuml.naive_bayes import GaussianNB
import time

# Initialize the Naive Bayes model
nb = GaussianNB()

# Train the model and measure time
# Only the fit call is inside the timed region.
start_time = time.time()
nb.fit(X_train, y_train)
training_time = time.time() - start_time

# Evaluate the model
# score() is evaluated on the held-out test split.
accuracy = nb.score(X_test, y_test)

# Print results
# `training_time` and `accuracy` overwrite the values from the previous baseline cell.
print(f"Training time: {training_time:.2f} seconds")
print(f"Test accuracy: {accuracy:.4f}")

Training time: 0.19 seconds
Test accuracy: 0.5360


In [None]:
# Baseline 3: logistic regression on the CPU with scikit-learn, for comparison
# with the cuML version above.
# Step 1: Import required libraries
# NOTE: this import rebinds `LogisticRegression` from the cuML class imported
# earlier to the scikit-learn class for the rest of the notebook.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import time
import pandas as pd

# Step 2: Copy the cuDF train/test splits to host memory as pandas objects
X_train_pd = X_train.to_pandas()
X_test_pd = X_test.to_pandas()
y_train_pd = y_train.to_pandas()
y_test_pd = y_test.to_pandas()

# Step 3: Train the logistic regression model and measure time
# The timed region covers both model construction and fitting.
start_time = time.time()

# Initialize and fit the Logistic Regression model
# `log_reg` replaces the cuML model bound to the same name earlier.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_pd, y_train_pd)

training_time = time.time() - start_time

# Predict and evaluate the model
y_pred = log_reg.predict(X_test_pd)
accuracy = accuracy_score(y_test_pd, y_pred)

# Print results
print(f"Training time: {training_time:.2f} seconds")
print(f"Test accuracy: {accuracy:.4f}")

Training time: 24.04 seconds
Test accuracy: 0.5587


**Analysis of Clash Royale Victory Prediction Based on Cards**

I've conducted an analysis to predict the win or loss in Clash Royale based on the cards used by players. Here is a summary of the process and findings:

### Data Processing

1. **Data Preparation**: The raw decks are converted into a binary format. Each battle is represented by a feature vector of 212 dimensions: 106 binary indicators (card present or not) for each of the two players, covering the 106 possible cards in the game.

2. **Transformations and Cleaning**:
   - **Transformations**: The data is in binary format, and no additional cleaning is necessary.
   - **Normalization and Dimensionality Reduction**: Because the dataset is not excessively large, dimensionality reduction techniques like PCA were not applied. Training logistic regression and Naive Bayes with CUDA-accelerated libraries was efficient and took minimal time.

### Model Performance

- The highest accuracy achieved so far is around 60%. While this is a reasonable performance considering the complexity of the task, it indicates that predicting the outcome based solely on card features has its limitations.

### Conclusion

Considering the current results and constraints, I believe there is limited scope for significant improvement. Achieving an accuracy of 95% or higher is highly unlikely with the current feature set and methods. The models are performing reasonably well, but further enhancements would require additional data or features, such as integrating external information from the Clash Royale API.