<a href="https://colab.research.google.com/github/LeibGit/-DI_Bootcamp/blob/main/dc_week5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import scipy.stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import f1_score, precision_score, accuracy_score, recall_score, confusion_matrix, ConfusionMatrixDisplay, mean_absolute_error

In [None]:
# Read both raw tables in one pass: battle outcomes first, then the
# per-Pokémon attribute table. Paths are relative to the working directory.
combats_df, pokemon_df = (
    pd.read_csv(csv_path) for csv_path in ("combats.csv", "pokemon.csv")
)

In [None]:
# Preview the first rows of the battle log; later cells rely on its
# "First_pokemon", "Second_pokemon" and "Winner" columns.
combats_df.head()

In [None]:
# Preview the first rows of the Pokémon attribute table before any cleaning.
pokemon_df.head()

In [None]:
# Replace missing values column by column.
# NOTE(review): every missing "Name" becomes "Primeape" — this presumably
# relies on there being exactly one unnamed row in pokemon.csv; confirm.
# A missing "Type 2" is stored as an empty string.
for column, filler in {"Name": "Primeape", "Type 2": ""}.items():
    pokemon_df[column] = pokemon_df[column].fillna(filler)

pokemon_df.head()

In [None]:
# List the available columns before any derived features are added.
pokemon_df.columns

In [None]:
# Number of battles won, keyed by the value stored in "Winner".
wins = combats_df["Winner"].value_counts()

# Number of battles fought, counting appearances on either side.
first_counts = combats_df["First_pokemon"].value_counts()
second_counts = combats_df["Second_pokemon"].value_counts()
total_battles = first_counts.add(second_counts, fill_value=0)

# Entries that fought but never won are absent from `wins`, so the division
# yields NaN for them; those become 0%.
win_percentage = wins.div(total_battles).mul(100).fillna(0)

# Assigning through Series.map overwrites the column, so the cell can be
# re-run safely. Rows whose "#" never appears in combats_df have no entry in
# `win_percentage` and are also set to 0.
# NOTE(review): that makes "never fought" indistinguishable from "lost every
# battle" — confirm this is intended before modelling on it.
pokemon_df["Win_Percentage"] = pokemon_df["#"].map(win_percentage).fillna(0)

print(pokemon_df[["Name", "Win_Percentage"]])

In [None]:
# Pairwise correlations between all numeric columns, drawn as an annotated
# heatmap on an explicitly created 10x8 figure.
numeric_df = pokemon_df.select_dtypes(include="number")
corr_matrix = numeric_df.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
plt.show()

In [None]:
# Re-check the column list now that "Win_Percentage" has been added.
pokemon_df.columns

In [None]:
# Attributes to plot against the win rate, one scatter panel each.
stats = ["HP", "Attack", "Defense", "Sp. Atk", "Sp. Def", "Speed", "Generation"]

# Restrict the frame to those attributes plus the target column.
df_plot = pokemon_df[[*stats, "Win_Percentage"]]

# A single row of panels: each attribute on x, Win_Percentage on y.
sns.pairplot(
    df_plot,
    x_vars=stats,
    y_vars="Win_Percentage",
    height=3,
    aspect=1,
)
plt.show()

In [None]:
# Rank every Pokémon by win rate, best first. Note that `top_10` holds the
# whole sorted frame; only the printed slice is limited to ten rows.
top_10 = pokemon_df.sort_values("Win_Percentage", ascending=False)
leaders = top_10.head(10)
print(f"Top 10 pokemon: {leaders}")

In [None]:
# Summarise dtypes and non-null counts; the dtypes decide which columns the
# next cell's int64/float64 filter will keep as model features.
pokemon_df.info()

In [None]:
# Features: every int64/float64 column except the target itself and the
# "#" identifier. Columns of any other dtype are left out by the filter.
numeric_columns = pokemon_df.select_dtypes(include=["int64", "float64"])
X = numeric_columns.drop(columns=["Win_Percentage", "#"])

# Target: the win rate computed from the battle log.
y = pokemon_df["Win_Percentage"]

# Hold out 20% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Three candidate regressors, each wrapped in a Pipeline so they expose the
# same fit/predict interface to the cells below.

# Linear regression on standardised features.
lin_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LinearRegression())
])

# The tree-based models are fitted on the unscaled features.
# random_state is pinned (same seed as the train/test split) so that the
# fitted models, and therefore the MAE comparison, are reproducible from run
# to run instead of changing on every kernel restart.
rf_pipeline = Pipeline([
    ("model", RandomForestRegressor(random_state=42))
])

xgb_pipeline = Pipeline([
    ("model", XGBRegressor(random_state=42))
])

lin_pipeline.fit(X_train, y_train)
rf_pipeline.fit(X_train, y_train)
xgb_pipeline.fit(X_train, y_train)

In [None]:
# Score the held-out rows with each fitted pipeline, in the order
# linear regression, random forest, XGBoost.
lin_pred, rf_pred, xgb_pred = (
    fitted.predict(X_test)
    for fitted in (lin_pipeline, rf_pipeline, xgb_pipeline)
)

In [None]:
# Mean Absolute Error helper
def mae_calc(y_test, y_pred):
  """Print and return the mean absolute error between two sequences.

  Args:
    y_test: Ground-truth target values.
    y_pred: Predicted values, passed to ``mean_absolute_error`` alongside
      ``y_test``.

  Returns:
    The value computed by ``mean_absolute_error(y_test, y_pred)``; it is also
    written to stdout before being returned.
  """
  score = mean_absolute_error(y_test, y_pred)
  print(score)
  return score

# Evaluate each model on the held-out set; mae_calc prints every score as it
# is computed, in the order linear regression, random forest, XGBoost.
lin_mae_calc = mae_calc(y_test, lin_pred)
rf_mae_calc = mae_calc(y_test, rf_pred)
xgb_mae_calc = mae_calc(y_test, xgb_pred)

# Display name -> MAE. The names are printed verbatim in the summary below,
# so they must not carry stray punctuation (the random-forest key previously
# ended in a colon, producing "**Random Forest:**" in the output).
mae_values = {
    "Linear Regression": lin_mae_calc,
    "Random Forest": rf_mae_calc,
    "XGBoost": xgb_mae_calc
}

# Lower MAE is better, so the winner is the key with the smallest value.
best_model = min(mae_values, key=mae_values.get)
best_value = mae_values[best_model]

print(f"\nBest model: **{best_model}** with MAE = {best_value:.3f}")