In [7]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, log_loss
import joblib


file_path = "/content/previsaoJogador.csv"
df_classification = pd.read_csv(file_path)


class_counts = df_classification['resultado'].value_counts()


max_samples_per_class = min(class_counts.min(), 200)


reduced_df = pd.concat([
    resample(df_classification[df_classification['resultado'] == label],
             replace=False,
             n_samples=max_samples_per_class,
             random_state=42)
    for label in class_counts.index
])


reduced_df = reduced_df.sample(frac=1, random_state=42).reset_index(drop=True)


reduced_df["nivelJogador"] = reduced_df["nivelJogador"] + np.random.normal(0, 0.5, len(reduced_df))
reduced_df["nivelJogador"] = np.clip(reduced_df["nivelJogador"], 1, 6)


rank_weights = {"D": 5, "C": 4, "B": 3, "A": 2, "S": 1}
reduced_df["rankMissao_weight"] = reduced_df["rankMissao"].map(rank_weights)
reduced_df["success_chance"] = (
    reduced_df["nivelJogador"] / reduced_df["rankMissao_weight"]
)

reduced_df["tempoEstimado"] = (
    10 - reduced_df["success_chance"] + np.random.normal(0, 1, len(reduced_df))
)
reduced_df["tempoEstimado"] = np.clip(reduced_df["tempoEstimado"], 1, None)


enhanced_file_path = "/content/previsaoJogador.csv"
reduced_df.to_csv(enhanced_file_path, index=False)


label_encoders = {}
for column in ["rankMissao", "perigo", "resultado", "tipoMissao"]:
    le = LabelEncoder()
    reduced_df[column] = le.fit_transform(reduced_df[column])
    label_encoders[column] = le


X = reduced_df[["nivelJogador", "rankMissao", "success_chance", "tempoEstimado"]]
y = reduced_df["resultado"]  # Target
scaler = StandardScaler()
X = scaler.fit_transform(X)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


param_grid = {
    'n_estimators': [10, 25, 50],
    'max_depth': [3, 5, 7],
    'min_samples_split': [10, 20, 30],
    'min_samples_leaf': [5, 10, 15]
}


grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)
grid_search.fit(X_train, y_train)


best_model = grid_search.best_estimator_
y_pred_test = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_test)
y_pred_proba = best_model.predict_proba(X_test)
test_log_loss = log_loss(y_test, y_pred_proba)

feature_importance = pd.DataFrame({
    'Feature': ["nivelJogador", "rankMissao", "success_chance", "tempoEstimado"],
    'Importance': best_model.feature_importances_
}).sort_values(by='Importance', ascending=False)


print("Class distribution in reduced dataset:")
print(reduced_df['resultado'].value_counts())
print("\nBest Parameters:", grid_search.best_params_)
print("Test Set Accuracy:", test_accuracy)
print("Test Set Log Loss:", test_log_loss)
print("Feature Importance:")
print(feature_importance)


print(f"Enhanced dataset saved to: {enhanced_file_path}")

model_file_path = "/content/best_model.pkl"
joblib.dump(best_model, model_file_path)
print(f"Modelo salvo em: {model_file_path}")


Class distribution in reduced dataset:
resultado
4    125
5    125
2    125
3    125
0    125
1    125
Name: count, dtype: int64

Best Parameters: {'max_depth': 7, 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 25}
Test Set Accuracy: 0.9
Test Set Log Loss: 0.2696586076863498
Feature Importance:
          Feature  Importance
1      rankMissao    0.407637
2  success_chance    0.249505
0    nivelJogador    0.232935
3   tempoEstimado    0.109923
Enhanced dataset saved to: /content/previsaoJogador.csv
Modelo salvo em: /content/best_model.pkl
