# In [None]:
from itertools import product

import matplotlib.pyplot as plt
import numpy as np
from numpy import loadtxt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, precision_recall_curve, confusion_matrix, average_precision_score, f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

# Read the space-delimited sample file as a 2-D table of 64-bit integers.
amazon = loadtxt(
    '/kaggle/input/amazon-from-github/train_test_amazon1.sample',
    delimiter=' ',
    dtype=np.int64,
)

# Column layout used below: the first two columns go to ids1, the last
# column is the target (kept 2-D, one column wide), and everything in
# between is the feature matrix.
ids1, meta1, output1 = amazon[:, :2], amazon[:, 2:-1], amazon[:, -1:]

# Seeded split with scikit-learn's default train/test proportions.
x_train, x_test, y_train, y_test = train_test_split(meta1, output1, random_state=1)

# Only the training labels are flattened to 1-D; y_test keeps its
# single-column 2-D shape.
y_train = y_train.ravel()

# Exhaustive hyper-parameter search for the random forest: every
# combination of the values below is scored with 5-fold stratified
# cross-validation on the training split, and the combination with the
# highest mean score is kept in best_params / max_score.
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# The splitter is the same for every candidate, so build it once instead
# of once per combination. It is also reused under this name by the
# fold-wise evaluation further down the script.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Every candidate forest gets the same seed so that differences in score
# come from the hyper-parameters rather than from random tree
# construction, and so that re-running the search selects the same winner.
search_seed = 42

max_score = 0
best_params = {}

param_names = list(param_grid)
# product() varies the last parameter fastest, i.e. it visits the
# combinations in the same order as nested loops over the dict's keys.
for combo in product(*param_grid.values()):
    params = dict(zip(param_names, combo))
    # The seed is passed separately so best_params holds only tuned values.
    forest = RandomForestClassifier(random_state=search_seed, **params)
    # No `scoring` is given, so the classifier's own score method is used;
    # n_jobs=-1 evaluates the folds in parallel.
    scores = cross_val_score(forest, x_train, y_train, cv=skf, n_jobs=-1)
    score_mean = scores.mean()
    # Strict '>' keeps the earliest combination when scores tie.
    if score_mean > max_score:
        max_score = score_mean
        best_params = params

print("Best Parameters for first dataset:")
print(best_params)
print("Average Accuracy:", max_score)
# Fold-wise evaluation of the selected hyper-parameters.
#
# A fresh forest is fitted on each training fold of `skf` and scored on the
# matching validation fold. F1 and average precision are averaged over the
# folds; the ROC curve, ROC AUC, precision-recall curve and confusion
# matrix are computed once from the pooled out-of-fold predictions.
#
# NOTE(review): x_test / y_test from the earlier train_test_split are not
# used in the visible code, and these folds are the same ones the
# hyper-parameters were selected on, so the numbers below are
# cross-validation estimates rather than held-out test scores.
f1_score_final = 0.0
prc_auc_final = 0.0
y_test_all = []
y_pred_proba_all = []

for train_index, test_index in skf.split(x_train, y_train):
    x_train_fold, x_val_fold = x_train[train_index], x_train[test_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[test_index]

    final_forest = RandomForestClassifier(**best_params)
    final_forest.fit(x_train_fold, y_train_fold)
    # Column 1 of predict_proba is used as the positive-class score
    # (assumes a binary target with labels 0/1 -- TODO confirm).
    y_pred_final = final_forest.predict_proba(x_val_fold)[:, 1]

    # Collect out-of-fold labels and scores for the pooled metrics.
    y_test_all.extend(y_val_fold)
    y_pred_proba_all.extend(y_pred_final)

    # Hard predictions use a fixed 0.5 probability threshold.
    f1_final = f1_score(y_val_fold, (y_pred_final > 0.5).astype(int))
    f1_score_final += f1_final

    prc_auc = average_precision_score(y_val_fold, y_pred_final)
    prc_auc_final += prc_auc

# Per-fold sums -> per-fold means.
f1_score_final /= skf.n_splits
prc_auc_final /= skf.n_splits

fpr_final, tpr_final, _ = roc_curve(y_test_all, y_pred_proba_all)
roc_auc_final = roc_auc_score(y_test_all, y_pred_proba_all)

# The precision-recall curve is built from the pooled predictions, like
# the ROC curve above. Computing it inside the loop would leave only the
# last fold's curve behind for the report and the plot.
precision, recall, _ = precision_recall_curve(y_test_all, y_pred_proba_all)

conf_matrix_final = confusion_matrix(y_test_all, (np.array(y_pred_proba_all) > 0.5).astype(int))

# Rows of the confusion matrix are true classes, columns are predictions:
# TPR = TP / (TP + FN), FPR = FP / (FP + TN) at the 0.5 threshold.
tpr_final_value = conf_matrix_final[1, 1] / (conf_matrix_final[1, 1] + conf_matrix_final[1, 0])
fpr_final_value = conf_matrix_final[0, 1] / (conf_matrix_final[0, 1] + conf_matrix_final[0, 0])

print("F1 Score:", f1_score_final)
print("Precision: ", precision)
print("Recall:", recall)
print("Precision-Recall AUC:", prc_auc_final)
print("ROC AUC:", roc_auc_final)
print("TPR for roc:", tpr_final)
print("FPR for roc:", fpr_final)
print("TPR:", tpr_final_value)
print("FPR:", fpr_final_value)

# ROC curve with the chance diagonal; saved to the working directory and
# then displayed.
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(
    fpr_final,
    tpr_final,
    color='darkorange',
    lw=2,
    label=f'ROC curve (AUC = {roc_auc_final:0.2f})',
)
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve')
roc_ax.legend(loc="lower right")
roc_fig.savefig('/kaggle/working/roc_curve.png')
plt.show()

# Precision-recall curve, labelled with the precision-recall AUC value.
pr_fig, pr_ax = plt.subplots()
pr_ax.plot(
    recall,
    precision,
    color='blue',
    lw=2,
    label=f'Precision-Recall curve (AUC = {prc_auc_final:0.2f})',
)
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curve')
pr_ax.legend(loc="lower left")
pr_fig.savefig('/kaggle/working/precision_recall_curve.png')
plt.show()

# Write the evaluation metrics to a plain-text report, one
# "label: value" line per metric. Array-valued entries are written using
# their default string form.
output_file_path = '/kaggle/working/scores.txt'

report_rows = [
    ("F1 Score", f1_score_final),
    ("Precision", precision),
    ("Recall", recall),
    ("PRC AUC", prc_auc_final),
    ("ROC AUC", roc_auc_final),
    ("TPR for roc", tpr_final),
    ("FPR for roc", fpr_final),
    ("TPR", tpr_final_value),
    ("FPR", fpr_final_value),
]

with open(output_file_path, 'w') as f:
    f.writelines(f"{label}: {value}\n" for label, value in report_rows)
