# In [None]:
#kfold
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, confusion_matrix, roc_curve, auc, precision_recall_curve, accuracy_score, precision_score, recall_score
import matplotlib.pyplot as plt
import json
from joblib import dump

# Load the dataset. The last column is used as the class label (see `y` below).
df = pd.read_csv('/kaggle/input/concatenated-pca/concatenated_data_Amazon_PCA.csv')

# Features are every column EXCEPT the last one. Slicing with [:, :] kept the
# label column inside X, which leaks the target into the model and inflates
# every metric computed afterwards.
X = df.iloc[:, :-1]  # Features
y = df.iloc[:, -1]   # Target variable

# Check for consistency in the number of samples
print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")


# Stratified splitter: 5 shuffled folds with a fixed seed.
n_splits = 5
skf = StratifiedKFold(shuffle=True, random_state=42, n_splits=n_splits)

# Per-fold scalar results; each list receives one entry per fold.
accuracies, f1_scores = [], []
precisions, recalls = [], []
tprs, fprs = [], []
roc_aucs, prc_aucs = [], []
cms = []

# Per-fold curve points (FPR/TPR for ROC, recall/precision for PRC).
fpr_lists, tpr_lists = [], []
recall_lists, precision_lists = [], []

# Train and evaluate one logistic-regression model per stratified fold,
# appending that fold's metrics and curve points to the accumulators above.
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Scale features. The scaler is fitted on the training fold only, so no
    # statistics from the held-out fold influence training.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Logistic Regression Model
    model = LogisticRegression()
    model.fit(X_train_scaled, y_train)

    # Make predictions and evaluate
    predictions = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, predictions)
    accuracies.append(accuracy)

    # Calculate and append macro-averaged metrics
    f1_scores.append(f1_score(y_test, predictions, average='macro'))
    precisions.append(precision_score(y_test, predictions, average='macro'))
    recalls.append(recall_score(y_test, predictions, average='macro'))

    # Confusion matrix for this fold (the accumulator existed but was never
    # filled before).
    cms.append(confusion_matrix(y_test, predictions))

    # Probability of the class in column 1, computed once and reused for both
    # curves instead of calling predict_proba twice on the same input.
    # NOTE(review): assumes a binary target where column 1 is the positive
    # class - confirm against the dataset's labels.
    positive_scores = model.predict_proba(X_test_scaled)[:, 1]

    # ROC and precision-recall curves with their areas
    fpr_array, tpr_array, _ = roc_curve(y_test, positive_scores)
    roc_aucs.append(auc(fpr_array, tpr_array))
    precision_array, recall_array, _ = precision_recall_curve(y_test, positive_scores)
    prc_aucs.append(auc(recall_array, precision_array))

    # Calculate mean FPR and TPR over the ROC thresholds for the fold and append
    fprs.append(np.mean(fpr_array))
    tprs.append(np.mean(tpr_array))

    # Store arrays for each fold as plain lists so they are JSON-serializable
    fpr_lists.append(fpr_array.tolist())
    tpr_lists.append(tpr_array.tolist())
    recall_lists.append(recall_array.tolist())
    precision_lists.append(precision_array.tolist())

     

# Per-fold scalar series, keyed by the label under which their mean is reported.
fold_series = {
    'Average F1 Score': f1_scores,
    'Average Precision': precisions,
    'Average Recall': recalls,
    'Average True Positive Rate': tprs,
    'Average False Positive Rate': fprs,
    'Average ROC AUC': roc_aucs,
    'Average PRC AUC': prc_aucs,
    'Average Accuracy': accuracies,
}

# Average each series across folds, then attach the raw per-fold curve points.
metrics = {label: np.mean(series) for label, series in fold_series.items()}
metrics['FPR Lists'] = fpr_lists
metrics['TPR Lists'] = tpr_lists
metrics['Recall Lists'] = recall_lists
metrics['Precision Lists'] = precision_lists

# Write the report as indented JSON.
with open('logistic_regression_evaluation_metrics_real_life_PCA.json', 'w') as file:
    file.write(json.dumps(metrics, indent=4))

# Print summary of metrics
#print("Summary of metrics:", json.dumps(metrics, indent=4))

def _render_curve(x_values, y_values, legend_text, x_label, y_label, heading,
                  legend_loc, out_path, chance_line=False):
    """Draw one curve on a new 10x5 figure, save it to *out_path*, and show it.

    When *chance_line* is true, a dashed black diagonal from (0, 0) to (1, 1)
    is drawn as well.
    """
    plt.figure(figsize=(10, 5))
    plt.plot(x_values, y_values, label=legend_text)
    if chance_line:
        plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(heading)
    plt.legend(loc=legend_loc)
    plt.savefig(out_path)
    plt.show()


# Visualize ROC and PRC for the last fold; the curve arrays are whatever the
# final loop iteration left behind.
_render_curve(
    fpr_array,
    tpr_array,
    f"ROC curve (area = {roc_aucs[-1]:.2f})",
    'False Positive Rate',
    'True Positive Rate',
    'ROC Curve',
    "lower right",
    'logistic_regression_roc_curve_real_life_PCA.jpg',
    chance_line=True,
)

_render_curve(
    recall_array,
    precision_array,
    f"PRC curve (area = {prc_aucs[-1]:.2f})",
    'Recall',
    'Precision',
    'Precision-Recall Curve',
    "lower left",
    'logistic_regression_prc_curve_real_life_PCA.jpg',
)

# Save the model.
# NOTE: `model` is the estimator fitted during the final fold only; it is not
# refit on the full dataset.
dump(model, 'logistic_regression_model_real_life_PCA.joblib')

# The model was trained on standardized inputs, so new data must go through
# the same scaler before prediction. Persist the scaler fitted in that same
# final fold alongside the model; without it the saved model cannot be
# applied correctly.
dump(scaler, 'logistic_regression_scaler_real_life_PCA.joblib')