In [1]:
import numpy as np
import pandas as pd

In [2]:
import numpy as np

def load_numpy_array(file_path: str) -> np.ndarray:
    """
    Loads a NumPy array from a .npy file.

    Args:
        file_path (str): Path to the .npy file.

    Returns:
        np.ndarray: Loaded NumPy array.
    """
    try:
        array = np.load(file_path)
        print(f"✅ Successfully loaded: {file_path}")
        return array
    except Exception as e:
        print(f"❌ Failed to load {file_path}: {e}")
        raise


In [3]:
transforemd_X_test=pd.read_csv("../Artifacts/data_transformation/transformed/transformed_test.csv")
transforemd_X_train=pd.read_csv("../Artifacts/data_transformation/transformed/transformed_train.csv")
X_train=load_numpy_array("../Artifacts/data_vectorization/train.npy")
X_test=load_numpy_array("../Artifacts/data_vectorization/test.npy")
y_train=transforemd_X_train.iloc[:,-1]
y_test=transforemd_X_test.iloc[:,-1]

✅ Successfully loaded: ../Artifacts/data_vectorization/train.npy
✅ Successfully loaded: ../Artifacts/data_vectorization/test.npy


In [4]:
# Count positive and negative samples
n_pos = np.sum(y_train == 1)
n_neg = np.sum(y_train == 0)

# Calculate scale_pos_weight
scale_pos_weight = n_neg / n_pos
print(f"scale_pos_weight: {scale_pos_weight:.2f}")

scale_pos_weight: 4.56


In [6]:
X_train.shape

(352719, 100)

In [7]:
X_test.shape

(88180, 100)

In [8]:
y_test.shape

(88180,)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix
)

# Train the model
lf = LogisticRegression(max_iter=1000,class_weight="balanced")  # Increase max_iter to avoid convergence warnings
lf.fit(X_train, y_train)

# Make predictions
y_pred = lf.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Print results
print("✅ Model Evaluation Metrics:")
print(f"📊 Accuracy:  {accuracy:.4f}")
print(f"📊 Precision: {precision:.4f}")
print(f"📊 Recall:    {recall:.4f}")
print(f"📊 F1 Score:  {f1:.4f}\n")

print("📄 Classification Report:\n")
print(classification_report(y_test, y_pred, zero_division=0))

print("🧩 Confusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))


✅ Model Evaluation Metrics:
📊 Accuracy:  0.7927
📊 Precision: 0.8562
📊 Recall:    0.7927
📊 F1 Score:  0.8113

📄 Classification Report:

              precision    recall  f1-score   support

           0       0.94      0.79      0.86     72346
           1       0.46      0.78      0.58     15834

    accuracy                           0.79     88180
   macro avg       0.70      0.79      0.72     88180
weighted avg       0.86      0.79      0.81     88180

🧩 Confusion Matrix:

[[57477 14869]
 [ 3410 12424]]


In [9]:
import xgboost as xgb
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix
)

# Initialize classifier
xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',  # or 'multi:softmax' for multi-class
    eval_metric='logloss',
    scale_pos_weight=4.56,           # Will update below for imbalance
    n_estimators=100,
    learning_rate=0.1,
    max_depth=10,
    random_state=42
)


# Fit the model
xgb_clf.fit(X_train, y_train)

# Predict
y_pred = xgb_clf.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("✅ XGBoost Evaluation:")
print(f"📊 Accuracy:  {accuracy:.4f}")
print(f"📊 Precision: {precision:.4f}")
print(f"📊 Recall:    {recall:.4f}")
print(f"📊 F1 Score:  {f1:.4f}\n")

print("📄 Classification Report:\n")
print(classification_report(y_test, y_pred, zero_division=0))

print("🧩 Confusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))


✅ XGBoost Evaluation:
📊 Accuracy:  0.8386
📊 Precision: 0.8745
📊 Recall:    0.8386
📊 F1 Score:  0.8497

📄 Classification Report:

              precision    recall  f1-score   support

           0       0.95      0.85      0.90     72346
           1       0.53      0.79      0.64     15834

    accuracy                           0.84     88180
   macro avg       0.74      0.82      0.77     88180
weighted avg       0.87      0.84      0.85     88180

🧩 Confusion Matrix:

[[61407 10939]
 [ 3297 12537]]


In [4]:
import sklearn
import imblearn

print("scikit-learn version:", sklearn.__version__)
print("imbalanced-learn version:", imblearn.__version__)


scikit-learn version: 1.3.2
imbalanced-learn version: 0.11.0


In [6]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Step 2: Apply Random Undersampling on training data
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

print(X_resampled.shape)

# Step 3: Train XGBoost on balanced data
xgb_clf = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42
)
xgb_clf.fit(X_resampled, y_resampled)

# Step 4: Evaluate on original test set
y_pred = xgb_clf.predict(X_test)

# Step 5: Evaluation Metrics
print("✅ XGBoost Evaluation After Undersampling:")
print("📊 Accuracy: ", round(accuracy_score(y_test, y_pred), 4))
print("📊 Precision:", round(precision_score(y_test, y_pred), 4))
print("📊 Recall:   ", round(recall_score(y_test, y_pred), 4))
print("📊 F1 Score: ", round(f1_score(y_test, y_pred), 4))

print("\n📄 Classification Report:\n")
print(classification_report(y_test, y_pred))

print("🧩 Confusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))


(126942, 100)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ XGBoost Evaluation After Undersampling:
📊 Accuracy:  0.7987
📊 Precision: 0.4665
📊 Recall:    0.845
📊 F1 Score:  0.6012

📄 Classification Report:

              precision    recall  f1-score   support

           0       0.96      0.79      0.87     72346
           1       0.47      0.85      0.60     15834

    accuracy                           0.80     88180
   macro avg       0.71      0.82      0.73     88180
weighted avg       0.87      0.80      0.82     88180

🧩 Confusion Matrix:

[[57046 15300]
 [ 2454 13380]]


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Create and train the model
rf = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=100,
    max_depth=7,
    random_state=42
)

rf.fit(X_train, y_train)

# Predict
y_pred = rf.predict(X_test)
# Accuracy
print("✅ Accuracy:", accuracy_score(y_test, y_pred))

# Classification report
print("\n📊 Classification Report:\n")
print(classification_report(y_test, y_pred))

# Confusion matrix
print("🧮 Confusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))


✅ Accuracy: 0.7480834656384667

📊 Classification Report:

              precision    recall  f1-score   support

           0       0.96      0.73      0.83     72346
           1       0.40      0.85      0.55     15834

    accuracy                           0.75     88180
   macro avg       0.68      0.79      0.69     88180
weighted avg       0.86      0.75      0.78     88180

🧮 Confusion Matrix:

[[52479 19867]
 [ 2347 13487]]
