Load the libraries required

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset

In [None]:
# Path of the claims workbook, kept in one named constant so it is easy to change.
# NOTE(review): this looks like a Google Colab Drive mount path — adjust it when
# running the notebook anywhere else.
CLAIMS_DATA_PATH = "/content/drive/MyDrive/Sample_Claims_Data.xlsx"
data = pd.read_excel(CLAIMS_DATA_PATH)

In [None]:
# Preview only the first rows instead of rendering the entire frame.
data.head()

# 1. Exploratory Data Analysis

In [None]:
# Column labels of the claims frame as a plain Python list
data.columns.tolist()

In [None]:
# Summary statistics for the claims frame
data.describe()

In [None]:
# Structural overview of the claims frame (printed by DataFrame.info)
data.info()

Checking for missing values

In [None]:
# Number of missing entries in each column
missing_per_column = data.isnull().sum()
print(missing_per_column)

# 1.1 Categorical Feature Distribution (Location, Diagnosis Code, Procedure Code)


Location distribution

In [None]:
# Stand-alone figure: with the inline notebook backend a figure is closed when
# its cell finishes, so a 1x2 subplot grid cannot be shared with a later cell
# (the right-hand panel would stay empty). Draw this chart on its own figure.
plt.figure(figsize=(15, 6))
sns.countplot(data=data, x='Location', palette='Set2')
plt.title('Location Distribution')
plt.tight_layout()
plt.show()

Diagnosis code distribution (the 10 least common and the 10 most common diagnosis codes)

In [None]:
# Own figure for this cell: calling plt.subplot(1, 2, 2) on a fresh figure would
# squeeze the chart into the right half and leave the left half blank.
plt.figure(figsize=(12, 6))

# value_counts() sorts by descending frequency, so tail(10) holds the 10 least
# common codes; the variable is named accordingly.
bottom_diagnosis_codes = data['Diagnosis Code'].value_counts().tail(10)
sns.barplot(x=bottom_diagnosis_codes.index, y=bottom_diagnosis_codes.values, palette='Set1')
plt.title('Bottom 10 Diagnosis Codes')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Own figure for this cell: calling plt.subplot(1, 2, 2) on a fresh figure would
# squeeze the chart into the right half and leave the left half blank.
plt.figure(figsize=(12, 6))

# value_counts() sorts by descending frequency, so head(10) holds the 10 most
# common codes. The title previously read "Bottom 10", which mislabeled the chart.
top_diagnosis_codes = data['Diagnosis Code'].value_counts().head(10)
sns.barplot(x=top_diagnosis_codes.index, y=top_diagnosis_codes.values, palette='Set1')
plt.title('Top 10 Diagnosis Codes')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

# 1.2 Numerical Feature Distribution (Claim Amount, Member Age, Previous Claims)

Distribution of claim amounts

In [None]:
# Stand-alone figure: a subplot grid opened here cannot be completed by a later
# cell under the inline notebook backend, so this histogram gets its own figure
# and is rendered explicitly.
plt.figure(figsize=(8, 6))
sns.histplot(data['Claim Amount'], bins=30, kde=True, color='blue')
plt.title('Claim Amount Distribution')
plt.tight_layout()
plt.show()

Distribution of member age

In [None]:
# Own figure for this cell: calling plt.subplot(1, 2, 2) on a fresh figure would
# squeeze the histogram into the right half and leave the left half blank.
plt.figure(figsize=(8, 6))
sns.histplot(data['Member Age'], bins=30, kde=True, color='green')
plt.title('Member Age Distribution')

plt.tight_layout()
plt.show()

# 1.3 Correlation Heatmap for Numerical Features

In [None]:
# Pairwise correlations between the numeric columns and the fraud label
numeric_columns = ['Claim Amount', 'Previous Claims', 'Member Age', 'Fraudulent']
correlation_matrix = data[numeric_columns].corr()

# Annotated heatmap drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap for Numerical Features')
plt.show()

# 2. Data Preprocessing

Convert date columns to datetime format

In [None]:
# Parse both date columns into pandas datetimes, in the same order as before
for date_column in ('Date of Service', 'Claim Submission Date'):
    data[date_column] = pd.to_datetime(data[date_column])


# 3. Feature Engineering

Create a new feature for claim amount being unusually high (flagging high claims)

In [None]:
# Flag claims whose amount lies more than two standard deviations above the mean
claim_amounts = data['Claim Amount']
mean_claim_amount = claim_amounts.mean()
std_claim_amount = claim_amounts.std()
high_claim_threshold = mean_claim_amount + 2 * std_claim_amount
data['High Claim Amount'] = claim_amounts.gt(high_claim_threshold).astype(int)


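Create a flag marking whether the Procedure Code is valid (non-zero). Note: this is a simple rule-based placeholder; it does not yet compare the Diagnosis and Procedure Codes against each other to detect a mismatch.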

In [None]:
# 1 when the procedure code differs from 0, otherwise 0
procedure_is_nonzero = data['Procedure Code'].ne(0)
data['Procedure Code Valid'] = procedure_is_nonzero.astype(int)

Create a flag for frequent claim submissions

In [None]:
# 1 when the member has more than 3 previous claims, otherwise 0
has_many_previous_claims = data['Previous Claims'].gt(3)
data['Frequent Claim'] = has_many_previous_claims.astype(int)

Label encoding: convert the categorical `Location` column to numeric codes

In [None]:
# Replace the Location values with the integer codes produced by a LabelEncoder
label_encoder = LabelEncoder()
location_codes = label_encoder.fit_transform(data['Location'])
data['Location'] = location_codes

# 4. Split the dataset into train and test sets

In [None]:
# Model inputs: the raw numeric columns, the engineered flags and the encoded location
feature_columns = [
    'Claim Amount',
    'Previous Claims',
    'Member Age',
    'High Claim Amount',
    'Procedure Code Valid',
    'Frequent Claim',
    'Location',
]
X = data[feature_columns]
y = data['Fraudulent']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


Function to train and evaluate the different classification models, using several metrics to check performance.

In [None]:
def train_and_evaluate_model(model, model_name, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and report its test-split performance.

    Prints a classification report, the confusion matrix, the ROC-AUC score and
    the F1-score, and draws a precision-recall curve for the test split.

    Parameters
    ----------
    model
        Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    model_name
        Label used in the printed output and in the plot legend.
    X_train, y_train
        Features and target the model is fitted on.
    X_test, y_test
        Features and target the metrics are computed on.

    Returns
    -------
    The ``model`` object that was passed in, after ``fit`` has been called on it.
    """
    model.fit(X_train, y_train)
    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba is used as the positive-class score.
    positive_scores = model.predict_proba(X_test)[:, 1]

    # Text metrics
    print(f"{model_name} Classification Report:")
    print(classification_report(y_test, predicted_labels))
    print(f"Confusion Matrix ({model_name}):")
    print(confusion_matrix(y_test, predicted_labels))
    roc_auc = roc_auc_score(y_test, positive_scores)
    print(f"ROC-AUC Score ({model_name}): {roc_auc:.2f}")

    # Precision-recall curve, drawn on an explicit Axes
    precision, recall, _ = precision_recall_curve(y_test, positive_scores)
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(recall, precision, label=f'{model_name}')
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title('Precision-Recall Curve')
    ax.legend()
    plt.show()

    # F1-score is reported last, after the plot
    print(f"F1-Score for {model_name}: {f1_score(y_test, predicted_labels):.2f}")

    return model

# 5. Model Building - Logistic Regression, Random Forest, Gradient Boosting, and SVM

In [None]:
# Candidate classifiers. The tree ensembles and the SVM share random_state=42.
logreg_model = LogisticRegression(max_iter=1000)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
# probability=True is needed because the evaluation function calls predict_proba.
svm_model = SVC(probability=True, random_state=42)

Train and evaluate each model: classification report (including accuracy), confusion matrix, ROC-AUC, precision-recall curve, and F1 score.

In [None]:
# (display name, estimator) pairs, in the order they are trained and reported
candidate_models = [
    ("Logistic Regression", logreg_model),
    ("Random Forest", rf_model),
    ("Gradient Boosting", gb_model),
    ("SVM", svm_model),
]

# Train and evaluate each candidate in turn, then rebind the four model names
# to whatever the evaluation function returned for them.
logreg_model, rf_model, gb_model, svm_model = [
    train_and_evaluate_model(estimator, display_name, X_train, X_test, y_train, y_test)
    for display_name, estimator in candidate_models
]

# 6. Hyperparameter Tuning (example for Random Forest using GridSearchCV)

In [None]:
# Hyperparameter grid for the Random Forest:
# 3 * 4 * 3 * 3 * 2 = 216 candidate combinations.
param_grid_rf = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# 5-fold cross-validated search over the grid, ranked by ROC-AUC.
# With 216 candidates and cv=5 this is an expensive cell to re-run.
grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid_rf, cv=5, n_jobs=-1, scoring='roc_auc')
# Tuning only sees the training split; the test split stays untouched here.
grid_search_rf.fit(X_train, y_train)

print(f"Best Parameters for Random Forest: {grid_search_rf.best_params_}")
print(f"Best Cross-Validation Score for Random Forest: {grid_search_rf.best_score_:.2f}")

Train the optimized Random Forest model (Random Forest had the highest accuracy among the models evaluated).

In [None]:
# GridSearchCV refits the winning configuration on the data passed to its fit()
# call (refit=True is the default), so best_estimator_ is already trained on
# X_train / y_train. The extra best_rf_model.fit(...) that used to be here only
# repeated that work and has been removed.
best_rf_model = grid_search_rf.best_estimator_

# Hard class predictions and positive-class scores on the held-out test split
y_pred_rf = best_rf_model.predict(X_test)
rf_test_scores = best_rf_model.predict_proba(X_test)[:, 1]
print(f"Optimized Random Forest ROC-AUC Score: {roc_auc_score(y_test, rf_test_scores):.2f}")


# 7. Risk Scoring

Assign a risk score to each claim: the predicted probability of fraud from the chosen (Random Forest) model.

In [None]:
# Risk score = column 1 of predict_proba for every row of the full feature frame X.
# NOTE(review): X also contains the rows the model was trained on, so the scores
# for those rows are in-sample — confirm this is intended.
data['Risk Score'] = best_rf_model.predict_proba(X)[:, 1]

Show the top and bottom 10 risk scores

In [None]:
# Top 10 claims with the highest risk scores
top_10_risky_claims = data[['Claim ID', 'Risk Score']].sort_values(by='Risk Score', ascending=False).head(10)
print("\nTop 10 Claims with the Highest Risk Scores:")
print(top_10_risky_claims)

In [None]:
# Bottom 10 claims with the lowest risk scores
bottom_10_risky_claims = data[['Claim ID', 'Risk Score']].sort_values(by='Risk Score', ascending=False).tail(10)
print("\nBottom 10 Claims with the Lowest Risk Scores:")
print(bottom_10_risky_claims)

# 8. Cross-Validation (for Random Forest)

In [None]:
# 5-fold cross-validated ROC-AUC of the tuned Random Forest on the full dataset (X, y).
# NOTE(review): the hyperparameters were tuned on X_train, which is a subset of X,
# so these scores may be somewhat optimistic — confirm whether that matters here.
cv_scores_rf = cross_val_score(best_rf_model, X, y, cv=5, scoring='roc_auc')
print(f"Random Forest Cross-Validation ROC-AUC Scores: {cv_scores_rf}")
print(f"Mean ROC-AUC Score for Random Forest: {cv_scores_rf.mean():.2f}")

# 9. Feature Importance

In [None]:
# Importance of each input column according to the tuned Random Forest
features = X.columns
feature_importances = best_rf_model.feature_importances_

# One bar per feature, in the column order of X, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=features, y=feature_importances, ax=ax)
ax.set_title('Feature Importance ( Random Forest)')
plt.xticks(rotation=45)
plt.show()