## Deploying Models

### 1- Data preprocessing

In [None]:
import pandas as pd
import numpy as np
# Columns removed right after loading (treated as unnecessary in this notebook)
unused_columns = [
    'City of the Provider',
    'State Code of the Provider',
    'Provider Type',
    'HCPCS Code',
]

# Read the cleaned CSV and drop the unused columns in a single expression
df = pd.read_csv('Healthcare_Providers_cleaned.csv').drop(columns=unused_columns)

In [None]:
# Bare expression: the notebook renders the current contents of df as the cell output
df

In [None]:
# Categorical columns that are one-hot encoded in the next step
binary_columns = [
    'Entity Type of the Provider',
    'Place of Service',
    'HCPCS Drug Indicator',
]

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Create the OneHotEncoder object. drop='first' removes the first category of
# each feature, and dense output is requested so the result can be wrapped in
# a DataFrame directly.
# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, where passing it raises
# TypeError. Try the current spelling first and fall back for older releases.
try:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    encoder = OneHotEncoder(sparse=False, drop='first')

# Fit and transform the data
encoded_columns = encoder.fit_transform(df[binary_columns])

# Create DataFrame with encoded features. Reusing df's index guarantees that
# the axis=1 concat below pairs rows one-to-one even if df's index is not the
# default 0..n-1 range (concat aligns on index labels, not on position).
encoded_df = pd.DataFrame(
    encoded_columns,
    columns=encoder.get_feature_names_out(binary_columns),
    index=df.index,
)

# Combine the original DataFrame with the encoded DataFrame
# Drop the original categorical columns from df and concatenate with encoded_df
df_encoded = pd.concat([df.drop(columns=binary_columns), encoded_df], axis=1)

df_encoded

In [None]:
from sklearn.preprocessing import RobustScaler

# Fit a RobustScaler on every column of df_encoded and transform in one step
robust_scaler = RobustScaler()
scaled_values = robust_scaler.fit_transform(df_encoded)

# Wrap the scaled array back into a DataFrame that carries the original column names
df_encoded_scaled = pd.DataFrame(scaled_values, columns=df_encoded.columns)

# Show the scaled DataFrame as the cell output
df_encoded_scaled

In [None]:
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans, DBSCAN
from sklearn.ensemble import IsolationForest
from sklearn.metrics import pairwise_distances_argmin_min
import matplotlib.pyplot as plt



# Dimensionality reduction: two single-step pipelines, one keeping 2 principal
# components and one keeping 3.
pca_pipeline_2d = Pipeline(steps=[('pca', PCA(n_components=2))])
pca_pipeline_3d = Pipeline(steps=[('pca', PCA(n_components=3))])

# Each pipeline is fitted independently on the same scaled DataFrame
X_pca_2d = pca_pipeline_2d.fit_transform(df_encoded_scaled)
X_pca_3d = pca_pipeline_3d.fit_transform(df_encoded_scaled)

# Elbow Method for K-Means: fit one model per k in 1..10 on the 2D projection
# and record its inertia.
k_range = range(1, 11)
inertia = []
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X_pca_2d)
    inertia.append(kmeans.inertia_)

# Plot inertia against k using the Axes API
elbow_fig, elbow_ax = plt.subplots(figsize=(12, 9))
elbow_ax.plot(k_range, inertia, marker='o')
elbow_ax.set_xlabel('Number of clusters')
elbow_ax.set_ylabel('Inertia')
elbow_ax.set_title('Elbow Method for Optimal K')
elbow_ax.grid(True)
plt.show()

# K-Means Clustering with 3 clusters.
# The same estimator object is fitted twice: first on the 2D projection, then
# on the 3D projection. After the second call `kmeans` holds the 3D model.
kmeans = KMeans(random_state=42, n_clusters=3)
labels_kmeans_2d = kmeans.fit_predict(X_pca_2d)
labels_kmeans_3d = kmeans.fit_predict(X_pca_3d)

# Distance from every 3D point to its closest centroid of the 3D model.
# pairwise_distances_argmin_min returns (indices, distances); only the
# distances are kept.
nearest_centroid_result = pairwise_distances_argmin_min(X_pca_3d, kmeans.cluster_centers_)
distances_to_centers = nearest_centroid_result[1]

# Points farther from their centroid than the 90th percentile are flagged
distance_threshold = np.percentile(distances_to_centers, 90)
is_anomaly_kmeans = np.greater(distances_to_centers, distance_threshold)

# k-Distance Graph for DBSCAN
def plot_k_distance_graph(X, k=5):
    """Plot the sorted distance from each sample to its k-th returned neighbor.

    Args:
        X: Feature matrix passed to ``NearestNeighbors`` both for fitting and
            for the neighbor query.
        k: Number of neighbors requested per sample; the last (farthest) of
            them supplies the plotted distance.

    Returns:
        None. The figure is shown with ``plt.show()``.

    Note:
        The query is run on the same ``X`` the model was fitted on.
        NOTE(review): per the scikit-learn docs, a sample is then presumably
        counted as its own nearest neighbor - confirm if the exact k matters.
    """
    knn_model = NearestNeighbors(n_neighbors=k).fit(X)
    neighbor_distances = knn_model.kneighbors(X)[0]

    # Last column = farthest of the k returned neighbors; sort ascending for the curve
    sorted_kth = np.sort(neighbor_distances[:, -1])

    fig, ax = plt.subplots(figsize=(12, 9))
    ax.plot(sorted_kth)
    ax.set_xlabel('Points sorted by distance to {}-th nearest neighbor'.format(k))
    ax.set_ylabel('Distance')
    ax.set_title('k-Distance Graph')
    ax.grid(True)
    plt.show()

# Plot k-distance graph for 2D data
plot_k_distance_graph(X_pca_2d, k=5)

# DBSCAN Clustering on the 2D projection
dbscan = DBSCAN(min_samples=5, eps=0.05)
labels_dbscan = dbscan.fit_predict(X_pca_2d)

# Isolation Forest on the 2D projection; predictions equal to -1 are the
# ones treated as anomalies
iso_forest = IsolationForest(random_state=42, contamination=0.1)
outliers = iso_forest.fit_predict(X_pca_2d)
is_anomaly_iso = np.equal(outliers, -1)

# DataFrames for visualization: PCA coordinates plus each model's labels/flags
pc_cols_2d = ['PC 1', 'PC 2']
pc_cols_3d = ['PC 1', 'PC 2', 'PC 3']

# K-Means: the single is_anomaly_kmeans flag array is attached to both the
# 2D frame and the 3D frame.
df_kmeans_2d = pd.DataFrame(X_pca_2d, columns=pc_cols_2d).assign(
    Cluster=labels_kmeans_2d,
    Anomaly=is_anomaly_kmeans,
)
df_kmeans_3d = pd.DataFrame(X_pca_3d, columns=pc_cols_3d).assign(
    Cluster=labels_kmeans_3d,
    Anomaly=is_anomaly_kmeans,
)

# DBSCAN: rows labelled -1 are the ones flagged as anomalies
df_dbscan = pd.DataFrame(X_pca_2d, columns=pc_cols_2d).assign(Cluster=labels_dbscan)
df_dbscan['Anomaly'] = df_dbscan['Cluster'] == -1

# Isolation Forest: only the anomaly flag is stored
df_iso = pd.DataFrame(X_pca_2d, columns=pc_cols_2d).assign(Anomaly=is_anomaly_iso)

# Visualization Functions

def plot_pie_chart(df, title):
    """Show a pie chart of anomalous versus normal rows.

    Args:
        df: DataFrame with an 'Anomaly' column whose sum is the number of
            anomalous rows.
        title: Title drawn above the chart.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    n_anomalies = df['Anomaly'].sum()
    # Everything that is not flagged counts as normal
    slice_sizes = {'Anomaly': n_anomalies, 'Normal': len(df) - n_anomalies}

    plt.figure(figsize=(8, 8))
    plt.pie(
        list(slice_sizes.values()),
        labels=list(slice_sizes.keys()),
        colors=['red', 'blue'],
        autopct='%1.1f%%',
        startangle=140,
    )
    plt.title(title)
    plt.show()

# K-Means Visualizations

# The 'Anomaly' column is boolean. Plotly Express treats non-numeric colour
# columns as categorical, so a `color_continuous_scale` is not applied to it:
# colours would come from the default discrete sequence in order of first
# appearance, and the legend would read True/False under a label that talks
# about 0/1. The flag is therefore recoded to the strings '0'/'1' on a copy
# (the source frame is left untouched) and coloured through an explicit
# discrete map, with a fixed legend order.

# Plot K-Means 2D Results
fig_kmeans_2d = px.scatter(
    df_kmeans_2d.assign(Anomaly=df_kmeans_2d['Anomaly'].astype(int).astype(str)),
    x='PC 1', y='PC 2', color='Anomaly',
    color_discrete_map={'0': 'blue', '1': 'red'},
    category_orders={'Anomaly': ['0', '1']},
    title='K-Means 2D Anomalies (PCA Reduced)',
    labels={'Anomaly': 'Anomaly (0 = Normal, 1 = Anomaly)'})
fig_kmeans_2d.show()

# Plot K-Means 3D Results
fig_kmeans_3d = px.scatter_3d(
    df_kmeans_3d.assign(Anomaly=df_kmeans_3d['Anomaly'].astype(int).astype(str)),
    x='PC 1', y='PC 2', z='PC 3', color='Anomaly',
    color_discrete_map={'0': 'blue', '1': 'red'},
    category_orders={'Anomaly': ['0', '1']},
    title='K-Means 3D Anomalies (PCA Reduced)',
    labels={'Anomaly': 'Anomaly (0 = Normal, 1 = Anomaly)'})
# Add K-Means cluster centroids (the centres held by `kmeans`, named as three
# principal-component columns) as an extra trace
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=['PC 1', 'PC 2', 'PC 3'])
fig_kmeans_3d.add_scatter3d(x=centroids['PC 1'], y=centroids['PC 2'], z=centroids['PC 3'],
                         mode='markers', marker=dict(size=12, color='red', symbol='cross'),
                         name='Centroids')
fig_kmeans_3d.show()

# DBSCAN Visualizations

# Plot DBSCAN 2D Results
# 'Anomaly' is boolean here, which Plotly Express colours as a category, so a
# continuous colour scale would be ignored and the legend would show
# True/False. Recode the flag to '0'/'1' on a copy and map the two values to
# fixed colours so the plot matches its "0 = Normal, 1 = Anomaly" label.
fig_dbscan_2d = px.scatter(
    df_dbscan.assign(Anomaly=df_dbscan['Anomaly'].astype(int).astype(str)),
    x='PC 1', y='PC 2', color='Anomaly',
    color_discrete_map={'0': 'blue', '1': 'red'},
    category_orders={'Anomaly': ['0', '1']},
    title='DBSCAN 2D Anomalies (PCA Reduced)',
    labels={'Anomaly': 'Anomaly (0 = Normal, 1 = Anomaly)'})
fig_dbscan_2d.show()

# Plot DBSCAN 3D Results: the 2D DBSCAN labels are drawn on the 3D PCA
# coordinates. 'Cluster' holds the label array returned by fit_predict and is
# coloured with the continuous blue -> red scale.
dbscan_3d_frame = pd.DataFrame(X_pca_3d, columns=['PC 1', 'PC 2', 'PC 3']).assign(Cluster=labels_dbscan)
fig_dbscan_3d = px.scatter_3d(dbscan_3d_frame,
                          x='PC 1', y='PC 2', z='PC 3', color='Cluster',
                          color_continuous_scale=['blue', 'red'],
                          title='DBSCAN 3D Anomalies (PCA Reduced)',
                          labels={'Cluster': 'Cluster ID'})
fig_dbscan_3d.show()

# Isolation Forest Visualizations

# The anomaly flag is boolean. Plotly Express colours non-numeric columns as
# categories, so a continuous colour scale would be ignored and the legend
# would show True/False. In both plots the flag is recoded to '0'/'1' and
# mapped to fixed colours so they match the "0 = Normal, 1 = Anomaly" label.

# Plot Isolation Forest 2D Results
fig_iso_2d = px.scatter(
    df_iso.assign(Anomaly=df_iso['Anomaly'].astype(int).astype(str)),
    x='PC 1', y='PC 2', color='Anomaly',
    color_discrete_map={'0': 'blue', '1': 'red'},
    category_orders={'Anomaly': ['0', '1']},
    title='Isolation Forest 2D Anomalies (PCA Reduced)',
    labels={'Anomaly': 'Anomaly (0 = Normal, 1 = Anomaly)'})
fig_iso_2d.show()

# Plot Isolation Forest 3D Results: the flags from the 2D fit are drawn on the
# 3D PCA coordinates
iso_3d_frame = pd.DataFrame(X_pca_3d, columns=['PC 1', 'PC 2', 'PC 3']).assign(
    Anomaly=is_anomaly_iso.astype(int).astype(str))
fig_iso_3d = px.scatter_3d(
    iso_3d_frame,
    x='PC 1', y='PC 2', z='PC 3', color='Anomaly',
    color_discrete_map={'0': 'blue', '1': 'red'},
    category_orders={'Anomaly': ['0', '1']},
    title='Isolation Forest 3D Anomalies (PCA Reduced)',
    labels={'Anomaly': 'Anomaly (0 = Normal, 1 = Anomaly)'})
fig_iso_3d.show()

# Draw one anomaly-vs-normal pie chart per model, in the same order as before
pie_chart_inputs = (
    (df_kmeans_2d, 'K-Means Anomaly vs Normal Distribution (2D)'),
    (df_dbscan, 'DBSCAN Anomaly vs Normal Distribution (2D)'),
    (df_iso, 'Isolation Forest Anomaly vs Normal Distribution (2D)'),
)
for model_frame, chart_title in pie_chart_inputs:
    plot_pie_chart(model_frame, chart_title)
