In [None]:
# KNN Implementation 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import random
from matplotlib.font_manager import FontProperties
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score
from imblearn.over_sampling import SMOTE
from collections import defaultdict

#--------------------------------------------------------------------------------
# Read the data and build one log-transformed (R1, R2) frame per flow regime
data = pd.read_csv('DataLocation...', delimiter=';')

flow_regimes = ['Data1', 'Data2', 'Data3']
df = {}
x_values = {}
y_values = {}

for regime in flow_regimes:
    # Rows tagged with this regime, restricted to the two feature columns
    regime_mask = (data['O1'] == 'Data') & (data['O2'] == regime)
    regime_rows = data.loc[regime_mask, ['R1', 'R2']]

    # Log-transform both features. np.log returns -inf/NaN for values <= 0.
    # NOTE(review): confirm R1 and R2 are strictly positive in the input file.
    df[regime] = np.log(regime_rows)

    x_values[regime] = df[regime]['R1']
    y_values[regime] = df[regime]['R2']

#--------------------------------------------------------------------------------
# Stack the per-regime frames into one feature table plus a matching label vector.
# A single concat over the real frames avoids seeding with an empty DataFrame and
# a dtype-less empty Series (both depend on deprecated pandas behaviour) and
# avoids re-copying the accumulated data on every loop iteration.
features = pd.concat([df[regime] for regime in flow_regimes], ignore_index=True)
labels = pd.Series(
    [regime for regime in flow_regimes for _ in range(len(df[regime]))],
    dtype=object,
)

# Visualize the class balance
class_counts = labels.value_counts()
print("Class Distribution:")
print(class_counts)

plt.bar(class_counts.index, class_counts.values)
plt.xlabel("Flow Regime")
plt.ylabel("Count")
plt.title("Class Distribution")
plt.show()

#--------------------------------------------------------------------------------
# Repeat the KNN experiment over many random splits so the reported accuracy
# does not hinge on one particular train/validation/test partition.
top_results = defaultdict(float)  # random_state -> accuracy on the test split
best_k_values = []                # k selected on the validation split, per run
average_accuracy = 0

random_states = random.sample(range(1, 100), 50)
k_values = list(range(1, 100))

for random_state in random_states:
    # Split the data into training (60%), validation (20%) and testing (20%) sets
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=random_state)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=random_state)  # 0.25 * 0.8 = 0.2

    # Use the training split as-is: train_test_split has already shuffled it.
    # Concatenating the training indices of every StratifiedKFold split would
    # repeat each row (n_splits - 1) times, which distorts both the SMOTE
    # interpolation and the neighbour vote.
    X_train_shuffled = X_train.reset_index(drop=True)
    y_train_shuffled = y_train.reset_index(drop=True)

    # Balance the classes of the training data with SMOTE
    ros = SMOTE(random_state=random_state)
    X_train_resampled, y_train_resampled = ros.fit_resample(X_train_shuffled, y_train_shuffled)

    # Score every feasible k on the validation split
    acc = []
    evaluated_k = []  # k that produced acc[i]; keeps the lookup independent of k_values' start
    for k in k_values:
        if k > len(X_train_resampled):
            continue  # KNN cannot use more neighbours than training samples
        knn = KNeighborsClassifier(n_neighbors=k).fit(X_train_resampled, y_train_resampled)
        y_pred = knn.predict(X_val)
        acc.append(accuracy_score(y_val, y_pred))
        evaluated_k.append(k)

    # First k reaching the highest validation accuracy
    best_k = evaluated_k[acc.index(max(acc))]
    best_k_values.append(best_k)

    # Train a KNN classifier with the best k
    knn = KNeighborsClassifier(n_neighbors=best_k)
    knn.fit(X_train_resampled, y_train_resampled)

    # Make predictions on the test set
    y_pred = knn.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    average_accuracy += accuracy

    # Store the result in the dictionary
    top_results[random_state] = accuracy

# Calculate the average accuracy
average_accuracy /= len(random_states)

print("Average Accuracy:", average_accuracy)

#--------------------------------------------------------------------------------
# Visualizing the results (test split and predictions of the last random state)
# One colour per flow regime, as a real mapping so color_map[regime] works.
# 'C0', 'C1', ... are matplotlib's default colour-cycle names; replace them
# with the project colours as needed.
color_map = {regime: f'C{i}' for i, regime in enumerate(flow_regimes)}

# Attach the predictions to a copy so X_test itself is left unmodified
test_plot_data = X_test.assign(Predicted=y_pred)

# Create a figure with two subplots
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
font = FontProperties(family='Times New Roman', size=12)

# Plot the KNN model results in the first subplot
ax1 = axes[0]
ax1.set_title('Test Data Scatter Plot', fontproperties=font)
ax1.set_xlabel('R1', fontproperties=font)
ax1.set_ylabel('R2', fontproperties=font)

for regime in flow_regimes:
    predicted_as_regime = test_plot_data[test_plot_data['Predicted'] == regime]
    ax1.scatter(predicted_as_regime['R1'], predicted_as_regime['R2'],
                color=color_map[regime], label=regime)

ax1.legend()
ax1.grid(True)

# Plot the actual data in the second subplot
ax2 = axes[1]
ax2.set_title('Actual Data Scatter Plot', fontproperties=font)
ax2.set_xlabel('R1', fontproperties=font)
ax2.set_ylabel('R2', fontproperties=font)

for regime in flow_regimes:
    ax2.scatter(x_values[regime], y_values[regime], s=10, color=color_map[regime], label=regime)

# Legend built from the scatter labels; this neither rebinds the global
# `labels` Series nor adds the same legend artist twice.
ax2.legend(loc='upper right', title='Flow Regimes')

ax2.grid(True)

# Adjust spacing between subplots
plt.subplots_adjust(wspace=0.3)

# Show the figure
plt.show()

In [None]:
# Naive Bayes Implementation
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, accuracy_score
from collections import defaultdict

#--------------------------------------------------------------------------------
# Read the data and build one log-transformed (R1, R2) frame per flow regime
data = pd.read_csv('DataLocation...', delimiter=';')

flow_regimes = ['Data1', 'Data2', 'Data3']
df = {}
x_values = {}
y_values = {}

for regime in flow_regimes:
    # Rows tagged with this regime, restricted to the two feature columns
    regime_mask = (data['O1'] == 'Data') & (data['O2'] == regime)
    regime_rows = data.loc[regime_mask, ['R1', 'R2']]

    # Log-transform both features. np.log returns -inf/NaN for values <= 0.
    # NOTE(review): confirm R1 and R2 are strictly positive in the input file.
    df[regime] = np.log(regime_rows)

    x_values[regime] = df[regime]['R1']
    y_values[regime] = df[regime]['R2']

#--------------------------------------------------------------------------------
# Stack the per-regime frames into one feature table plus a matching label vector.
# A single concat over the real frames avoids seeding with an empty DataFrame and
# a dtype-less empty Series (both depend on deprecated pandas behaviour) and
# avoids re-copying the accumulated data on every loop iteration.
features = pd.concat([df[regime] for regime in flow_regimes], ignore_index=True)
labels = pd.Series(
    [regime for regime in flow_regimes for _ in range(len(df[regime]))],
    dtype=object,
)

# Visualize the class balance
class_counts = labels.value_counts()
print("Class Distribution:")
print(class_counts)

plt.bar(class_counts.index, class_counts.values)
plt.xlabel("Flow Regime")
plt.ylabel("Count")
plt.title("Class Distribution")
plt.show()

#--------------------------------------------------------------------------------
# Repeat the Naive Bayes experiment over many random train/test splits and
# average the metrics so the result does not depend on one particular split.
top_results = defaultdict(float)    # random_state -> accuracy on the test split
accuracy_sum = 0.0
precision_sum = defaultdict(float)  # regime -> precision summed over all runs
recall_sum = defaultdict(float)     # regime -> recall summed over all runs
f1_score_sum = defaultdict(float)   # regime -> F1-score summed over all runs

# Generate 100 random states
random_states = random.sample(range(1, 2000), 100)

for random_state in random_states:
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=random_state)

    # Use the training split as-is: train_test_split has already shuffled it.
    # Concatenating the training indices of every StratifiedKFold split would
    # repeat each row (n_splits - 1) times and feed duplicated points to SMOTE.
    X_train_shuffled = X_train.reset_index(drop=True)
    y_train_shuffled = y_train.reset_index(drop=True)

    # Balance the classes of the training data with SMOTE
    ros = SMOTE(random_state=random_state)
    X_train_resampled, y_train_resampled = ros.fit_resample(X_train_shuffled, y_train_shuffled)

    # Train a Naive Bayes classifier
    model = GaussianNB()
    model.fit(X_train_resampled, y_train_resampled)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Store the result in the dictionary
    top_results[random_state] = accuracy

    # Per-regime precision, recall and F1-score. Passing labels= guarantees an
    # entry for every regime even when a split happens to contain none of it;
    # zero_division=0 scores such a regime as 0 without emitting a warning.
    report = classification_report(
        y_test, y_pred, labels=flow_regimes, output_dict=True, zero_division=0
    )

    # Add precision, recall, and F1-score to the sum
    for label in flow_regimes:
        regime_scores = report[label]
        precision_sum[label] += regime_scores['precision']
        recall_sum[label] += regime_scores['recall']
        f1_score_sum[label] += regime_scores['f1-score']

    # Add accuracy to the sum
    accuracy_sum += accuracy

# Print the resampled data shape (taken from the last run of the loop)
print("Resampled Training Data Shape:", X_train_resampled.shape)

# Check the class distribution after resampling (last run of the loop)
resampled_class_counts = pd.Series(y_train_resampled).value_counts()
print("Class Distribution after Resampling:")
print(resampled_class_counts)

# Calculate average precision, recall, and F1-score
n_runs = len(random_states)
average_precision = {label: precision_sum[label] / n_runs for label in flow_regimes}
average_recall = {label: recall_sum[label] / n_runs for label in flow_regimes}
average_f1_score = {label: f1_score_sum[label] / n_runs for label in flow_regimes}

# Calculate average accuracy
average_accuracy = accuracy_sum / n_runs

# Print the average accuracy
print("Average Accuracy:", average_accuracy)

#--------------------------------------------------------------------------------
# Visualizing the results
# Grouped bar chart: one group per flow regime, one bar per averaged metric
x = range(len(flow_regimes))
width = 0.2

# (legend label, per-regime averages) in the left-to-right order of the bars
metric_series = [
    ('Precision', average_precision),
    ('Recall', average_recall),
    ('F1-Score', average_f1_score),
]

for slot, (metric_name, per_regime) in enumerate(metric_series):
    # Each metric is shifted right by one bar width relative to the previous one
    positions = x if slot == 0 else [i + slot * width for i in x]
    plt.bar(positions, per_regime.values(), width, label=metric_name)

# Tick labels sit under the middle bar of each group
tick_positions = [i + width for i in x]

plt.xlabel('Flow Regime')
plt.ylabel('Score')
plt.title('Average Classification Report')
plt.xticks(tick_positions, flow_regimes)
plt.legend()
plt.show()

In [None]:
# K-means Clustering Implementation

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

#--------------------------------------------------------------------------------
# Read the data and build one log-transformed (R1, R2) frame per flow regime
data = pd.read_csv('DataLocation...', delimiter=';')

flow_regimes = ['Data1', 'Data2', 'Data3']
df = {}
x_values = {}
y_values = {}

for regime in flow_regimes:
    # Rows tagged with this regime, restricted to the two feature columns
    regime_mask = (data['O1'] == 'Data') & (data['O2'] == regime)
    regime_rows = data.loc[regime_mask, ['R1', 'R2']]

    # Log-transform both features. np.log returns -inf/NaN for values <= 0.
    # NOTE(review): confirm R1 and R2 are strictly positive in the input file.
    df[regime] = np.log(regime_rows)

    x_values[regime] = df[regime]['R1']
    y_values[regime] = df[regime]['R2']

#--------------------------------------------------------------------------------
# Stack the per-regime frames into one feature table plus a matching label vector.
# A single concat over the real frames avoids seeding with an empty DataFrame and
# a dtype-less empty Series (both depend on deprecated pandas behaviour) and
# avoids re-copying the accumulated data on every loop iteration.
features = pd.concat([df[regime] for regime in flow_regimes], ignore_index=True)
labels = pd.Series(
    [regime for regime in flow_regimes for _ in range(len(df[regime]))],
    dtype=object,
)

# Visualize the class balance
class_counts = labels.value_counts()
print("Class Distribution:")
print(class_counts)

plt.bar(class_counts.index, class_counts.values)
plt.xlabel("Flow Regime")
plt.ylabel("Count")
plt.title("Class Distribution")
plt.show()

#--------------------------------------------------------------------------------
# Performing K-means
# Split the data into 70% training and 30% testing
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.3, random_state=42)

# Standardize the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split
scaler = StandardScaler()
train_features_scaled = scaler.fit_transform(train_features)
test_features_scaled = scaler.transform(test_features)

# Train the K-means clustering model on the standardized training set.
# n_init is set explicitly: the library default changed between scikit-learn
# releases, so leaving it implicit makes results version-dependent and
# triggers a FutureWarning on some versions.
k = 4
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
kmeans.fit(train_features_scaled)

# Predict the clusters for the training set and test set
train_predictions = kmeans.predict(train_features_scaled)
test_predictions = kmeans.predict(test_features_scaled)

# Define the desired colors for each flow regime
# (keys must match the first word of the cluster_labels values below)
colors = {
    'D1': 'orange',
    'D2': 'red',
    'D3': 'blue',
    'D4': 'green'
}

# Define the cluster labels (cluster index -> display name)
cluster_labels = {
    0: 'D1 Cluster',
    1: 'D2 Cluster',
    2: 'D3 Cluster',
    3: 'D4 Cluster'
}

# Evaluate the model on the test set
test_silhouette = silhouette_score(test_features_scaled, test_predictions)
print(f"Silhouette Coefficient on the test set: {test_silhouette}")

#--------------------------------------------------------------------------------
# Visualizing the results
legend_labels = ['D1', 'D2', 'D3', 'D4']

# Colour used for cluster i, in the same order as legend_labels. The scatter
# points and the legend markers are both drawn from this list so they agree.
cluster_colors = ['orange', 'red', 'blue', 'green']

# Add custom legend
custom_legend = [
    plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=marker_color, markersize=10)
    for marker_color in cluster_colors
]

# Cluster centres mapped back from standardized to original feature units
# (computed once and reused by every plot below)
cluster_centers = scaler.inverse_transform(kmeans.cluster_centers_)

# Both panels live on one figure; it is shown only after both are drawn
plt.figure(figsize=(15, 6))  # Increase the figure width

# Train set with cluster labels
plt.subplot(1, 2, 1)
for i in range(k):
    in_cluster = train_predictions == i
    plt.scatter(
        train_features.loc[in_cluster, 'R1'],
        train_features.loc[in_cluster, 'R2'],
        color=cluster_colors[i % len(cluster_colors)],
    )

plt.title('Train Data with Cluster Labels')
plt.xlabel('Gas Superficial Velocity (mps)')
plt.ylabel('Liquid Superficial Velocity (mps)')
plt.legend(custom_legend, legend_labels)

# Test Set with Predicted Cluster Labels
plt.subplot(1, 2, 2)
for i in range(k):
    in_cluster = test_predictions == i
    plt.scatter(
        test_features.loc[in_cluster, 'R1'],
        test_features.loc[in_cluster, 'R2'],
        color=cluster_colors[i % len(cluster_colors)],
    )
plt.scatter(
    cluster_centers[:, 0],
    cluster_centers[:, 1],
    c='red', marker='*', s=100, label='Cluster Centers'
)
plt.title('Test Set with Predicted Cluster Labels')
plt.xlabel('Gas Superficial Velocity (mps)')
plt.ylabel('Liquid Superficial Velocity (mps)')
plt.legend(custom_legend, legend_labels)

plt.tight_layout()
plt.show()


# Test Set with Predicted Cluster Labels (scatter plus density shading)
plt.figure(figsize=(10, 6))

# Create a color palette for the clusters
palette = sns.color_palette('bright', 4)
legend_labels = ['D1', 'D2', 'D3', 'D4']
# Plot the scatter points for each cluster with colored areas
for i in range(k):
    cluster_points = test_features[test_predictions == i]
    plt.scatter(
        cluster_points['R1'],
        cluster_points['R2'],
        color=palette[i],
        label=f'{legend_labels[i]}'
    )
    # fill= is the current name of the deprecated shade= argument
    sns.kdeplot(
        data=cluster_points,
        x='R1',
        y='R2',
        fill=True,
        color=palette[i],
        alpha=0.6
    )

# Plot the cluster centers as stars
plt.scatter(
    cluster_centers[:, 0],
    cluster_centers[:, 1],
    c='red', marker='*', s=50, label='Cluster Centers'
)

plt.title('Test Set with Predicted Cluster Labels')
plt.xlabel('Gas Superficial Velocity (mps)')
plt.ylabel('Liquid Superficial Velocity (mps)')
#plt.legend()
plt.show()

In [None]:
# CATBOOST & XGBOOST Implementation 

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.utils import shuffle

# Load the dataset from CSV file
data = pd.read_csv('DataLocation...', delimiter=';')

# Map class labels to numeric values
label_map = {'Case': 0, 'Control': 1}
data['Label'] = data['Label'].map(label_map)

# Series.map turns every value missing from label_map into NaN; fail here with
# a clear message instead of letting NaN targets reach the classifier.
if data['Label'].isna().any():
    raise ValueError(
        "Column 'Label' contains values that are not in label_map "
        f"{sorted(label_map)} (or missing entries)"
    )

# Extract the feature variables (X) and target variable (y)
# NOTE(review): this assumes 'Label' is the first column of the file; confirm.
x_values = data.iloc[:, 1:].values  # Use all columns except the first one as features
y_values = data.iloc[:, 0].values  # Use the first column as the target variable

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_values, y_values, test_size=0.2, random_state=34)

# Normalize the feature variables. The scaler is fitted on the training split
# only, so no statistics of the test split leak into the training data.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Print the shapes of the resulting datasets
print("Training set shape:", x_train.shape, y_train.shape)
print("Testing set shape:", x_test.shape, y_test.shape)

# Classifiers to evaluate, in order. Each one is cross-validated, trained and
# scored in turn. XGBoost comes last so the names still bound after the loop
# (model, scores, y_pred, accuracy, ...) refer to the XGBoost run.
models = {
    'CatBoost': CatBoostClassifier(verbose=0),  # verbose=0 silences per-iteration logs
    'XGBoost': XGBClassifier(),
}

for model_name, model in models.items():
    print(f"===== {model_name} =====")

    # Perform 5-fold cross-validation on the training set
    scores = cross_val_score(model, x_train, y_train, cv=5)

    # Print the cross-validation scores
    print("Cross-Validation Scores:", scores)
    print("Mean Score:", np.mean(scores))

    # Train the classifier
    model.fit(x_train, y_train)

    # Make predictions
    y_pred = model.predict(x_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    # Calculate sensitivity (recall) of the positive class, i.e. label 1
    # NOTE(review): label 1 is 'Control' under the {'Case': 0, 'Control': 1}
    # mapping; confirm that is the intended positive class.
    sensitivity = recall_score(y_test, y_pred)
    print("Sensitivity (Recall):", sensitivity)

    # Confusion matrix, computed once. labels=[0, 1] forces a 2x2 result even
    # if one class is absent from y_test and y_pred, so the unpacking is safe.
    conf_mat = confusion_matrix(y_test, y_pred, labels=[0, 1])

    # Calculate specificity (undefined when there are no actual negatives)
    tn, fp, fn, tp = conf_mat.ravel()
    specificity = tn / (tn + fp) if (tn + fp) else float('nan')
    print("Specificity:", specificity)

    # Calculate precision
    precision = precision_score(y_test, y_pred)
    print("Precision:", precision)

    # Calculate MCC
    mcc = matthews_corrcoef(y_test, y_pred)
    print("MCC:", mcc)

    # Calculate f1-score
    f1 = f1_score(y_test, y_pred)
    print("F1-score:", f1)

    # Display confusion matrix
    print("Confusion Matrix:")
    print(conf_mat)