In [1]:
import os
import pandas as pd

# Set the path to your root folder
root_folder = r"C:\Users\asfam\OneDrive\Desktop\ml assignment\abc"

# Get the list of folders within the root folder
subfolders = [f.path for f in os.scandir(root_folder) if f.is_dir()]

# Create an empty DataFrame to store the reshaped data
combined_reshaped_data = pd.DataFrame()

# Define the maximum number of instances to take from each dataset
max_instances = 100000

# Loop through each subfolder and read each dataset into a DataFrame
for subfolder in subfolders:
    files = os.listdir(subfolder)
    for file in files:
        file_path = os.path.join(subfolder, file)
        # Read only the 'Current-A' column and limit to 100,000 instances
        data = pd.read_csv(file_path, usecols=[' Current-A'], nrows=max_instances)
        
        # Transpose the DataFrame to have a single row
        transposed_current_a = data.transpose()

        # Reshape the transposed "Current-A" column into blocks of 1000 columns for each row
        block_size = 1000
        num_blocks = transposed_current_a.shape[1] // block_size
        reshaped_current_a = transposed_current_a.iloc[:, :num_blocks * block_size].values.reshape(-1, block_size)

        # Create a DataFrame from the reshaped "Current-A" column
        reshaped_current_a_df = pd.DataFrame(reshaped_current_a, columns=[f'Block_{i+1}' for i in range(block_size)])

        # Extract file name without extension from the full file path
        file_name_without_extension = os.path.splitext(file)[0]

        # Extract label from file name before the '-' symbol
        label = file_name_without_extension.split('-')[0]

        # Add a new column 'Label' with the corresponding label for each row
        reshaped_current_a_df['Label'] = label

        # Append the reshaped data to the combined DataFrame
        combined_reshaped_data = pd.concat([combined_reshaped_data, reshaped_current_a_df], ignore_index=True)

        
# Save the combined reshaped DataFrame to a new CSV file
output_file_path_reshaped = r'C:\Users\asfam\OneDrive\Desktop\ml assignment\combined_reshaped_data.csv'
combined_reshaped_data.to_csv(output_file_path_reshaped, index=False)

# Display the combined reshaped DataFrame
print(combined_reshaped_data)
print(combined_reshaped_data.shape)


      Block_1  Block_2  Block_3  Block_4  Block_5  Block_6  Block_7  Block_8  \
0      2.3309   2.3309   2.3309   2.3309   2.3309   2.3309   2.3309   2.3309   
1      2.7228   2.7228   2.7228   2.7228   2.7228   2.7228   2.7228   2.7228   
2      2.9524   2.9524   2.9524   2.9524   2.9524   2.9524   2.9524   2.9524   
3      2.1111   2.1111   2.1111   2.1111   2.1111   2.1111   2.1111   2.1111   
4      2.1282   2.1282   2.1282   2.1282   2.1282   2.1282   2.1282   2.1282   
...       ...      ...      ...      ...      ...      ...      ...      ...   
3861   2.4603   2.4530   2.4530   2.4530   2.4530   2.4530   2.4530   2.4530   
3862   2.3712   2.3712   2.3712   2.3712   2.3712   2.3712   2.3712   2.3712   
3863   2.3065   2.3065   2.3065   2.3065   2.3065   2.3065   2.3065   2.3065   
3864   2.2552   2.2552   2.2552   2.2552   2.2552   2.2552   2.2552   2.2552   
3865   2.2479   2.2479   2.2479   2.2479   2.2479   2.2479   2.2479   2.2479   

      Block_9  Block_10  ...  Block_992

In [2]:
df1 = pd.read_csv("combined_reshaped_data.csv")
df1

Unnamed: 0,Block_1,Block_2,Block_3,Block_4,Block_5,Block_6,Block_7,Block_8,Block_9,Block_10,...,Block_992,Block_993,Block_994,Block_995,Block_996,Block_997,Block_998,Block_999,Block_1000,Label
0,2.3309,2.3309,2.3309,2.3309,2.3309,2.3309,2.3309,2.3309,2.3309,2.3309,...,2.7228,2.7228,2.7228,2.7228,2.7228,2.7228,2.7228,2.7228,2.7228,0.7inner
1,2.7228,2.7228,2.7228,2.7228,2.7228,2.7228,2.7228,2.7228,2.7228,2.7228,...,2.9524,2.9524,2.9524,2.9524,2.9524,2.9524,2.9524,2.9524,2.9524,0.7inner
2,2.9524,2.9524,2.9524,2.9524,2.9524,2.9524,2.9524,2.9524,2.9524,2.9524,...,2.1111,2.1111,2.1111,2.1111,2.1111,2.1111,2.1111,2.1111,2.1111,0.7inner
3,2.1111,2.1111,2.1111,2.1111,2.1111,2.1111,2.1111,2.1111,2.1111,2.1111,...,2.1282,2.1282,2.1282,2.1282,2.1282,2.1282,2.1282,2.1282,2.1282,0.7inner
4,2.1282,2.1282,2.1282,2.1282,2.1282,2.1282,2.1282,2.1282,2.1282,2.1282,...,2.9805,2.9805,2.9805,2.9805,3.0024,3.0024,3.0024,3.0024,3.0024,0.7inner
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3861,2.4603,2.4530,2.4530,2.4530,2.4530,2.4530,2.4530,2.4530,2.4530,2.4530,...,2.3712,2.3712,2.3712,2.3712,2.3712,2.3712,2.3712,2.3712,2.3712,healthy
3862,2.3712,2.3712,2.3712,2.3712,2.3712,2.3712,2.3712,2.3712,2.3712,2.3712,...,2.3065,2.3065,2.3065,2.3065,2.3065,2.3065,2.3065,2.3065,2.3065,healthy
3863,2.3065,2.3065,2.3065,2.3065,2.3065,2.3065,2.3065,2.3065,2.3065,2.3065,...,2.2552,2.2552,2.2552,2.2552,2.2552,2.2552,2.2552,2.2552,2.2552,healthy
3864,2.2552,2.2552,2.2552,2.2552,2.2552,2.2552,2.2552,2.2552,2.2552,2.2552,...,2.2466,2.2466,2.2466,2.2466,2.2466,2.2466,2.2466,2.2466,2.2466,healthy


In [4]:
df = pd.read_csv("combined_reshaped_data.csv")
# Convert labels to binary classes
df['Label'] = df['Label'].apply(lambda x: 'healthy' if x == 'healthy' else 'unhealthy')

# Separate features and labels
X = df.iloc[:, :-1].values
y = df['Label'].values

# Now 'healthy' is represented as 'healthy' and all other labels as 'unhealthy'
df

Unnamed: 0,Block_1,Block_2,Block_3,Block_4,Block_5,Block_6,Block_7,Block_8,Block_9,Block_10,...,Block_992,Block_993,Block_994,Block_995,Block_996,Block_997,Block_998,Block_999,Block_1000,Label
0,2.3309,2.3309,2.3309,2.3309,2.3309,2.3309,2.3309,2.3309,2.3309,2.3309,...,2.7228,2.7228,2.7228,2.7228,2.7228,2.7228,2.7228,2.7228,2.7228,unhealthy
1,2.7228,2.7228,2.7228,2.7228,2.7228,2.7228,2.7228,2.7228,2.7228,2.7228,...,2.9524,2.9524,2.9524,2.9524,2.9524,2.9524,2.9524,2.9524,2.9524,unhealthy
2,2.9524,2.9524,2.9524,2.9524,2.9524,2.9524,2.9524,2.9524,2.9524,2.9524,...,2.1111,2.1111,2.1111,2.1111,2.1111,2.1111,2.1111,2.1111,2.1111,unhealthy
3,2.1111,2.1111,2.1111,2.1111,2.1111,2.1111,2.1111,2.1111,2.1111,2.1111,...,2.1282,2.1282,2.1282,2.1282,2.1282,2.1282,2.1282,2.1282,2.1282,unhealthy
4,2.1282,2.1282,2.1282,2.1282,2.1282,2.1282,2.1282,2.1282,2.1282,2.1282,...,2.9805,2.9805,2.9805,2.9805,3.0024,3.0024,3.0024,3.0024,3.0024,unhealthy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3861,2.4603,2.4530,2.4530,2.4530,2.4530,2.4530,2.4530,2.4530,2.4530,2.4530,...,2.3712,2.3712,2.3712,2.3712,2.3712,2.3712,2.3712,2.3712,2.3712,healthy
3862,2.3712,2.3712,2.3712,2.3712,2.3712,2.3712,2.3712,2.3712,2.3712,2.3712,...,2.3065,2.3065,2.3065,2.3065,2.3065,2.3065,2.3065,2.3065,2.3065,healthy
3863,2.3065,2.3065,2.3065,2.3065,2.3065,2.3065,2.3065,2.3065,2.3065,2.3065,...,2.2552,2.2552,2.2552,2.2552,2.2552,2.2552,2.2552,2.2552,2.2552,healthy
3864,2.2552,2.2552,2.2552,2.2552,2.2552,2.2552,2.2552,2.2552,2.2552,2.2552,...,2.2466,2.2466,2.2466,2.2466,2.2466,2.2466,2.2466,2.2466,2.2466,healthy


In [5]:
#binary class naïve bayes


import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def calculate_statistics(X, y):
    class_stats = {}
    unique_classes = np.unique(y)
    
    for cls in unique_classes:
        class_data = X[y == cls]
        mean = np.mean(class_data, axis=0)
        std = np.std(class_data, axis=0)
        class_stats[cls] = {'mean': mean, 'std': std}
        
    return class_stats

def calculate_probability(x, mean, std):
    exponent = np.exp(-((x - mean)**2) / (2 * std**2))
    return (1 / (np.sqrt(2 * np.pi) * std)) * exponent

def predict_class(example, class_stats):
    probabilities = {}
    
    for cls, stats in class_stats.items():
        probabilities[cls] = 1
        for i in range(len(stats['mean'])):
            mean = stats['mean'][i]
            std = stats['std'][i]
            x = example[i]
            probabilities[cls] *= calculate_probability(x, mean, std)
            
    return max(probabilities, key=probabilities.get)

# Function to train the Naive Bayes model
def train_naive_bayes(X_train, y_train):
    class_stats = calculate_statistics(X_train, y_train)
    return class_stats

# Function to make predictions on the test set
def predict_naive_bayes(X_test, class_stats):
    predictions = [predict_class(example, class_stats) for example in X_test]
    return predictions

# Train the model
class_stats = train_naive_bayes(X_train, y_train)

# Make predictions
predictions = predict_naive_bayes(X_test, class_stats)

accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, pos_label='unhealthy', average='binary')
recall = recall_score(y_test, predictions, pos_label='unhealthy', average='binary')
f1 = f1_score(y_test, predictions, pos_label='unhealthy', average='binary')

conf_matrix = confusion_matrix(y_test, predictions, labels=['healthy', 'unhealthy'])
tn, fp, fn, tp = conf_matrix.ravel()

specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Sensitivity: {sensitivity}")
print(f"Specificity: {specificity}")


Accuracy: 0.6925064599483204
Precision: 0.9942418426103646
Recall: 0.6879150066401063
F1 Score: 0.8131868131868133
Sensitivity: 0.6879150066401063
Specificity: 0.8571428571428571


In [6]:
#binary class SVM


from sklearn.preprocessing import StandardScaler

class SVM:
    def __init__(self, learning_rate=0.01, lambda_param=0.01, n_iterations=500):
        self.learning_rate = learning_rate
        self.lambda_param = lambda_param
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        n_samples, n_features = X.shape
        y_ = np.where(y <= 0, -1, 1)
        self.weights = np.zeros(n_features)
        self.bias = 0

        # Gradient Descent
        for _ in range(self.n_iterations):
            for idx, x in enumerate(X):
                condition = y_[idx] * (np.dot(x, self.weights) - self.bias) >= 1
                if condition:
                    self.weights -= self.learning_rate * (2 * self.lambda_param * self.weights)
                else:
                    self.weights -= self.learning_rate * (2 * self.lambda_param * self.weights - np.dot(x, y_[idx]))
                    self.bias -= self.learning_rate * y_[idx]

                

    def predict(self, X):
        linear_output = np.dot(X, self.weights) - self.bias
        return np.sign(linear_output)

X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

label_dict = {label: i for i, label in enumerate(np.unique(y))}
y_numeric = np.array([label_dict[label] for label in y])

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

svm_model = SVM()
svm_model.fit(X_scaled, y_numeric)

# Make predictions
predictions = svm_model.predict(X_scaled)

accuracy = accuracy_score(y_numeric, predictions)
precision = precision_score(y_numeric, predictions, average='weighted', zero_division=1)
recall = recall_score(y_numeric, predictions, average='weighted', zero_division=1) 
f1 = f1_score(y_numeric, predictions, average='weighted', zero_division=1) 

conf_matrix = confusion_matrix(y_numeric, predictions)

tn, fp, fn, tp = conf_matrix.ravel()[:4]

if (tn + fp) != 0:
    specificity = tn / (tn + fp)
else:
    specificity = 0.0

sensitivity = tp / (tp + fn)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Sensitivity: {sensitivity}")
print(f"Specificity: {specificity}")


Accuracy: 0.5675116399379203
Precision: 0.9736487176281253
Recall: 0.5675116399379203
F1 Score: 0.7099315183544953
Sensitivity: 1.0
Specificity: 0.0


In [7]:
#multiclass naive bayes

import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

class NaiveBayes:
    def fit(self, X, y):
        self.classes = np.unique(y)
        self.parameters = {}
        for c in self.classes:
            X_c = X[y == c]
            self.parameters[c] = {
                'mean': X_c.mean(axis=0),
                'std': X_c.std(axis=0) + 1e-10  # Adding a small constant to avoid division by zero
            }
        # Calculate priors based on the training data
        self.priors = {c: len(X[y == c]) / len(X) for c in self.classes}

    def _pdf(self, X, mean, std):
        return np.exp(-0.5 * ((X - mean) / std) ** 2) / (np.sqrt(2 * np.pi) * std)

    def predict(self, X):
        predictions = []
        for x in X:
            posteriors = []
            for c in self.classes:
                likelihood = np.prod(self._pdf(x, self.parameters[c]['mean'], self.parameters[c]['std']))
                # Use the pre-calculated priors instead of recalculating them in the predict method
                prior = self.priors[c]
                posterior = prior * likelihood
                posteriors.append(posterior)
            predictions.append(self.classes[np.argmax(posteriors)])
        return np.array(predictions)

def calculate_measures(y_true, y_pred):
    accuracy = np.mean(y_true == y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

# Assuming df is your DataFrame
X = df1.iloc[:, :-1].values
y = df1.iloc[:, -1].values

# Train-test split
split_ratio = 0.8
split_index = int(split_ratio * len(X))
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

# Create and train the Naive Bayes model
nb_model = NaiveBayes()
nb_model.fit(X_train, y_train)

# Make predictions on the test set
predictions = nb_model.predict(X_test)

# Calculate classification measures
measures = calculate_measures(y_test, predictions)
print("Classification Measures:")
for key, value in measures.items():
    print(f"{key}: {value}")


Classification Measures:
accuracy: 0.07235142118863049
precision: 0.03496985357450474
recall: 0.07235142118863049
f1: 0.04715036437011875


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
#multiclass SVM

import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

class MulticlassSVM:
    def __init__(self, learning_rate=0.01, epochs=1000, lambda_param=0.01):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.lambda_param = lambda_param
        self.weights = None
        self.classes = None

    def fit(self, X, y):
        # Ensure y contains integer class labels
        le = LabelEncoder()
        y_encoded = le.fit_transform(y)
        self.classes = np.unique(y_encoded)
        num_classes = len(self.classes)
        num_features = X.shape[1]
        self.weights = np.zeros((num_classes, num_features))

        for epoch in range(self.epochs):
            for c in self.classes:
                y_binary = np.where(y_encoded == c, 1, -1)
                hinge_loss = 1 - y_binary * np.dot(X, self.weights[c])
                hinge_loss[hinge_loss < 0] = 0  # max(0, hinge_loss)

                gradient = -np.dot(hinge_loss * y_binary, X) / len(y_encoded)
                self.weights[c] -= self.learning_rate * (gradient + self.lambda_param * self.weights[c])

    def predict(self, X):
        scores = np.dot(X, self.weights.T)
        predictions = np.argmax(scores, axis=1)
        return predictions

def calculate_measures(y_true, y_pred):
    accuracy = np.mean(y_true == y_pred)
    precision = np.zeros(len(np.unique(y_true)))
    recall = np.zeros(len(np.unique(y_true)))
    f1 = np.zeros(len(np.unique(y_true)))

    for i, c in enumerate(np.unique(y_true)):
        true_positives = np.sum((y_true == c) & (y_pred == c))
        false_positives = np.sum((y_true != c) & (y_pred == c))
        false_negatives = np.sum((y_true == c) & (y_pred != c))

        precision[i] = true_positives / (true_positives + false_positives + 1e-10)
        recall[i] = true_positives / (true_positives + false_negatives + 1e-10)
        f1[i] = 2 * (precision[i] * recall[i]) / (precision[i] + recall[i] + 1e-10)

    return {
        "accuracy": accuracy,
        "precision": np.mean(precision),
        "recall": np.mean(recall),
        "f1": np.mean(f1)
    }

# Assuming df1 is your DataFrame
X = df1.iloc[:, :-1].values
y = df1.iloc[:, -1].values

# Encode class labels to integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the Multiclass SVM model
svm_model = MulticlassSVM()
svm_model.fit(X_train, y_train)

# Make predictions on the test set
predictions = svm_model.predict(X_test)

# Calculate classification measures
measures = calculate_measures(y_test, predictions)
print("Classification Measures:")
for key, value in measures.items():
    print(f"{key}: {value}")


  self.weights[c] -= self.learning_rate * (gradient + self.lambda_param * self.weights[c])


Classification Measures:
accuracy: 0.09560723514211886
precision: 0.006829088224436179
recall: 0.0714285714284749
f1: 0.012466307276487247


In [None]:
from scipy.spatial import distance

# Select only the columns containing numeric data for distance calculation (excluding 'Label')
numeric_columns = combined_reshaped_data.columns[1:-1]

# Extract the numeric data for distance calculation
numeric_data = combined_reshaped_data[numeric_columns].values

# Calculate the Euclidean distance matrix
euclidean_distance_matrix = distance.cdist(numeric_data, numeric_data, 'euclidean')

# Create a DataFrame from the distance matrix
euclidean_distance_df = pd.DataFrame(euclidean_distance_matrix, columns=combined_reshaped_data['Label'], index=combined_reshaped_data['Label'])

# Save the Euclidean distance DataFrame to a new CSV file
output_file_path_distance = r'C:\Users\asfam\OneDrive\Desktop\ml assignment\euclidean_distance_matrix.csv'
euclidean_distance_df.to_csv(output_file_path_distance)

# Display the Euclidean distance DataFrame
print(euclidean_distance_df)
