# Data Mining and Machine Learning
##  Task 1 - Pre-processing and Transformation

In [None]:
# Hyper-parameters
# The user can specify the number of features they
# wish to prioritise.

features = 10

# Guard against unusable values: later cells select columns with
# `[-features:]`, and `[-0:]` is the *whole* array, so a value of 0 would
# silently select every feature instead of none.
if not isinstance(features, int) or features < 1:
    raise ValueError(f"features must be a positive integer, got {features!r}")

In [None]:
# Assuming this is the first notebook viewed:
# libraries are imported where they are first needed, so imports may appear in later cells.
# The only hyper-parameter (the number of features) is set in the cell above.

# This cell loads the provided datasets and checks them for missing (NaN) values.

import numpy as np
from sklearn.preprocessing import StandardScaler

# Load the provided train/test feature arrays and label arrays.
train_data = np.load('x_train.npy')
train_labels = np.load('y_train.npy')
test_data = np.load('x_test.npy')
test_labels = np.load('y_test.npy')

# Boolean masks that are True wherever an entry is NaN.
missing_train_data = np.isnan(train_data)
missing_train_labels = np.isnan(train_labels)
missing_test_data = np.isnan(test_data)
missing_test_labels = np.isnan(test_labels)

# np.argwhere returns one row per NaN entry, with one column per array
# dimension (so a 1-D array yields 1-element index rows).
missing_indices_traind = np.argwhere(missing_train_data)
missing_indices_trainl = np.argwhere(missing_train_labels)
missing_indices_testd = np.argwhere(missing_test_data)
missing_indices_testl = np.argwhere(missing_test_labels)


def report_missing(name, missing_indices):
    """Print the location of every missing (NaN) value found in one array.

    Args:
        name: Human-readable name of the array; it is put in the heading so
            the reports for the different arrays can be told apart.
        missing_indices: Result of ``np.argwhere`` on the array's NaN mask,
            i.e. one index row per missing value.
    """
    print(f"Missing values in the {name}:")
    if len(missing_indices) == 0:
        print("None found")
        return
    for index in missing_indices:
        # The index length equals the array's number of dimensions, so it
        # must not be unpacked as a fixed (row, column) pair: that raises
        # ValueError for e.g. a 1-D label vector containing a NaN.
        if len(index) == 1:
            print(f"Row {index[0]}")
        elif len(index) == 2:
            print(f"Row {index[0]}, Column {index[1]}")
        else:
            print(f"Index {tuple(int(i) for i in index)}")


# Same reporting order as before: test data, test labels, train data, train labels.
report_missing("test data", missing_indices_testd)
report_missing("test labels", missing_indices_testl)
report_missing("training data", missing_indices_traind)
report_missing("training labels", missing_indices_trainl)

In [None]:
# This block performs the preprocessing and transformation:
# the scaler is fitted on the training data only, and that same fitted
# scaler is then applied to both the training and the test data.

scaler = StandardScaler().fit(train_data)
train_data_scaled = scaler.transform(train_data)
test_data_scaled = scaler.transform(test_data)

In [None]:
# This block calculates the Pearson Correlation Coefficient (PCC) between
# each scaled training feature and the training labels, and then ranks the
# features by absolute PCC value.

from scipy.stats import pearsonr

correlation_coefficients = [pearsonr(train_data_scaled[:, i], train_labels)[0] for i in range(train_data_scaled.shape[1])]

# pearsonr returns NaN when a feature column is constant. np.argsort/np.sort
# place NaN *last*, so such features would be reported as the "top" ones;
# treat an undefined correlation as no correlation instead.
abs_coefficients = np.abs(correlation_coefficients)
abs_coefficients = np.where(np.isnan(abs_coefficients), 0.0, abs_coefficients)

# Rank once (ascending) and reuse the ranking so the indices and the
# coefficients are guaranteed to line up; the strongest feature is last.
ranking = np.argsort(abs_coefficients)
top_indices = ranking[-features:]
top_coefficients = abs_coefficients[top_indices]

# Print the indices of the top `features` features and the absolute value of their coefficients
print(f"Top {features} feature indices:", top_indices)
print(f"Top {features} feature coefficients:", top_coefficients)

In [None]:
# This block restricts the scaled training and test sets to the columns
# listed in `top_indices`, trains an SVC (an SVM classifier with default
# settings) on the restricted training set, predicts the test labels and
# prints the resulting accuracy.

from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# Column selection: keep only the top-ranked features.
selected_train_data = train_data_scaled[:, top_indices]
selected_test_data = test_data_scaled[:, top_indices]

# fit() returns the fitted estimator, so construction and training can be chained.
svm_classifier = SVC().fit(selected_train_data, train_labels)
predictions = svm_classifier.predict(selected_test_data)

# Fraction of test samples whose predicted label matches the true label.
accuracy = accuracy_score(y_true=test_labels, y_pred=predictions)
print(f"Accuracy: {accuracy * 100} %")


In [None]:
# This block repeats the train/predict/score cycle for 1, 2, ..., `features`
# features and records the test accuracy of every run in `accuracies`.

from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

accuracies = []

for num_features in range(1, features + 1):
    # Take the last `num_features` entries of `top_indices` for this run.
    strongest = top_indices[-num_features:]
    selected_train_data = train_data_scaled[:, strongest]
    selected_test_data = test_data_scaled[:, strongest]

    # A fresh default SVC is trained for every feature count.
    svm_classifier = SVC().fit(selected_train_data, train_labels)
    predictions = svm_classifier.predict(selected_test_data)

    accuracy = accuracy_score(y_true=test_labels, y_pred=predictions)
    accuracies.append(accuracy)
    print(f"Accuracy using {num_features} feature(s): {100 * accuracy} %")

In [None]:
# This block plots the accuracies recorded by the previous block
# against the number of features used for each run.

import matplotlib.pyplot as plt

# Build the x-axis from the results that were actually recorded rather than
# from `features`: if `features` is edited without re-running the sweep, the
# two lengths would differ and plotting would fail.
feature_counts = range(1, len(accuracies) + 1)

fig, ax = plt.subplots()
ax.plot(feature_counts, accuracies, marker='o')
ax.set_xlabel('Number of Features')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy vs Number of Features')
ax.grid(True)
plt.show()

# Markdown Question

The difficulty in using the Pearson correlation method for feature selection is determining the appropriate number of features to select. The method described in the module to automatically choose the appropriate number of features is computationally very expensive when the number of features is high. Describe an alternative method that reduces the computational complexity of the method discussed in class for datasets with a large number of features. **Describe the method. No need to write the program**.

#### Answer

Particularly for high-dimensional datasets, dimensionality reduction techniques such as Principal Component Analysis (PCA) can lessen the computing load of feature selection based on Pearson correlation. 

PCA projects data onto a lower-dimensional space while preserving much of the variance in the data. 

The process consists of calculating the covariance matrix, performing eigenvalue decomposition on it, picking the principal components (the eigenvectors with the largest eigenvalues), and projecting the data into the subspace that these components span. 

PCA makes feature selection more effective by lowering dimensionality, especially for datasets with a large number of features. 

Feature selection based on Pearson correlation is then applied to the reduced-dimensional dataset produced by PCA, which mitigates multicollinearity and lowers the processing cost associated with high dimensionality. 

To sum up, using PCA as a pre-processing step prior to feature selection based on Pearson correlation provides a workable way to deal with computing difficulties in high-dimensional datasets.
