<h4 style="background-color: #80c4e6; display: flex; padding: 0.5em;">
    NASA Breath Diagnosmtics Challenge
</h4>

##### 1 - Data Exploration and Preparation

In [1]:
# general setup
import pandas as pd
import os, glob, sys
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# utils setup
current_directory = os.getcwd()
root_directory = os.path.abspath(os.path.join(current_directory, os.pardir))
sys.path.append(root_directory)

# custom utils
from utils.io import ENoseIO

##### 2 - Read and Load Data

In [3]:
train_files = glob.glob('../dataset/train/*.txt')
test_files = glob.glob('../dataset/test/*.txt')
print(train_files)
train_data = pd.concat([ENoseIO.read_patient_data(f) for f in train_files])
test_data = pd.concat([ENoseIO.read_patient_data(f) for f in test_files])

['../dataset/train/NTL E-Nose - Patient 21.txt', '../dataset/train/NTL E-Nose - Patient 35.txt', '../dataset/train/NTL E-Nose - Patient 20.txt', '../dataset/train/NTL E-Nose - Patient 36.txt', '../dataset/train/NTL E-Nose - Patient 22.txt', '../dataset/train/NTL E-Nose - Patient 37.txt', '../dataset/train/NTL E-Nose - Patient 27.txt', '../dataset/train/NTL E-Nose - Patient 26.txt', '../dataset/train/NTL E-Nose - Patient 18.txt', '../dataset/train/NTL E-Nose - Patient 30.txt', '../dataset/train/NTL E-Nose - Patient 31.txt', '../dataset/train/NTL E-Nose - Patient 19.txt', '../dataset/train/NTL E-Nose - Patient 42.txt', '../dataset/train/NTL E-Nose - Patient 56.txt', '../dataset/train/NTL E-Nose - Patient 57.txt', '../dataset/train/NTL E-Nose - Patient 55.txt', '../dataset/train/NTL E-Nose - Patient 41.txt', '../dataset/train/NTL E-Nose - Patient 40.txt', '../dataset/train/NTL E-Nose - Patient 8.txt', '../dataset/train/NTL E-Nose - Patient 50.txt', '../dataset/train/NTL E-Nose - Patient 4

IndexError: list index out of range

##### 3 - Data Exploration

In [None]:
# Display some basic information about the dataset
print("Train Data Info")
print(train_data.info())
print("Test Data Info")
print(test_data.info())

# Display the first few rows of the training data
print("Train Data Sample")
print(train_data.head())

# Count of POSITIVE and NEGATIVE cases
print("Train Data Result Distribution")
print(train_data['Result'].value_counts())

#### 4 - Data Preprocessing

In [None]:
# Convert 'Result' to binary format (1 for POSITIVE, 0 for NEGATIVE)
train_data['Result'] = train_data['Result'].apply(lambda x: 1 if x == 'POSITIVE' else 0)
test_data['Result'] = test_data['Result'].apply(lambda x: 1 if x == 'POSITIVE' else 0)

# Drop 'Min:Sec' column as it is not useful for training
X_train = train_data.drop(columns=['Min:Sec', 'Patient_ID', 'Result'])
y_train = train_data['Result']

X_test = test_data.drop(columns=['Min:Sec', 'Patient_ID', 'Result'])
y_test = test_data['Result']

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

##### 5 - Model Training

In [None]:
# Train a Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

##### 6 - Model Evaluation

In [None]:
# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Confusion matrix
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Plot confusion matrix
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

##### 7 - Feature Importance

In [None]:
# Plot feature importance
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(12, 6))
plt.title("Feature Importances")
plt.bar(range(X_train.shape[1]), importances[indices], align="center")
plt.xticks(range(X_train.shape[1]), X_train.columns[indices], rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.show()