<h4 style="background-color: #80c4e6; display: flex; padding: 0.5em;">
    NASA Breath Diagnostics Challenge
</h4>

##### 1 - Setup and Imports

In [1]:
# general setup
import os, glob, sys

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# model setup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# utils setup
# Make the project root (the parent of the notebook's working directory)
# importable so that the `utils` package can be found.
current_directory = os.getcwd()
root_directory = os.path.abspath(os.path.join(current_directory, os.pardir))
# Guard the append so that re-running this cell does not keep stacking
# duplicate entries onto sys.path.
if root_directory not in sys.path:
    sys.path.append(root_directory)

# custom utils
from utils.io import ENoseIO

##### 2 - Read and Load Data

In [3]:
# Collect the per-patient text files. glob returns paths in arbitrary,
# OS-dependent order, so sort them to keep the concatenated row order
# reproducible between runs and machines.
train_files = sorted(glob.glob('../dataset/train/*.txt'))
test_files = sorted(glob.glob('../dataset/test/*.txt'))

# pd.concat raises an opaque "No objects to concatenate" error on an empty
# list; fail early with a message that points at the actual problem.
if not train_files:
    raise FileNotFoundError("No training files matched '../dataset/train/*.txt'")
if not test_files:
    raise FileNotFoundError("No test files matched '../dataset/test/*.txt'")

# One DataFrame per patient file, stacked row-wise. The test files are read
# with is_labeled=False.
train_data = pd.concat([ENoseIO.read_patient_data(f) for f in train_files])
test_data = pd.concat([ENoseIO.read_patient_data(f, is_labeled=False) for f in test_files])

##### 3 - Data Exploration

In [5]:
# Display some basic information about the dataset.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly: wrapping it in print() would emit a stray "None" after each summary.
print("Train Data Info")
train_data.info()
print("Test Data Info")
test_data.info()

# Display the first few rows of the training data
print("Train Data Sample")
train_data.head()

Train Data Info
<class 'pandas.core.frame.DataFrame'>
Index: 17011 entries, 0 to 375
Data columns (total 67 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Min:Sec     17011 non-null  object 
 1    D1         17011 non-null  float64
 2    D2         17011 non-null  float64
 3    D3         17011 non-null  float64
 4    D4         17011 non-null  float64
 5    D5         17011 non-null  float64
 6    D6         17011 non-null  float64
 7    D7         17011 non-null  float64
 8    D8         17011 non-null  float64
 9    D9         17011 non-null  float64
 10   D10        17011 non-null  float64
 11   D11        17011 non-null  float64
 12   D12        17011 non-null  float64
 13   D13        17011 non-null  float64
 14   D14        17011 non-null  float64
 15   D15        17011 non-null  float64
 16   D16        17011 non-null  float64
 17   D17        17011 non-null  float64
 18   D18        17011 non-null  float64
 19   D19        1701

Unnamed: 0,Min:Sec,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D57,D58,D59,D60,D61,D62,D63,D64,Result,Patient_ID
0,07:29.6,9934.068034,1493.775934,695.734215,1274.504328,10810.90837,1501.151543,57410.52632,2011158.375,710.478301,...,836.833188,12150.26004,302148.8224,23528.25517,22332.09609,18073.57374,108110.7166,770474.7984,POSITIVE,21
1,07:31.8,9938.208689,1493.775934,696.15825,1274.903259,10836.83381,1501.704284,57422.80702,2014549.041,711.651245,...,838.24511,12148.76342,302083.1378,23550.83018,22341.90069,18060.1764,108148.6333,770639.8227,POSITIVE,21
2,07:34.1,9935.979106,1488.842785,695.204173,1260.461962,10768.57443,1495.808383,57307.01754,1984772.825,711.694687,...,835.575294,12151.75665,302186.3564,23503.49548,22206.03684,18074.13196,108079.6939,769909.0009,POSITIVE,21
3,07:36.4,9922.283093,1482.803135,692.829581,1249.132325,10680.29667,1483.41778,56922.80702,1937919.98,700.073852,...,834.728141,12138.66128,302270.8079,23487.47451,22190.6296,18056.82706,107941.8152,769060.3046,POSITIVE,21
4,07:38.6,9922.920117,1481.09728,691.8331,1242.071249,10672.42058,1478.949793,56750.87719,1918192.467,694.534949,...,834.137701,12147.26681,302120.6719,23471.45354,22370.61419,18051.80306,107907.3455,768824.5556,POSITIVE,21


In [6]:
# Class balance of the training rows: number of samples per 'Result' value
print("Train Data Result Distribution")
result_counts = train_data['Result'].value_counts()
print(result_counts)

Train Data Result Distribution
Result
NEGATIVE    9881
POSITIVE    7130
Name: count, dtype: int64


##### 4 - Data Preprocessing

In [None]:
# Convert 'Result' to binary format (1 for POSITIVE, 0 for NEGATIVE).
def _encode_result(value):
    """Map a 'Result' value to 1 (POSITIVE) or 0 (anything else).

    Values that are already encoded as 1 are kept as 1, so re-running this
    cell does not silently turn every label into 0.
    """
    return 1 if value in ('POSITIVE', 1) else 0


train_data['Result'] = train_data['Result'].apply(_encode_result)

# NOTE(review): the test files are read with is_labeled=False, so test_data
# presumably has no 'Result' column - confirm against ENoseIO. Only encode the
# labels when they exist; otherwise y_test is None and only predictions
# (no metrics) are possible on the test set.
if 'Result' in test_data.columns:
    test_data['Result'] = test_data['Result'].apply(_encode_result)
    y_test = test_data['Result']
else:
    y_test = None

# Drop the columns that are not sensor features: the timestamp ('Min:Sec'),
# the patient identifier and the target itself.
non_feature_columns = ['Min:Sec', 'Patient_ID', 'Result']
X_train = train_data.drop(columns=non_feature_columns)
y_train = train_data['Result']

# errors='ignore' tolerates a test set that lacks some of these columns
# (e.g. no 'Result' when it is unlabeled).
X_test = test_data.drop(columns=non_feature_columns, errors='ignore')

# Standardize the data. The scaler is fitted on the training features only and
# then applied unchanged to the test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

##### 5 - Model Training

In [None]:
# Fit a random forest on the standardized training features.
# The fixed random_state keeps the fitted forest reproducible.
forest_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestClassifier(**forest_params)
# fit() returns the estimator itself, so it stays the cell's displayed value
model.fit(X_train_scaled, y_train)

##### 6 - Model Evaluation

In [None]:
# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Accuracy and the confusion matrix need ground-truth labels; skip them with a
# clear message when none are available instead of failing inside sklearn.
if y_test is None:
    print("No ground-truth labels available for the test set; skipping metrics.")
else:
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy * 100:.2f}%')

    # Confusion matrix. labels=[0, 1] pins the matrix to 2x2 in a fixed
    # NEGATIVE/POSITIVE order even if one class is absent from y_test/y_pred.
    class_names = ['NEGATIVE', 'POSITIVE']
    conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])

    # Plot confusion matrix (rows = actual class, columns = predicted class)
    fig, ax = plt.subplots()
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title('Confusion Matrix')
    plt.show()

##### 7 - Feature Importance

In [None]:
# Plot feature importance, most important feature first.
# Requires numpy to be imported as `np` in the notebook's import cell.
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]  # descending order of importance

n_features = X_train.shape[1]

plt.figure(figsize=(12, 6))
plt.title("Feature Importances")
plt.bar(range(n_features), importances[indices], align="center")
# Label each bar with the name of the feature it represents
plt.xticks(range(n_features), X_train.columns[indices], rotation=90)
plt.xlim([-1, n_features])
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.show()