In [5]:
!pip install -q kagglehub

# Import all required libraries
import os
import kagglehub
import pandas as pd
import numpy as np
import librosa
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Downloading the dataset using KaggleHub
# This will download the files to a temporary cache directory in Colab
path = kagglehub.dataset_download("nasrulhakim86/coughvid-wav")
print(f"Dataset downloaded to: {path}")

# Defining paths and constants
data_folder_path = os.path.join(path, 'public_dataset')
metadata_path = os.path.join(data_folder_path, 'metadata_compiled.csv')
SAMPLE_RATE = 22050
DURATION = 5 # seconds
N_MFCC = 40 # Number of MFCC features to extract

print("\nSetup complete.")

#LOADING AND PREPROCESSING METADATA ---
df = pd.read_csv(metadata_path)

#Filtering for relevant health statuses
valid_stati = ['healthy', 'symptomatic', 'COVID-19']
df_processed = df[df['status'].isin(valid_stati)].copy()

df_processed['label'] = df_processed['status'].apply(lambda x: 0 if x == 'healthy' else 1)

print("Class distribution:")
print(df_processed['label'].value_counts())


#FEATURE EXTRACTION (MFCCs)
X = [] # To store features (MFCC vectors)
y = [] # To store labels

expected_length = SAMPLE_RATE * DURATION

for index, row in tqdm(df_processed.iterrows(), total=df_processed.shape[0]):
    filename = row['uuid'] + '.wav'
    file_path = os.path.join(data_folder_path, filename)

    if not os.path.exists(file_path):
        continue # Skip if file is missing

    try:
        # Loading audio file
        audio, sr = librosa.load(file_path, sr=SAMPLE_RATE)

        # Pad or truncate to ensure uniform length
        if len(audio) > expected_length:
            audio = audio[:expected_length]
        else:
            audio = np.pad(audio, (0, expected_length - len(audio)), 'constant')

        # Calculating MFCCs
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=N_MFCC)

        # Average MFCCs over the time axis to get a single feature vector
        mfccs_mean = np.mean(mfccs, axis=1)

        X.append(mfccs_mean)
        y.append(row['label'])

    except Exception as e:
        print(f"Error processing {filename}: {e}")

# Convert lists to NumPy arrays
X = np.array(X)
y = np.array(y)

print(f"\nFeature extraction complete. Shape of feature matrix X: {X.shape}")


# DATA SPLITTING AND SCALING

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features for better model performance
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Data split and scaled successfully.")


#STEP 5: MODEL TRAINING
# Initializing the Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)


model.fit(X_train_scaled, y_train)# Training the model

print("Model training complete.")
y_pred = model.predict(X_test_scaled)# Making predictions on the test set


accuracy = accuracy_score(y_test, y_pred)# Calculating and printing the accuracy
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Printing the detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Healthy (0)', 'Unhealthy (1)']))

Dataset downloaded to: /kaggle/input/coughvid-wav

Setup complete.
Class distribution:
label
0    12479
1     3745
Name: count, dtype: int64


  0%|          | 0/16224 [00:00<?, ?it/s]


Feature extraction complete. Shape of feature matrix X: (16224, 40)
Data split and scaled successfully.
Model training complete.
Model Accuracy: 76.80%

Classification Report:
               precision    recall  f1-score   support

  Healthy (0)       0.77      1.00      0.87      2496
Unhealthy (1)       0.00      0.00      0.00       749

     accuracy                           0.77      3245
    macro avg       0.38      0.50      0.43      3245
 weighted avg       0.59      0.77      0.67      3245



In [None]:
import librosa
import numpy as np
from google.colab import files

def predict_cough(file_path, model, scaler):
    """
    Takes a path to a new audio file, preprocesses it, and predicts
    the cough severity using the trained model and scaler.
    """
    print(f"--- Analyzing new file: {file_path} ---")
    try:

        audio, sr = librosa.load(file_path, sr=SAMPLE_RATE)# Preprocessing the new audio file

        # Pad or truncate to 5 seconds
        expected_length = SAMPLE_RATE * DURATION
        if len(audio) > expected_length:
            audio = audio[:expected_length]
        else:
            audio = np.pad(audio, (0, expected_length - len(audio)), 'constant')

        # Extracting MFCCs
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=N_MFCC)
        mfccs_mean = np.mean(mfccs, axis=1)

        # Reshaping for scaler and model
        features = mfccs_mean.reshape(1, -1)

        #Scaling the features ---
        # Use the SAME scaler that was fitted on the training data
        features_scaled = scaler.transform(features)

        #Making a prediction ---
        # Get the probability of each class
        probabilities = model.predict_proba(features_scaled)

        # The probability of the 'Unhealthy' class (class 1)
        severity_score = probabilities[0][1]

        # Get the final predicted class (0 or 1)
        prediction = model.predict(features_scaled)
        predicted_class = 'Unhealthy' if prediction[0] == 1 else 'Healthy'

        print("\n--- Prediction Results ---")
        print(f"Predicted Class: {predicted_class}")
        print(f"Severity Score (Probability of being Unhealthy): {severity_score:.2f}")

    except Exception as e:
        print(f"An error occurred: {e}")

# --- EXAMPLE USAGE ---
# 1. Upload your cough recording (.wav, .mp3)
print("Please upload your cough audio file:")
uploaded = files.upload()

# 2. Get the filename of the uploaded file
if uploaded:
    your_audio_file = list(uploaded.keys())[0]

    # 3. Call the function to get a prediction
    # NOTE: We use the 'model' and 'scaler' variables from the previous training step
    predict_cough(your_audio_file, model, scaler)