In [1]:
# Install necessary libraries
!pip install librosa transformers torch soundfile audioread ipywidgets --upgrade jupyterlab_widgets imageio[ffmpeg] pydub

# Import required libraries
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, BatchNormalization
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoTokenizer, AutoModelForSequenceClassification
import torch
from pydub import AudioSegment

# Load and process audio file
# input_audio_path is the recording to convert; output_audio_path is where the
# converted copy is written. Both are relative paths, so they resolve against
# the current working directory.
# NOTE(review): the converted copy is written into the same folder that is
# later scanned for ".wav" files, so it will be picked up by that scan as
# well -- confirm this is intended.
input_audio_path = "../data/tartanaviation/raw/kbtp/2020/11/11-02-20_audio/1.wav"
output_audio_path = "../data/tartanaviation/raw/kbtp/2020/11/11-02-20_audio/1_fixed.wav"

def preprocess_audio(input_path, output_path):
    """Re-encode an audio file as 44100 Hz mono WAV.

    The file at ``input_path`` is read with pydub, its frame rate is set to
    44100 and its channel count to 1, and the result is exported in WAV
    format to ``output_path``.

    This is best-effort: any exception raised along the way is caught and
    reported with a printed message instead of propagating. Nothing is
    returned in either case.
    """
    try:
        segment = AudioSegment.from_file(input_path)
        resampled = segment.set_frame_rate(44100)
        mono = resampled.set_channels(1)
        mono.export(output_path, format="wav")
        print(f"Converted file saved at: {output_path}")
    except Exception as err:
        print(f"Error converting file: {err}")

# Convert the sample recording to 44100 Hz mono WAV before the folder is scanned.
preprocess_audio(input_audio_path, output_audio_path)

# Load and analyze audio
audio_folder = "../data/tartanaviation/raw/kbtp/2020/11/11-02-20_audio"
# os.listdir returns entries in arbitrary order, so sort the names to make
# audio_files[0] and the processing order reproducible between runs. The
# extension is matched case-insensitively so that ".WAV" files are not skipped.
audio_files = sorted(
    f for f in os.listdir(audio_folder) if f.lower().endswith(".wav")
)

# Fail early: every later step indexes or iterates audio_files.
if not audio_files:
    raise ValueError("No audio files found in the specified directory.")

def analyze_audio(audio_path):
    """Print basic facts about one audio file and plot its resampled waveform.

    The file is loaded through librosa with ``sr=44100``, resampled to
    16000 Hz, and the resampled signal is drawn with matplotlib. The file
    name, both sample rates and the duration (sample count divided by the
    load rate) are printed first.

    Any exception raised while loading, resampling or plotting is caught and
    reported with a printed message. Nothing is returned.
    """
    target_rate = 16000
    try:
        samples, load_rate = librosa.load(audio_path, sr=44100)
        downsampled = librosa.resample(
            samples, orig_sr=load_rate, target_sr=target_rate
        )

        file_name = os.path.basename(audio_path)
        print(f"Loaded file: {file_name}")
        print(f"Original Sample Rate: {load_rate}")
        print(f"Resampled Sample Rate: {target_rate}")
        duration_seconds = len(samples) / load_rate
        print(f"Duration: {duration_seconds:.2f} seconds")

        # Waveform plot of the 16 kHz signal.
        plt.figure(figsize=(10, 4))
        librosa.display.waveshow(downsampled, sr=target_rate)
        plt.title("Waveform of the Resampled Audio File")
        plt.xlabel("Time (seconds)")
        plt.ylabel("Amplitude")
        plt.show()
    except Exception as err:
        print(f"Failed to load {audio_path}: {err}")

# Inspect the first entry of audio_files as a quick check of the data.
analyze_audio(os.path.join(audio_folder, audio_files[0]))

# Extract MFCC features and save
# mfcc_folder is the directory that receives the per-file MFCC CSVs.
mfcc_folder = "../data/tartanaviation/processed/"
# Create the output directory up front; exist_ok=True keeps re-runs from
# failing when it already exists.
os.makedirs(mfcc_folder, exist_ok=True)

def extract_mfcc(audio_files, output_folder, input_folder=None):
    """Compute MFCC features for each audio file and write them to CSV.

    For every file, 13 MFCCs plus their first- and second-order deltas are
    stacked into a 39-row matrix, transposed so that each CSV row holds one
    frame, and saved as ``<stem>_mfcc.csv`` inside ``output_folder``.

    Args:
        audio_files: Iterable of file names (not full paths) located in the
            input folder.
        output_folder: Directory that receives the CSV files. It is not
            created here.
        input_folder: Directory containing ``audio_files``. When omitted,
            the module-level ``audio_folder`` is used, which matches the
            previous behaviour.

    Returns:
        A list with the path of every CSV written, in input order.
    """
    source_folder = audio_folder if input_folder is None else input_folder
    written_paths = []

    for audio_file in audio_files:
        audio_path = os.path.join(source_folder, audio_file)
        audio, sr = librosa.load(audio_path, sr=16000)

        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
        mfccs_delta = librosa.feature.delta(mfccs)
        mfccs_delta2 = librosa.feature.delta(mfccs, order=2)

        combined_mfccs = np.vstack([mfccs, mfccs_delta, mfccs_delta2])

        mfcc_df = pd.DataFrame(combined_mfccs.T)
        # Strip only the final extension. Splitting on the first dot would
        # map "a.1.wav" and "a.2.wav" to the same CSV name and overwrite it.
        stem = os.path.splitext(audio_file)[0]
        output_path = os.path.join(output_folder, f"{stem}_mfcc.csv")
        mfcc_df.to_csv(output_path, index=False)
        print(f"MFCC features saved to {output_path}")
        written_paths.append(output_path)

    return written_paths

# Write one MFCC CSV per entry of audio_files into mfcc_folder.
extract_mfcc(audio_files, mfcc_folder)

# Speech-to-text using Wav2Vec2
# The processor and the CTC model are both loaded from the same checkpoint
# name; transcribe_audio reads these two module-level objects.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")

def transcribe_audio(audio_path):
    """Transcribe one audio file with the module-level Wav2Vec2 objects.

    The file is loaded through librosa at 16000 Hz, converted to model
    inputs by ``processor``, and passed through ``model`` with gradient
    tracking disabled. The highest-scoring token id at each position is
    then decoded back to text.

    Returns:
        The decoded text for the first (and only) item in the batch.
    """
    waveform, _ = librosa.load(audio_path, sr=16000)
    features = processor(waveform, return_tensors="pt", sampling_rate=16000)

    with torch.no_grad():
        logits = model(features.input_values).logits

    best_ids = logits.argmax(dim=-1)
    decoded_texts = processor.batch_decode(best_ids)
    return decoded_texts[0]

# Transcribe every file in audio_files; the text is kept keyed by file name
# so the later intent and action steps can iterate over it.
transcriptions = {}
for audio_file in audio_files:
    transcription = transcribe_audio(os.path.join(audio_folder, audio_file))
    transcriptions[audio_file] = transcription
    print(f"{audio_file}: {transcription}")

# Intent classification
# NOTE(review): "bert-base-uncased" is loaded with num_labels=3 and no
# fine-tuning step appears in this file. If the classification head is
# untrained, the predicted intents carry no meaning -- confirm whether a
# fine-tuned checkpoint should be loaded here instead.
intent_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
intent_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
# Order matters: classify_intent indexes this list with the argmax class id.
intent_labels = ["clearance_request", "alert", "weather_request"]

def classify_intent(transcription):
    """Return the intent label predicted for a transcription.

    The text is tokenized with the module-level ``intent_tokenizer`` (with
    truncation enabled), passed through ``intent_model``, and the index of
    the highest logit is looked up in ``intent_labels``.

    NOTE(review): no fine-tuning of ``intent_model`` is visible in this file;
    confirm that its classification head has been trained before relying on
    the returned label.

    Args:
        transcription: The text to classify.

    Returns:
        One of the strings in ``intent_labels``.
    """
    inputs = intent_tokenizer(transcription, return_tensors="pt", truncation=True, padding=True)
    # Inference only: disable gradient tracking, as transcribe_audio does, so
    # no autograd graph is built for each call.
    with torch.no_grad():
        outputs = intent_model(**inputs)
    return intent_labels[torch.argmax(outputs.logits, dim=-1).item()]

# Print the predicted intent for each transcribed file. The intents are only
# printed here, not stored.
for audio_file, transcription in transcriptions.items():
    intent = classify_intent(transcription)
    print(f"{audio_file} - Intent: {intent}")

# Suggest action based on intent
def suggest_action(intent, trajectory_data, weather_data):
    """Build the message to show for a classified intent.

    Args:
        intent: Intent label. "clearance_request", "alert" and
            "weather_request" are recognized.
        trajectory_data: Mapping read for its 'flight_id' entry (clearance
            and alert intents only).
        weather_data: Mapping read for its 'visibility' and 'wind_speed'
            entries (weather intent only).

    Returns:
        The message string, or "No action required." for any other intent.

    Raises:
        KeyError: If the mapping needed by the matched intent lacks the
            entry that is read.
    """
    if intent == "clearance_request":
        flight = trajectory_data['flight_id']
        return f"Clearance granted for {flight}."

    if intent == "alert":
        flight = trajectory_data['flight_id']
        return f"Notify authorities about restricted airspace violation by {flight}!"

    if intent == "weather_request":
        visibility = weather_data['visibility']
        wind = weather_data['wind_speed']
        return f"Weather update: Visibility {visibility}, Wind Speed {wind}!"

    return "No action required."

# Hard-coded sample inputs for suggest_action; the same two dicts are used
# for every file.
trajectory_data = {"flight_id": "ABC123", "altitude": 30000, "heading": 180}
weather_data = {"location": "runway_22", "visibility": "5 miles", "wind_speed": "10 knots"}

# Classify each transcription and print the suggested action.
# NOTE(review): this repeats the classify_intent call already made for every
# transcription in the earlier intent-printing loop; consider storing the
# intents once and reusing them.
for audio_file, transcription in transcriptions.items():
    intent = classify_intent(transcription)
    action = suggest_action(intent, trajectory_data, weather_data)
    print(f"{audio_file}: {action}")





Error converting file: [WinError 2] The system cannot find the file specified


  audio, sr = librosa.load(audio_path, sr=44100)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, sr = librosa.load(audio_path, sr=16000)


Failed to load ../data/tartanaviation/raw/kbtp/2020/11/11-02-20_audio\1.wav: 


NoBackendError: 