Disclaimer: Some outputs and print messages remain in Italian, as the notebook was translated after execution. I apologize for any mismatches between code and output.

# Environment setup

To begin, the environment was configured by installing Spark and Parselmouth, and mounting the Google Drive containing the MP3 audio files and the accompanying text document with the transcriptions.

In [None]:
#Spark installation

!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget https://downloads.apache.org/spark/spark-3.5.5/spark-3.5.5-bin-hadoop3.tgz
!tar xf spark-3.5.5-bin-hadoop3.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.5-bin-hadoop3"
import findspark
findspark.init()
import pyspark
print(pyspark.version)
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
sc=spark.sparkContext

--2025-03-06 14:25:12--  https://downloads.apache.org/spark/spark-3.5.5/spark-3.5.5-bin-hadoop3.tgz
Resolving downloads.apache.org (downloads.apache.org)... 88.99.208.237, 135.181.214.104, 2a01:4f9:3a:2c57::2, ...
Connecting to downloads.apache.org (downloads.apache.org)|88.99.208.237|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 400724056 (382M) [application/x-gzip]
Saving to: ‘spark-3.5.5-bin-hadoop3.tgz’


2025-03-06 14:25:43 (12.6 MB/s) - ‘spark-3.5.5-bin-hadoop3.tgz’ saved [400724056/400724056]

<module 'pyspark.version' from '/content/spark-3.5.5-bin-hadoop3/python/pyspark/version.py'>


In [None]:
#Parselmouth installation

!pip install praat-parselmouth

Collecting praat-parselmouth
  Downloading praat_parselmouth-0.4.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.9 kB)
Downloading praat_parselmouth-0.4.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: praat-parselmouth
Successfully installed praat-parselmouth-0.4.5


In [None]:
# Drive mounting
# Makes the user's Google Drive available under /content/drive; the audio folder
# and the transcription file are read from there in the cells below.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


# Utility functions

Subsequently, three functions were implemented to extract the features required to build the samples, along with a function that retrieves the transcriptions from the text file and stores them in a dictionary.

The functions can also be found in the `src` folder with further documentation.

In [None]:
import librosa
import librosa.feature
import numpy as np
import parselmouth
from parselmouth.praat import call
from sklearn.feature_extraction.text import TfidfVectorizer

# Extracts audio features with Librosa
def extract_audio_features(audio_path, sr=22050, n_mfcc=13):
    """Summarise one audio file as mean MFCCs plus mean RMS energy.

    Args:
        audio_path: Path of the audio file to load.
        sr: Sampling rate passed to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients to compute.

    Returns:
        1-D NumPy array of length ``n_mfcc + 1``: the per-coefficient MFCC
        means followed by the mean RMS energy.
    """
    signal, sample_rate = librosa.load(audio_path, sr=sr)

    # One row per coefficient; averaging along axis 1 collapses the per-frame
    # values into a single number per coefficient.
    mfcc_matrix = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)
    mfcc_means = mfcc_matrix.mean(axis=1)

    # Mean RMS energy over the whole recording.
    energy_mean = librosa.feature.rms(y=signal).mean()

    return np.append(mfcc_means, energy_mean)

# Extracts pitch with Parselmouth
def extract_pitch(audio_path, time_step=0.0, pitch_floor=75, pitch_ceiling=600,
                  unvoiced_value=0.0):
    """Return the mean pitch of an audio file as a one-element array.

    Args:
        audio_path: Path of the audio file, opened with ``parselmouth.Sound``.
        time_step: First argument of Praat's "To Pitch" command.
        pitch_floor: Second argument of "To Pitch" (lower pitch bound).
        pitch_ceiling: Third argument of "To Pitch" (upper pitch bound).
        unvoiced_value: Value used when "Get mean" yields NaN, so the result
            is never a NaN that would propagate into the feature vector.
            NOTE(review): Praat is expected to report an undefined mean as NaN
            for recordings without voiced frames - confirm against the docs.

    Returns:
        NumPy float array of shape ``(1,)`` holding the mean pitch in Hertz,
        or ``unvoiced_value`` when the mean is undefined.
    """
    snd = parselmouth.Sound(audio_path)
    pitch = call(snd, "To Pitch", time_step, pitch_floor, pitch_ceiling)
    # (0, 0) is passed as the time range of "Get mean"; the unit is Hertz.
    mean_pitch = call(pitch, "Get mean", 0, 0, "Hertz")

    if np.isnan(mean_pitch):
        mean_pitch = unvoiced_value

    return np.array([mean_pitch], dtype=float)

# Obtains textual representation (TF-IDF)
def extract_text_features(text, vectorizer):
    """Vectorise a single text with an already fitted vectorizer.

    Args:
        text: The text to encode.
        vectorizer: Object exposing ``transform`` whose result has ``toarray()``
            (the notebook passes a fitted ``TfidfVectorizer``).

    Returns:
        The first (and only) row of the dense matrix produced for ``text``.
    """
    sparse_matrix = vectorizer.transform([text])  # one-document batch
    dense_rows = sparse_matrix.toarray()
    return dense_rows[0]

# Extracts transcriptions from .txt file
def load_transcriptions(txt_file):
    """Parse the transcription file into a ``{filename: transcript}`` dict.

    The file is a sequence of blocks separated by one or more blank lines.
    The first line of a block is the mp3 file name; all remaining lines are
    the transcription and are joined with single spaces.

    Compared with a plain ``split("\\n\\n")`` this parser also accepts:
      * several consecutive blank lines between blocks,
      * separator lines that contain only whitespace,
      * a UTF-8 byte-order mark at the start of the file (``utf-8-sig``),
    each of which would otherwise yield a wrong or empty file name.

    Args:
        txt_file: Path of the UTF-8 encoded transcription file.

    Returns:
        Dict mapping each file name to its transcription. Blocks with fewer
        than two lines (a name without text) are ignored.
    """
    blocks = [[]]
    with open(txt_file, "r", encoding="utf-8-sig") as f:
        for raw_line in f:
            line = raw_line.rstrip("\n")
            if line.strip():
                blocks[-1].append(line)
            elif blocks[-1]:
                # Blank line after some content: close the current block.
                blocks.append([])

    transcriptions = {}
    for block_lines in blocks:
        if len(block_lines) >= 2:
            filename = block_lines[0].strip()  # First row: mp3 file name
            transcript = " ".join(block_lines[1:]).strip()  # Everything else
            transcriptions[filename] = transcript

    return transcriptions

# Complete samples creation (Librosa + Parselmouth + TF-IDF)

In this phase, I created the complete samples and saved them in a `.pkl` file on Google Drive (the file is also available in the repository).

Please note that the folder containing all the audio files and transcriptions is named 'audiozzi'.

In [None]:
import pandas as pd

# Paths (everything lives in the same Drive folder)
audio_folder = "/content/drive/MyDrive/audiozzi"
transcriptions_file = os.path.join(audio_folder, "Trascrizioni.txt")
output_file = os.path.join(audio_folder, "samplesCompleti.pkl")

# Loads transcriptions in dictionary
transcriptions = load_transcriptions(transcriptions_file)
if not transcriptions:
    # Fail early with a clear message instead of an error from vectorizer.fit.
    raise ValueError(f"No transcriptions found in {transcriptions_file}")

# Creates TF-IDF vectorizer and fits it on transcriptions
# NOTE(review): the vocabulary/IDF weights are fitted on every transcription,
# including those of samples that later end up in the test split. Fit on the
# training subset only if an unbiased estimate is required.
vectorizer = TfidfVectorizer()
vectorizer.fit(list(transcriptions.values()))

# Final data list
dataset = []

# Audio file in folder loop.
# os.listdir returns entries in arbitrary order; sorting makes the dataset order
# (and therefore the saved file) identical from one run to the next.
for file in sorted(os.listdir(audio_folder)):
    if not file.endswith(".mp3"):  # Just mp3 files
        continue

    # Retrieves transcription
    transcript = transcriptions.get(file)
    if transcript is None:
        print(f"No transcription for {file}, skipped.")    # Helps for errors in txt file
        continue

    audio_path = os.path.join(audio_folder, file)

    # Sets label (1 = Urgent, 0 = Normal) (see file naming in the report)
    label = 1 if file.endswith("u.mp3") else 0

    # Extracts audio features
    audio_features = extract_audio_features(audio_path)
    pitch_feature = extract_pitch(audio_path)

    # Extracts textual features
    text_features = extract_text_features(transcript, vectorizer)

    # Concatenates features: [MFCC means, RMS mean, mean pitch, TF-IDF vector]
    sample = np.concatenate((audio_features, pitch_feature, text_features))

    # Saves data
    dataset.append({
        "filename": file,
        "features": sample,
        "transcript": transcript,
        "label": label
    })

# Converts to dataframe for storing
df = pd.DataFrame(dataset)
df.to_pickle(output_file)  # Saves in binary file

print(f"{len(df)} saved samples.")

300 campioni salvati.


# Model testing

In this phase, I loaded the samples from the binary file (this step is not necessary if the samples were just generated in the same notebook).

I then split the dataset into a training set and a test set (80-20 split).

Finally, I applied three models: Random Forest, Logistic Regression, and Gradient Boosted Trees.

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.linalg import DenseVector
from pyspark.sql import Row

# Reads the samples back from the pickle written by the sample-creation cell
df = pd.read_pickle("/content/drive/MyDrive/audiozzi/samplesCompleti.pkl")

# Builds the Spark DataFrame: one Row per sample, with the NumPy feature array
# wrapped in a DenseVector and the label cast to a plain Python int.
spark_df = spark.createDataFrame([
    Row(filename=name, features=DenseVector(vector), label=int(target))
    for name, vector, target in zip(df["filename"], df["features"], df["label"])
])

In [None]:
# Split into training and test sets with 80/20 weights and a fixed seed
train_df, test_df = spark_df.randomSplit(weights=[0.8, 0.2], seed=42)

In [None]:
# Random Forest

from pyspark.ml.classification import RandomForestClassifier

# Defines Random Forest model.
# The seed fixes the random sampling done while growing the trees, so the
# accuracy printed below is the same on every run.
rf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=50, seed=42)

# Trains the model
model = rf.fit(train_df)

# Generates predictions on test set
predictions = model.transform(test_df)

# Defines accuracy evaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

# Calculates accuracy
accuracy = evaluator.evaluate(predictions)
print(f"Random Forest Accuracy: {accuracy * 100:.2f}%")

Accuratezza del modello Random Forest: 98.08%


In [None]:
# Logistic regression

from pyspark.ml.classification import LogisticRegression

# Estimator reading the assembled feature vector and the 0/1 label column
lr = LogisticRegression(
    labelCol="label",
    featuresCol="features",
)

# Fit on the training split, then score the held-out test split
lr_model = lr.fit(train_df)
lr_predictions = lr_model.transform(test_df)

# Accuracy of the predicted labels on the test split
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
lr_accuracy = evaluator.evaluate(lr_predictions)
print(f"Logistic Regression Accuracy: {lr_accuracy * 100:.2f}%")

Accuratezza Logistic Regression: 100.00%


In [None]:
# Gradient-boosted trees

from pyspark.ml.classification import GBTClassifier

# Defines model (50 boosting iterations; fixed seed for repeatable results)
gbt = GBTClassifier(featuresCol="features", labelCol="label", maxIter=50, seed=42)

# Training
gbt_model = gbt.fit(train_df)

# Predictions
gbt_predictions = gbt_model.transform(test_df)

# Evaluation.
# The evaluator is created here, as in the other model cells, so this cell does
# not depend on a variable left over from a previously executed cell.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
gbt_accuracy = evaluator.evaluate(gbt_predictions)
print(f"Gradient-Boosted Trees Accuracy: {gbt_accuracy * 100:.2f}%")

Accuratezza Gradient-Boosted Trees: 96.15%
