Disclaimer: Some outputs and print messages remain in Italian, as the notebook was translated after execution. I apologize for any mismatches between code and output.

# Environment setting

Initially, I repeated all the steps already performed and described in the notebook with the complete samples, namely:

- Installed Spark and Parselmouth
- Mounted Google Drive
- Added the utility functions
- Defined the folder paths

In [1]:
#Spark installation

!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget https://downloads.apache.org/spark/spark-3.5.5/spark-3.5.5-bin-hadoop3.tgz
!tar xf spark-3.5.5-bin-hadoop3.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.5-bin-hadoop3"
import findspark
findspark.init()
import pyspark
print(pyspark.version)
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
sc=spark.sparkContext

--2025-03-06 19:17:02--  https://downloads.apache.org/spark/spark-3.5.5/spark-3.5.5-bin-hadoop3.tgz
Resolving downloads.apache.org (downloads.apache.org)... 135.181.214.104, 88.99.208.237, 2a01:4f8:10a:39da::2, ...
Connecting to downloads.apache.org (downloads.apache.org)|135.181.214.104|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 400724056 (382M) [application/x-gzip]
Saving to: ‘spark-3.5.5-bin-hadoop3.tgz’


2025-03-06 19:17:16 (26.6 MB/s) - ‘spark-3.5.5-bin-hadoop3.tgz’ saved [400724056/400724056]

<module 'pyspark.version' from '/content/spark-3.5.5-bin-hadoop3/python/pyspark/version.py'>


In [2]:
#Parselmouth installation

!pip install praat-parselmouth

Collecting praat-parselmouth
  Downloading praat_parselmouth-0.4.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.9 kB)
Downloading praat_parselmouth-0.4.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: praat-parselmouth
Successfully installed praat-parselmouth-0.4.5


In [3]:
#Drive mounting

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
#Utility functions

import librosa
import librosa.feature
import numpy as np
import parselmouth
from parselmouth.praat import call
from sklearn.feature_extraction.text import TfidfVectorizer

# Audio features extraction
def extract_audio_features(audio_path, sr=22050, n_mfcc=13):
    y, sr = librosa.load(audio_path, sr=sr)

    #MFCC
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    mfccs_mean = np.mean(mfccs, axis=1)  # mean

    #RMSE (Energy)
    rms = librosa.feature.rms(y=y)
    rms_mean = np.mean(rms)

    return np.concatenate((mfccs_mean, [rms_mean]))

# pitch extraction
def extract_pitch(audio_path):
    snd = parselmouth.Sound(audio_path)
    pitch = call(snd, "To Pitch", 0.0, 75, 600)
    mean_pitch = call(pitch, "Get mean", 0, 0, "Hertz")  # mean

    return np.array([mean_pitch])

# textual feature extraction
def extract_text_features(text, vectorizer):
    return vectorizer.transform([text]).toarray()[0]

# transcriptions extraction from txt file
def load_transcriptions(txt_file):
    transcriptions = {}
    with open(txt_file, "r", encoding="utf-8") as f:
        lines = f.read().strip().split("\n\n")  # block split
    for block in lines:
        lines = block.split("\n")  # Each block: file name + transcription
        if len(lines) >= 2:
            filename = lines[0].strip()  # file name
            transcript = " ".join(lines[1:]).strip()  # transcription
            transcriptions[filename] = transcript  # Add to dictionary

    return transcriptions

In [5]:
import pandas as pd

# Paths
audio_folder = "/content/drive/MyDrive/audiozzi"
transcriptions_file = "/content/drive/MyDrive/audiozzi/Trascrizioni.txt"

# Samples creation

Subsequently, I created two sets of samples:

- One using only Librosa and Parselmouth to extract features from the audio files.
- One using only TF-IDF to extract features from the transcriptions.

In both cases, the samples were saved as `.pkl` files to be used in the next stage.

In [6]:
# Sample creation (just librosa and parselmouth)

# Final data list
datasetlp = []

# Loop over every file in the folder
for file in os.listdir(audio_folder):
    if file.endswith(".mp3"):  # mp3 only
        audio_path = os.path.join(audio_folder, file)

        # Label setting based on file name
        label = 1 if file.endswith("u.mp3") else 0

        # Audio feature extraction
        audio_features = extract_audio_features(audio_path)
        pitch_feature = extract_pitch(audio_path)

        # Concatenating
        sample = np.concatenate((audio_features, pitch_feature))

        # Saves data
        datasetlp.append({
            "filename": file,
            "features": sample,
            "label": label  # adds label
        })

# DataFrame conversion for storage
dflp = pd.DataFrame(datasetlp)
dflp.to_pickle("/content/drive/MyDrive/audiozzi/samplesSoloLibrosaParselmouth.pkl")  # binary file
#df.to_csv("/content/drive/MyDrive/audiozzi/sss.csv", index=False)

print(f"{len(dflp)} saved samples.")

300 campioni salvati.


In [8]:
# Sample creation (tf-idf only)

# Loads transcriptions
transcriptions = load_transcriptions(transcriptions_file)

# Creates and fits vectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(list(transcriptions.values()))

datasettf = []

# Loop over every file in audio folder
for file in os.listdir(audio_folder):
    if file.endswith(".mp3"):  # mp3 only
        audio_path = os.path.join(audio_folder, file)

        # Label setting
        label = 1 if file.endswith("u.mp3") else 0

        # Retrieves transcription
        transcript = transcriptions.get(file, None)
        if transcript is None:
            print(f"No transcription for {file}, skipped.")
            continue

        # Extracts text features
        sample = extract_text_features(transcript, vectorizer)

        # Saves data
        datasettf.append({
            "filename": file,
            "features": sample,
            "transcript": transcript,
            "label": label  # Adds label
        })

# DataFrame conversion
dftf = pd.DataFrame(datasettf)
dftf.to_pickle("/content/drive/MyDrive/audiozzi/samplesSoloTFIDF.pkl")  # pkl
#dftf.to_csv("/content/drive/MyDrive/audiozzi/ssssssss.csv", index=False)

print(f"{len(dftf)} saved samples.")

300 campioni salvati.


# Model testing and comparison

Finally, I loaded the samples from the binary files and trained and tested the models to compare the results.

### Librosa and Parselmouth only

In [10]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.linalg import DenseVector
from pyspark.sql import Row

# Loads dataset from pkl file
dflp = pd.read_pickle("/content/drive/MyDrive/audiozzi/samplesSoloLibrosaParselmouth.pkl")

# DataFrame conversion
spark_df = spark.createDataFrame([
    Row(filename=row["filename"],
        features=DenseVector(row["features"]),  # DenseVector conversion
        label=int(row["label"]))  # Label check
    for _, row in dflp.iterrows()
])

# dataset split 80-20

train_df_lp, test_df_lp = spark_df.randomSplit([0.8, 0.2], seed=42)

In [11]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import GBTClassifier

# Random forest 
rf_lp = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=50)

# Model training
model = rf_lp.fit(train_df_lp)

# predictions generation
predictions = model.transform(test_df_lp)

#predictions.select("features", "label", "prediction").show(10)

# Evaluator setting
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

# Accuracy 
accuracy = evaluator.evaluate(predictions)
print(f"Random Forest accuracy: {accuracy * 100:.2f}%")



# Logistic regression
lr_lp = LogisticRegression(featuresCol="features", labelCol="label")

# Model training
lr_model = lr_lp.fit(train_df_lp)

# Predictions
lr_predictions = lr_model.transform(test_df_lp)

# Evaluation
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
lr_accuracy = evaluator.evaluate(lr_predictions)
print(f"Logistic Regression accuracy: {lr_accuracy * 100:.2f}%")



# Gradient Boosted Trees
gbt_lp = GBTClassifier(featuresCol="features", labelCol="label", maxIter=50)

# Training
gbt_model = gbt_lp.fit(train_df_lp)

# predictions
gbt_predictions = gbt_model.transform(test_df_lp)

# Evaluation
gbt_accuracy = evaluator.evaluate(gbt_predictions)
print(f"Gradient-Boosted Trees accuracy: {gbt_accuracy * 100:.2f}%")

Accuratezza del modello Random Forest: 94.23%
Accuratezza Logistic Regression: 98.08%
Accuratezza Gradient-Boosted Trees: 98.08%


### TF-IDF only

In [12]:
# Loads dataset
dftf = pd.read_pickle("/content/drive/MyDrive/audiozzi/samplesSoloTFIDF.pkl")

# DataFrame conversion
spark_df = spark.createDataFrame([
    Row(filename=row["filename"],
        features=DenseVector(row["features"]),  # DenseVector
        label=int(row["label"]))  # label check
    for _, row in dftf.iterrows()
])

#dataset split

train_df_tf, test_df_tf = spark_df.randomSplit([0.8, 0.2], seed=42)

In [13]:
# Random forest
rf_tf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=50)

# Training
model = rf_tf.fit(train_df_tf)

# Predictions
predictions = model.transform(test_df_tf)

#predictions.select("features", "label", "prediction").show(10)

# Evaluator setting
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

# Evaluation
accuracy = evaluator.evaluate(predictions)
print(f"Random Forest accuracy: {accuracy * 100:.2f}%")



# Logistic regression
lr_tf = LogisticRegression(featuresCol="features", labelCol="label")

# Training
lr_model = lr_tf.fit(train_df_tf)

# predictions
lr_predictions = lr_model.transform(test_df_tf)

# Evaluation
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
lr_accuracy = evaluator.evaluate(lr_predictions)
print(f"Logistic Regression accuracy: {lr_accuracy * 100:.2f}%")



# GBT
gbt_tf = GBTClassifier(featuresCol="features", labelCol="label", maxIter=50)

# Training
gbt_model = gbt_tf.fit(train_df_tf)

# predictions
gbt_predictions = gbt_model.transform(test_df_tf)

# Evaluation
gbt_accuracy = evaluator.evaluate(gbt_predictions)
print(f"Gradient-Boosted Trees accuracy: {gbt_accuracy * 100:.2f}%")

Accuratezza del modello Random Forest: 100.00%
Accuratezza Logistic Regression: 100.00%
Accuratezza Gradient-Boosted Trees: 90.38%
