In [None]:
import sys
is_colab = 'google.colab' in sys.modules

In [None]:
sys.executable

In [None]:
# Mount Google Drive, but only when the notebook really runs inside Colab:
# the `google.colab` package does not exist elsewhere and importing it would
# abort the notebook before any analysis starts.
if is_colab:
    from google.colab import drive

    # Attempt to mount the drive; if it fails, provide troubleshooting steps
    try:
        drive.mount('/content/drive')
    except ValueError:
        print("Drive mounting failed. Please try the following steps:")
        print("1. Ensure you have a stable internet connection.")
        print("2. Check your Google Drive authorization.")
        print("3. Restart the runtime and try again.")
        print("4. If the issue persists, search for solutions online or report the error to Google Colab support.")
else:
    print("Not running in Google Colab; skipping Google Drive mount.")

In [None]:
!pip install altair

In [None]:
!pip install spacy

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
import altair as alt
import pickle
import string
import spacy
import nltk
import re

from sklearn.naive_bayes import *
from sklearn.ensemble import *
from sklearn.neighbors import *
from sklearn.tree import *
from sklearn.calibration import *
from sklearn.linear_model import *
from sklearn.multiclass import *
from sklearn.svm import *
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from nltk.stem import WordNetLemmatizer
from collections import Counter

from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, auc, roc_curve
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline

#nltk.download('stopwords')
sns.set(style='whitegrid')
%matplotlib inline
warnings.filterwarnings('ignore')

# Load data
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Data/Medical Speech, Transcription, and Intent/recordings/overview-of-recordings.csv')
df.head()

# **Exploratory Data Analysis**

In [None]:
#Analyze Data
def explore_data(df):
    """Print a quick overview of a DataFrame.

    Prints the number of rows and columns, the column index, and the
    summary produced by ``DataFrame.info()``.

    Args:
        df: The pandas DataFrame to describe.

    Returns:
        None. Everything is written to standard output.
    """
    print(f"The data contains {df.shape[0]} rows and {df.shape[1]} columns.")
    print('\n')
    print('Dataset columns:',df.columns)
    print('\n')
    # info() writes its report itself and returns None, so wrapping it in
    # print() only appended a stray "None" line to the output.
    df.info()

explore_data(df)

# **Checking for NaN Values and Duplicates**

In [None]:
df.isna().sum()

In [None]:
def checking_removing_duplicates(df):
    """Report the number of fully duplicated rows and drop them in place.

    Args:
        df: DataFrame to inspect. It is modified in place when duplicates
            are found.

    Returns:
        None.
    """
    n_duplicates = df.duplicated().sum()
    print("Number of Duplicates: ", n_duplicates)
    if n_duplicates < 1:
        print('No Duplicate values')
        return
    # Mutates the caller's frame, keeping the first occurrence of each row.
    df.drop_duplicates(inplace=True)
    print('Duplicate values removed!')
checking_removing_duplicates(df)

# **Corpus**

In [None]:
# Work on an independent copy of the two text columns: later cells add new
# columns to `df_text`, and assigning into a plain slice of `df` is a
# chained assignment (SettingWithCopyWarning, undefined write-through).
df_text = df[['phrase', 'prompt']].copy()
df_text

# **Document-Term Matrix**

In [None]:
# NOTE: the former `!pip install -U scikit-learn` was removed from this cell.
# scikit-learn is already imported by the time this cell runs, so upgrading
# it here cannot change the code loaded in the running kernel; install or
# pin the required version in the setup cells before the imports instead.
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts: one row per phrase, one column per vocabulary term.
cv = CountVectorizer()
df_cv = cv.fit_transform(df_text.phrase)
# Use get_feature_names_out() instead of get_feature_names()
data_dtm = pd.DataFrame(
    df_cv.toarray(),
    columns=cv.get_feature_names_out(),
    index=df_text.index,  # keep rows aligned with df_text
)
data_dtm

In [None]:
# Add simple length-based features for every phrase.
# Number of characters in the text
df_text['phrase_length'] = df_text['phrase'].apply(len)
# Number of whitespace-separated words in the text
df_text['phrase_num_words'] = df_text['phrase'].apply(lambda x: len(x.split()))
# Average length of the words in the text
df_text["mean_word_len"] = df_text["phrase"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
# Number of non-stopwords in the text. Tokens are lower-cased before the
# lookup because the stop-word list is lower-case; otherwise capitalised
# stop words such as "I" or "The" were counted as content words.
df_text['phrase_non_stopwords'] = df_text['phrase'].apply(
    lambda x: len([t for t in x.split() if t.lower() not in STOP_WORDS]))
df_text.describe().T

In [None]:
import altair as alt
import pandas as pd

# Number of phrases per ailment category ('prompt').
cat_dist = df_text['prompt'].value_counts().reset_index()
# First column holds the category, second the number of phrases. The
# previous names ('Count', 'count') were swapped relative to the axis
# titles, so categories were titled "Count" and counts "Category".
cat_dist.columns = ['prompt', 'count']

# Horizontal bars, largest class first.
alt.Chart(cat_dist).mark_bar(opacity=0.7).encode(
    x=alt.X('count:Q', title='Count'),
    y=alt.Y('prompt:N', sort='-x', title='Category'),
    tooltip=['prompt', 'count']
).properties(height=800, width=700, title="Class Distribution")

In [None]:
# Print, for every class, its sample count and its share of the corpus
# (classes appear in order of first occurrence).
target = df_text['prompt'].values
counter = Counter(target)
total = len(target)
for label, n_samples in counter.items():
    share = n_samples / total * 100
    print('Class=%s, Count=%d, Percentage=%.3f%%' % (label, n_samples, share))

In [None]:
# Turn off Altair's maximum-row check so the whole of df_text can be charted.
alt.data_transformers.disable_max_rows()
# Histogram of phrase_length: x is the length binned into at most 100 bins,
# y is the number of phrases per bin (its tick labels are hidden).
# NOTE(review): `interpolate='step'` is passed to a bar mark; it looks like a
# line/area option and probably has no visible effect here — confirm.
alt.Chart(df_text).mark_bar(color="violet",opacity=0.7,
    interpolate='step').encode(
    alt.X("phrase_length:Q",  bin=alt.Bin(maxbins=100), title='Phrase Length Class'),
    alt.Y('count()', axis=alt.Axis(labels=False), title='Frequency'),
    tooltip=['phrase_length']
).properties(
    height=400,
    width=700, title="Length Distribution")

In [None]:
import nltk
nltk.download('punkt')

In [None]:
nltk.download('averaged_perceptron_tagger')


In [None]:
nltk.download('wordnet')

In [None]:
nltk.download('omw-1.4')

In [None]:
def clean_txt(docs):
    """Normalise one phrase into a space-separated string of lemmas.

    Steps, applied per token: tokenise with NLTK, lower-case, strip
    punctuation characters, keep purely alphabetic tokens, drop stop words,
    drop tokens of two characters or fewer, lemmatise with WordNet.

    Args:
        docs: The raw text of a single phrase.

    Returns:
        The cleaned tokens joined by single spaces (possibly an empty string).
    """
    lemmatizer = WordNetLemmatizer()
    # Matches any single ASCII punctuation character.
    punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))

    cleaned = []
    for token in nltk.word_tokenize(docs):
        word = punctuation_re.sub('', token.lower())
        # STOP_WORDS is tested directly; the earlier version rebuilt
        # list(STOP_WORDS) and scanned it linearly for every token.
        if word.isalpha() and word not in STOP_WORDS and len(word) > 2:
            # Lemmatise (not stem) the surviving word.
            cleaned.append(lemmatizer.lemmatize(word))
    return ' '.join(cleaned)

# Cleaning the text data
df_text['cleaned_phrase'] = df_text['phrase'].apply(clean_txt)
df_text

In [None]:
from nltk.probability import FreqDist

In [None]:
freq_splits = FreqDist(df_text['phrase'])
print(f"***** 10 most common strings ***** \n{freq_splits.most_common(10)}", "\n")

# Text Data Preparation and Model Training


In [None]:
# Spot-Check Normalized Text Models
def NormalizedTextModel(nameOfvect):
    """Build the spot-check pipelines for one text vectoriser.

    Args:
        nameOfvect: 'countvect' (CountVectorizer), 'tfvect' (TfidfVectorizer)
            or 'hashvect' (HashingVectorizer).

    Returns:
        A list of ``(label, Pipeline)`` tuples; each label is ``nameOfvect``
        followed by a short classifier tag, each pipeline is a vectoriser
        step named 'Vectorizer' followed by one classifier.

    Raises:
        ValueError: If ``nameOfvect`` is not one of the supported names
            (previously this surfaced as an UnboundLocalError).
    """
    vectorizer_classes = {
        'countvect': CountVectorizer,
        'tfvect': TfidfVectorizer,
        'hashvect': HashingVectorizer,
    }
    if nameOfvect not in vectorizer_classes:
        raise ValueError(
            "Unknown vectorizer %r; expected one of %s"
            % (nameOfvect, sorted(vectorizer_classes))
        )
    make_vectorizer = vectorizer_classes[nameOfvect]

    # (label suffix, pipeline step name, classifier). Labels and step names
    # are kept exactly as before so existing output stays comparable.
    # NOTE(review): the 'SVM' label actually wraps a RidgeClassifier and the
    # 'GMB' step name is a transposition of 'GBM'.
    # NOTE(review): XGBClassifier was left out of the spot check (it was
    # commented out in the original list).
    candidates = [
        ('MultinomialNB', 'NB', MultinomialNB()),
        ('CCCV', 'CCCV', CalibratedClassifierCV()),
        ('KNN', 'KNN', KNeighborsClassifier()),
        ('CART', 'CART', DecisionTreeClassifier()),
        ('PAC', 'PAC', PassiveAggressiveClassifier()),
        ('SVM', 'RC', RidgeClassifier()),
        ('AB', 'AB', AdaBoostClassifier()),
        ('GBM', 'GMB', GradientBoostingClassifier()),
        ('RF', 'RF', RandomForestClassifier()),
        ('ET', 'ET', ExtraTreesClassifier()),
        ('SGD', 'SGD', SGDClassifier()),
        ('OVRC', 'OVRC', OneVsRestClassifier(LogisticRegression())),
        ('Bagging', 'Bagging', BaggingClassifier()),
        ('NN', 'NN', MLPClassifier()),
    ]

    # Every pipeline gets its own vectoriser instance so that fitting one
    # pipeline can never alter the fitted state seen by another.
    return [
        (nameOfvect + suffix,
         Pipeline([('Vectorizer', make_vectorizer()), (step_name, classifier)]))
        for suffix, step_name, classifier in candidates
    ]

# Train and cross-validate the models
def fit_model(X_train, y_train,models):
    """Cross-validate each model and print its mean accuracy and spread.

    Args:
        X_train: Training features accepted by the given models.
        y_train: Training labels.
        models: Iterable of ``(name, estimator)`` pairs.

    Returns:
        None. One line ``"<name>: <mean> (<std>)"`` is printed per model.
    """
    # Test options and evaluation metric
    num_folds = 10
    scoring = 'accuracy'

    # The splitter does not depend on the model, so build it once. The old
    # `results` / `names` lists were filled but never read or returned.
    kfold = KFold(n_splits=num_folds)
    for name, model in models:
        cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
        print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))

# Split data to training and validation set
def read_in_and_split_data(data, features,target):
    """Split one feature selection and one target column into train/test parts.

    Args:
        data: DataFrame holding both features and target.
        features: Column label (or list of labels) used as model input.
        target: Column label of the class to predict.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` from an 80/20 split
        with ``random_state=0``.
    """
    return tuple(
        train_test_split(data[features], data[target], test_size=0.2, random_state=0)
    )

X = 'cleaned_phrase'
target_class = 'prompt'
X_train, X_test, y_train, y_test = read_in_and_split_data(df_text, X, target_class)

# Bag of Words Model


## Word Counts with CountVectorizer


In [None]:
# Demonstration of CountVectorizer on the first ten training phrases.
# sample text: first ten entries of X_train
sample_text_count = X_train[:10]
# create the transform (default settings)
vectorizer = CountVectorizer()
# tokenize and build the vocabulary from the sample only
vectorizer.fit(sample_text_count)
# summarize: show the fitted vocabulary_ mapping
print(vectorizer.vocabulary_)
# encode the same sample with the fitted vocabulary
vector = vectorizer.transform(sample_text_count)
# summarize encoded vector: shape, container type, then the dense values
print(vector.shape)
print(type(vector))
print(vector.toarray())

# **Spot-Check Algorithms with CountVectorizer**

In [None]:
# CountVectorizer
models = NormalizedTextModel('countvect')
fit_model(X_train, y_train, models)

## **Word Frequencies with TfidfVectorizer**


In [None]:
# Demonstration of TfidfVectorizer on the first ten training phrases.
# sample text: first ten entries of X_train
sample_text_Tfid = X_train[:10]
# create the transform (default settings)
vectorizer = TfidfVectorizer()
# tokenize and build the vocabulary (and idf weights) from the sample only
vectorizer.fit(sample_text_Tfid)
# summarize: fitted vocabulary_ mapping and the idf_ array
print(vectorizer.vocabulary_)
print(vectorizer.idf_)
# encode the same sample with the fitted vectorizer
vector = vectorizer.transform(sample_text_Tfid)
# summarize encoded vector: shape, then the dense values
print(vector.shape)
print(vector.toarray())

# **Spot-Check Algorithms with TfidfVectorizer**

In [None]:
# TfidfVectorizer
models = NormalizedTextModel('tfvect')
fit_model(X_train, y_train, models)

## Hashing with HashingVectorizer

In [None]:
# Demonstration of HashingVectorizer on the first ten training phrases.
# sample text: first ten entries of X_train
sample_text_hash = X_train[:10]
# create the transform with a 20-column output; no fit() call is made
# before transform() below
vectorizer = HashingVectorizer(n_features=20)
# encode the sample
vector = vectorizer.transform(sample_text_hash)
# summarize encoded vector: shape, then the dense values
print(vector.shape)
print(vector.toarray())

# **Spot-Check Algorithms with HashingVectorizer**

# **Fine Tuning**

In [None]:
# Tune the number of bagged estimators with 10-fold cross-validation.
# The TF-IDF step is part of the searched pipeline so that it is re-fitted on
# the training portion of every fold. Vectorising all of X_train up front (as
# before) let vocabulary and idf statistics from each validation fold leak
# into training and made the reported scores optimistic.
model = Pipeline([('vect', TfidfVectorizer()), ('bagging', BaggingClassifier())])
n_estimators = [10, 100, 1000]

# define grid search; the 'bagging__' prefix routes the parameter to that step
grid = {'bagging__n_estimators': n_estimators}
cv = KFold(n_splits=10)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy')
# X_train must be the raw cleaned text here (the pipeline vectorises it).
grid_result = grid_search.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# **Predict Unseen Data**

In [None]:
def classification_metrics(model, y_test, y_pred, train_features=None,
                           train_labels=None, test_features=None, title=None):
    """Print accuracy scores, plot the confusion matrix and print a report.

    Args:
        model: Fitted estimator or pipeline exposing ``score``.
        y_test: True labels of the validation set.
        y_pred: Labels predicted by ``model`` for the validation set.
        train_features: Training inputs for the training-accuracy line.
            Defaults to the notebook-level ``X_train``.
        train_labels: Training labels. Defaults to the notebook-level
            ``y_train``.
        test_features: Validation inputs. Defaults to the notebook-level
            ``X_test``.
        title: Plot title. Defaults to "Confusion matrix for <estimator
            class>", using the last step when ``model`` is a pipeline.

    Returns:
        None.
    """
    # Fall back to the notebook globals so existing three-argument calls
    # keep working; pass the data explicitly to avoid depending on which
    # cell last rebound X_train / X_test.
    if train_features is None:
        train_features = X_train
    if train_labels is None:
        train_labels = y_train
    if test_features is None:
        test_features = X_test
    if title is None:
        # The title used to be hard-coded to a (misspelt) logistic regression
        # model regardless of what was evaluated.
        final_estimator = model.steps[-1][1] if hasattr(model, 'steps') else model
        title = f'Confusion matrix for {type(final_estimator).__name__}'

    print(f"Training Accuracy Score: {model.score(train_features, train_labels) * 100:.1f}%")
    print(f"Validation Accuracy Score: {model.score(test_features, y_test) * 100:.1f}%")

    conf_matrix = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(8,6))
    # Draw on the axes created above rather than on whatever is current.
    sns.heatmap(pd.DataFrame(conf_matrix), annot=True, cmap='YlGnBu', fmt='g', ax=ax)
    ax.xaxis.set_label_position('top')
    plt.tight_layout()
    plt.title(title, fontsize=20, y=1.1)
    plt.ylabel('Actual label', fontsize=15)
    plt.xlabel('Predicted label', fontsize=15)
    plt.show()
    print(classification_report(y_test, y_pred))

text_clf = Pipeline([('vect', TfidfVectorizer()),('bagging', BaggingClassifier(n_estimators=10))])
model = text_clf.fit(X_train, y_train)
y_pred = model.predict(X_test)
classification_metrics(model,y_test, y_pred)

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier  # Simple deep learning (multi-layer perceptron)
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Function to fit and evaluate models
def fit_model(X_train, y_train, models):
    """Cross-validate each model on integer-encoded labels and print accuracy.

    Args:
        X_train: Training features accepted by the given models.
        y_train: Training labels; they are encoded with LabelEncoder before
            being passed to the models.
        models: Iterable of ``(name, estimator)`` pairs.

    Returns:
        None. One line ``"<name>: <mean> (<std>)"`` is printed per model.
    """
    num_folds = 10
    scoring = 'accuracy'

    # Neither the encoded labels nor the splitter depend on the model, so
    # compute them once instead of once per model. The old `results` /
    # `names` lists were filled but never read or returned.
    y_train_encoded = LabelEncoder().fit_transform(y_train)
    kfold = KFold(n_splits=num_folds)
    for name, model in models:
        cv_results = cross_val_score(model, X_train, y_train_encoded, cv=kfold, scoring=scoring)
        print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))

# Split data to training and validation set
def read_in_and_split_data(data, features, target):
    """Split one feature selection and one target column into train/test parts.

    Args:
        data: DataFrame holding both features and target.
        features: Column label (or list of labels) used as model input.
        target: Column label of the class to predict.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` from an 80/20 split
        with ``random_state=0``.
    """
    return tuple(
        train_test_split(data[features], data[target], test_size=0.2, random_state=0)
    )

# Function to train and predict with Naive Bayes
def naive_bayes_classifier(X_train, X_test, y_train, y_test):
    """Fit a MultinomialNB model, print its test accuracy, return predictions.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels, used only for the accuracy figure.

    Returns:
        The labels predicted for ``X_test``.
    """
    nb_model = MultinomialNB().fit(X_train, y_train)
    y_hat = nb_model.predict(X_test)
    print(f"Naive Bayes Accuracy: {accuracy_score(y_test, y_hat) * 100:.2f}%")
    return y_hat

# Example preprocessing step: Convert text data into numerical format (TF-IDF)
def vectorize_text_data(train_data, test_data, max_features=1000):
    """Turn raw text into TF-IDF feature matrices.

    The vectoriser is fitted on ``train_data`` only and then applied to
    ``test_data``, so no test-set statistics reach the model.

    Args:
        train_data: Iterable of training documents (strings).
        test_data: Iterable of test documents (strings).
        max_features: Passed to ``TfidfVectorizer(max_features=...)`` to cap
            the vocabulary size. Defaults to 1000, the value that used to be
            hard-coded.

    Returns:
        Tuple ``(X_train_tfidf, X_test_tfidf)``.
    """
    vectorizer = TfidfVectorizer(max_features=max_features)
    X_train_tfidf = vectorizer.fit_transform(train_data)
    X_test_tfidf = vectorizer.transform(test_data)
    return X_train_tfidf, X_test_tfidf

# Column names used to pull features and target out of df_text.
X = 'cleaned_phrase'  # text column produced by clean_txt
target_class = 'prompt'  # class column to predict

# Split data (raw text at this point)
X_train_raw, X_test_raw, y_train, y_test = read_in_and_split_data(df_text, X, target_class)

# Vectorize text data using TF-IDF.
# NOTE(review): this rebinds the notebook-level X_train / X_test from raw text
# to TF-IDF matrices. Earlier cells that feed X_train into text pipelines
# (and classification_metrics, which reads these globals) will misbehave if
# they are re-run after this cell.
X_train, X_test = vectorize_text_data(X_train_raw, X_test_raw)

# Define models to train
models = []
models.append(('Random Forest', RandomForestClassifier(n_estimators=100)))
models.append(('XGBoost', XGBClassifier()))
models.append(('Decision Tree', DecisionTreeClassifier()))
models.append(('Deep Learning', MLPClassifier(max_iter=500)))  # multi-layer perceptron

# Cross-validate the models above on the TF-IDF training matrix
fit_model(X_train, y_train, models)

# Train Naive Bayes separately and score it on the held-out test split
naive_bayes_predictions = naive_bayes_classifier(X_train, X_test, y_train, y_test)


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier  # Simple deep learning (multi-layer perceptron)
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Function to fit and evaluate models
def fit_model(X_train, y_train, models):
    """Cross-validate each model on integer-encoded labels and print accuracy.

    Args:
        X_train: Training features accepted by the given models.
        y_train: Training labels; they are encoded with LabelEncoder before
            being passed to the models.
        models: Iterable of ``(name, estimator)`` pairs.

    Returns:
        None. One line ``"<name>: <mean> (<std>)"`` is printed per model.
    """
    num_folds = 10
    scoring = 'accuracy'

    # Neither the encoded labels nor the splitter depend on the model, so
    # compute them once instead of once per model. The old `results` /
    # `names` lists were filled but never read or returned.
    y_train_encoded = LabelEncoder().fit_transform(y_train)
    kfold = KFold(n_splits=num_folds)
    for name, model in models:
        cv_results = cross_val_score(model, X_train, y_train_encoded, cv=kfold, scoring=scoring)
        print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))

# Split data to training and validation set
def read_in_and_split_data(data, features, target):
    """Split one feature selection and one target column into train/test parts.

    Args:
        data: DataFrame holding both features and target.
        features: Column label (or list of labels) used as model input.
        target: Column label of the class to predict.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` from an 80/20 split
        with ``random_state=0``.
    """
    return tuple(
        train_test_split(data[features], data[target], test_size=0.2, random_state=0)
    )

# Function to train and predict with Naive Bayes and print classification report
def naive_bayes_classifier(X_train, X_test, y_train, y_test):
    """Fit MultinomialNB, print test accuracy and a classification report.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels, used for the accuracy figure and the report.

    Returns:
        The labels predicted for ``X_test``.
    """
    nb_model = MultinomialNB().fit(X_train, y_train)
    y_hat = nb_model.predict(X_test)

    # Overall accuracy first, then the per-class breakdown.
    print(f"Naive Bayes Accuracy: {accuracy_score(y_test, y_hat) * 100:.2f}%")
    print("Classification Report:\n", classification_report(y_test, y_hat))

    return y_hat

# Example preprocessing step: Convert text data into numerical format (TF-IDF)
def vectorize_text_data(train_data, test_data, max_features=1000):
    """Turn raw text into TF-IDF feature matrices.

    The vectoriser is fitted on ``train_data`` only and then applied to
    ``test_data``, so no test-set statistics reach the model.

    Args:
        train_data: Iterable of training documents (strings).
        test_data: Iterable of test documents (strings).
        max_features: Passed to ``TfidfVectorizer(max_features=...)`` to cap
            the vocabulary size. Defaults to 1000, the value that used to be
            hard-coded.

    Returns:
        Tuple ``(X_train_tfidf, X_test_tfidf)``.
    """
    vectorizer = TfidfVectorizer(max_features=max_features)
    X_train_tfidf = vectorizer.fit_transform(train_data)
    X_test_tfidf = vectorizer.transform(test_data)
    return X_train_tfidf, X_test_tfidf

# Column names used to pull features and target out of df_text.
X = 'cleaned_phrase'  # text column produced by clean_txt
target_class = 'prompt'  # class column to predict

# Split data (raw text at this point)
X_train_raw, X_test_raw, y_train, y_test = read_in_and_split_data(df_text, X, target_class)

# Vectorize text data using TF-IDF.
# NOTE(review): this rebinds the notebook-level X_train / X_test from raw text
# to TF-IDF matrices; cells that expect raw text in X_train will misbehave if
# they are re-run after this cell.
X_train, X_test = vectorize_text_data(X_train_raw, X_test_raw)

# Define models to train
models = []
models.append(('Random Forest', RandomForestClassifier(n_estimators=100)))
models.append(('XGBoost', XGBClassifier()))
models.append(('Decision Tree', DecisionTreeClassifier()))
models.append(('Deep Learning', MLPClassifier(max_iter=500)))  # multi-layer perceptron

# Cross-validate the models above on the TF-IDF training matrix
fit_model(X_train, y_train, models)

# Train Naive Bayes separately; this version also prints a classification report
naive_bayes_predictions = naive_bayes_classifier(X_train, X_test, y_train, y_test)


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import label_binarize

# Function to plot ROC curve and AUC
def plot_roc_curve(y_test, y_probs, model_name):
    """Draw the ROC curve of one binary classifier and show its AUC.

    Args:
        y_test: True binary labels.
        y_probs: Scores or probabilities for the positive class.
        model_name: Name shown in the plot title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_test, y_probs)
    area = auc(false_pos_rate, true_pos_rate)

    _, ax = plt.subplots()
    ax.plot(false_pos_rate, true_pos_rate, color='darkorange', lw=2,
            label=f'ROC curve (AUC = {area:.2f})')
    # Diagonal reference: performance of random guessing.
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'Receiver Operating Characteristic: {model_name}')
    ax.legend(loc="lower right")
    plt.show()

# Function to fit models and plot ROC curves with AUC
def fit_and_plot_roc(X_train, X_test, y_train, y_test, models):
    """Fit every model and plot its ROC curve on the test split.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Binary training labels.
        y_test: Binary test labels.
        models: Iterable of ``(name, estimator)`` pairs; estimators are
            fitted in place.

    Raises:
        Exception: If an estimator offers neither ``predict_proba`` nor
            ``decision_function``.
    """
    for label, estimator in models:
        estimator.fit(X_train, y_train)

        # Prefer probabilities (second column of predict_proba); fall back to
        # the raw decision scores.
        if hasattr(estimator, "predict_proba"):
            scores = estimator.predict_proba(X_test)[:, 1]
        elif hasattr(estimator, "decision_function"):
            scores = estimator.decision_function(X_test)
        else:
            raise Exception(f'Model {label} does not have predict_proba or decision_function')

        plot_roc_curve(y_test, scores, label)

# Example models (define your models)
# models = [('Logistic Regression', LogisticRegression()), ('Random Forest', RandomForestClassifier())]

# ROC curves need a binary target, so reduce a multi-class problem to
# "one chosen class vs. the rest". The positive class is picked once (the
# first label in sorted order) and applied to BOTH splits. The previous code
# binarised train and test against two independently built `set()`s, whose
# iteration order is arbitrary, so "column 0" could be a different class in
# each split.
if len(set(y_train)) > 2:
    positive_class = sorted(set(y_train))[0]
    y_train = (np.asarray(y_train) == positive_class).astype(int)
    y_test = (np.asarray(y_test) == positive_class).astype(int)

# Train and plot ROC curve with AUC for multiple models
fit_and_plot_roc(X_train, X_test, y_train, y_test, models)

# For Naive Bayes, get probabilities and plot ROC with AUC
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train, y_train)
nb_probs = naive_bayes_model.predict_proba(X_test)[:, 1]  # Probability for class 1
plot_roc_curve(y_test, nb_probs, 'Naive Bayes')


In [None]:
!pip install scispacy
!pip install pysoundfile
!apt-get install libav-tools -y
!apt-get install zip

In [None]:
from fastai.text import *
from fastai.vision import *
import spacy
from spacy import displacy
import scispacy
import librosa
import librosa.display
import soundfile as sf
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
import IPython
import os
from glob import glob
from tqdm import tqdm
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import pylab
import gc
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

**Defining Helper Functions**

In [None]:
def get_wav_info(wav_file):
    """Read an audio file with soundfile.

    Args:
        wav_file: Path (or file-like object) accepted by ``sf.read``.

    Returns:
        Tuple ``(data, rate)`` exactly as produced by ``sf.read``.
    """
    samples, sample_rate = sf.read(wav_file)
    return samples, sample_rate

def create_spectrogram(wav_file, nfft=200, noverlap=120):
    """Plot the spectrogram of an audio file and return its power matrix.

    Adapted from Andrew Ng's Deep Learning Specialization, Course 5.

    Args:
        wav_file: Path of the audio file to read.
        nfft: Length of each window segment (default 200, as before).
        noverlap: Overlap between consecutive windows (default 120, as before).

    Returns:
        The first value returned by ``plt.specgram`` (the spectrum matrix).
    """
    data, rate = get_wav_info(wav_file)
    # 1-D data is used as is; for 2-D data only the first column is analysed.
    signal = data if data.ndim == 1 else data[:, 0]
    # Use the file's real sample rate. The previous hard-coded Fs=8000 gave a
    # wrong frequency/time axis for any file not recorded at 8 kHz.
    pxx, freqs, bins, im = plt.specgram(signal, NFFT=nfft, Fs=rate, noverlap=noverlap)
    return pxx

def create_melspectrogram(filename, name, output_dir='/content/drive/MyDrive/test/spectrograms/'):
    """Render the mel spectrogram of an audio file to ``<output_dir>/<name>.jpg``.

    Args:
        filename: Path of the audio file to load (native sample rate is kept).
        name: Base name of the image file to write, without extension.
        output_dir: Directory that receives the image. Defaults to the
            location that used to be hard-coded; it is created if missing.

    Returns:
        None. The image is written to disk.
    """
    # Imported here because the notebook only imports Path in a later cell.
    from pathlib import Path

    plt.interactive(False)
    clip, sample_rate = librosa.load(filename, sr=None)

    # Small, frameless, axis-free figure: the saved image is just the spectrogram.
    fig = plt.figure(figsize=[0.72,0.72])
    try:
        ax = fig.add_subplot(111)
        ax.axes.get_xaxis().set_visible(False)
        ax.axes.get_yaxis().set_visible(False)
        ax.set_frame_on(False)
        S = librosa.feature.melspectrogram(y=clip, sr=sample_rate)
        librosa.display.specshow(librosa.power_to_db(S, ref=np.max))

        target_dir = Path(output_dir)
        # savefig does not create missing directories.
        target_dir.mkdir(parents=True, exist_ok=True)
        plt.savefig(target_dir / (name + '.jpg'), dpi=400, bbox_inches='tight', pad_inches=0)
    finally:
        # Always release the figure, even if loading or saving failed, so a
        # long batch run does not accumulate open figures.
        plt.close(fig)

def wordBarGraphFunction(df,column,title,top_n=50):
    """Plot a horizontal bar chart of the most frequent non-stop-words.

    Args:
        df: DataFrame containing the text column.
        column: Name of the column with the text; non-string cells are skipped.
        title: Title of the plot.
        top_n: Maximum number of words to show (default 50, as before).

    Returns:
        None. The chart is displayed with ``plt.show()``.
    """
    # Load the stop-word list once; calling stopwords.words() inside the
    # filter re-read it for every distinct word.
    stop_set = set(stopwords.words("english"))

    word_counts = Counter(
        word.lower()
        for text in df[column] if isinstance(text, str)
        for word in text.split()
    )
    top_words = [(w, c) for w, c in word_counts.most_common() if w not in stop_set][:top_n]
    # barh draws from the bottom up, so reverse to put the most frequent on top.
    top_words.reverse()

    # Size everything from the words actually found: a fixed range(50)
    # raised a shape error whenever fewer than 50 words were available.
    positions = range(len(top_words))
    plt.barh(positions, [count for _, count in top_words])
    # Ticks sit on the bar centres (the former +0.5 offset shifted every
    # label half a bar away from its bar).
    plt.yticks(positions, [word for word, _ in top_words])
    plt.title(title)
    plt.show()

def wordCloudFunction(df,column,numWords):
    """Show a word cloud of a text column, sized by word frequency.

    Args:
        df: DataFrame containing the text column.
        column: Name of the column with the text; non-string cells are skipped.
        numWords: Maximum number of words to draw (``max_words``).

    Returns:
        None. The cloud is displayed with ``plt.show()``.
    """
    # Load the NLTK stop-word list once instead of once per word.
    stop_set = set(stopwords.words("english"))

    word_counts = Counter(
        word.lower()
        for text in df[column] if isinstance(text, str)
        for word in text.split()
    )
    # Keep the real counts. The previous version rendered str(list_of_words),
    # in which every distinct word occurs exactly once, so all words ended up
    # with the same weight. Both stop-word lists are applied here because the
    # frequency-based entry point takes the dictionary as given.
    frequencies = {
        word: count for word, count in word_counts.items()
        if word not in stop_set and word not in STOPWORDS
    }
    wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color='white',
                          max_words=numWords,
                          width=1000,height=1000,
                         ).generate_from_frequencies(frequencies)
    plt.clf()
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()

In [None]:
# Read the recordings metadata once and derive every view from it.
recordings_csv_path = '/content/drive/MyDrive/Colab Notebooks/Data/Medical Speech, Transcription, and Intent/recordings/overview-of-recordings.csv'
recordings_raw = pd.read_csv(recordings_csv_path)

# Columns used by the rest of the notebook, without incomplete rows.
overview = recordings_raw[['file_name','phrase','prompt','overall_quality_of_the_audio','speaker_id']].dropna()

# Audio view: spectrogram base name (file name without extension) + label.
# .copy() so that adding a column does not write into a slice of `overview`.
overviewAudio = overview[['file_name','prompt']].copy()
# Remove only a trailing ".wav". str.rstrip('.wav') strips any run of the
# characters '.', 'w', 'a', 'v' and therefore eats into names ending in them.
overviewAudio['spec_name'] = overviewAudio['file_name'].str.replace(r'\.wav$', '', regex=True)
overviewAudio = overviewAudio[['spec_name','prompt']]

# Text view: transcript + label.
overviewText = overview[['phrase','prompt']]

# Full table without any NaN row, also saved to the working directory.
# to_csv() returns None when given a path, so its result must not be
# assigned back to the frame (the old code left noNaNcsv set to None).
noNaNcsv = recordings_raw.dropna()
noNaNcsv.to_csv('overview-of-recordings.csv',index=False)
noNaNcsv.head()

**Data exploratory analysis and visualization.**

In [None]:
overview[110:120]

**The categories of ailments and the quality of the audio descriptions are described below**

In [None]:
sns.set_style("whitegrid")
# Number of recordings per ailment category.
promptsPlot = sns.countplot(y='prompt',data=overview)

# Density of the audio-quality rating, zoomed to ratings >= 2.5.
qualityPlot = sns.FacetGrid(overview,aspect=2.5)
# `fill=` replaces the deprecated `shade=` keyword of kdeplot.
qualityPlot.map(sns.kdeplot,'overall_quality_of_the_audio',fill=True)
qualityPlot.set(xlim=(2.5, overview['overall_quality_of_the_audio'].max()))
qualityPlot.set_axis_labels('overall_quality_of_the_audio', 'Proportion')
qualityPlot

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats

# QQ plot of the audio-quality rating against a normal distribution.
plt.figure(figsize=(4, 4))
stats.probplot(overview['overall_quality_of_the_audio'], dist="norm", plot=plt)
plt.title('QQ Plot of Overall Quality of Audio')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Sample Quantiles')
# Request the grid explicitly: a bare plt.grid() toggles visibility, which
# switches the grid OFF when the seaborn "whitegrid" style already enabled it.
plt.grid(True)
plt.show()

In [None]:
# Boxplot of the audio-quality rating.
plt.figure(figsize=(6, 4))
sns.boxplot(y='overall_quality_of_the_audio', data=overview)
plt.title('Boxplot of Overall Quality of Audio')
plt.ylabel('Overall Quality of Audio')
# Request the grid explicitly: a bare plt.grid() toggles visibility, which
# switches the grid OFF when the seaborn "whitegrid" style already enabled it.
plt.grid(True)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set the aesthetics for the plots
sns.set_style("whitegrid")

# Histogram (30 bins) of the audio-quality rating with a KDE overlay.
plt.figure(figsize=(6,4))
sns.histplot(overview['overall_quality_of_the_audio'], bins=30, kde=True)
plt.title('Histogram of Overall Quality of Audio')
plt.xlabel('Overall Quality of Audio')
plt.ylabel('Frequency')
# Request the grid explicitly: a bare plt.grid() toggles visibility, which
# switches OFF the grid that the "whitegrid" style just enabled.
plt.grid(True)
plt.show()

In [None]:
overview[62:63]

In [None]:
IPython.display.Audio('/content/drive/MyDrive/Colab Notebooks/Data/Medical Speech, Transcription, and Intent/recordings/test/1249120_20518958_23074828.wav')

In [None]:
overview[118:119]

In [None]:
import nltk

# Download the stopwords dataset
nltk.download('stopwords')

In [None]:
plt.figure(figsize=(6,6))
wordCloudFunction(overview,'phrase',10000000)

In [None]:
plt.figure(figsize=(10,10))
wordBarGraphFunction(overview,'phrase',"Most Common Words in Medical Text Transcripts")

In [None]:
from pathlib import Path

In [None]:
pip install fastai

In [None]:
!pip install fastai --upgrade

In [None]:
from fastai.text import *
from fastai.vision import *

In [None]:
!pip install fastai --upgrade

In [None]:
# Import necessary libraries
from fastai.text.all import * # Changed to import using the all wildcard
import numpy as np
from pathlib import Path

# Set random seed
np.random.seed(7)

# Define the path to your data
path = Path('/content/drive/MyDrive/Colab Notebooks/Data/Medical Speech, Transcription, and Intent/recordings/')

# Load the transcripts and their labels from the CSV.
# `text_col` names the column holding the text; the previous `cols=` keyword
# is not the text-column argument of this loader, so the transcript column
# was not selected. `seed` fixes the random train/validation split so that
# reruns are comparable.
data_clas = TextDataLoaders.from_csv(path, 'overview-of-recordings.csv',
                                      text_col='phrase',
                                      label_col='prompt',
                                      valid_pct=0.2,
                                      seed=7,
                                      bs=42)

# Set model path
MODEL_PATH = "/tmp/model/"

# Create a text classifier learner
learn = text_classifier_learner(data_clas, model_dir=MODEL_PATH, arch=AWD_LSTM)

# Fit the model
learn.fit_one_cycle(5)

In [None]:
learn.unfreeze()
learn.fit_one_cycle(5)

In [None]:
interp = ClassificationInterpretation.from_learner(learn)
interp.plot_confusion_matrix(figsize=(10,10), dpi=60)

**Part 3 of 3: Classify Ailment from Audio Description**


In [None]:
testAudio = "/content/drive/MyDrive/Colab Notebooks/Data/Medical Speech, Transcription, and Intent/recordings/train/1249120_44176037_58635902.wav"
x = create_spectrogram(testAudio)

**Mel-frequency cepstrum application**


In [None]:
!pip uninstall scipy
!pip install scipy

In [None]:
filename = "/content/drive/MyDrive/Colab Notebooks/Data/Medical Speech, Transcription, and Intent/recordings/train/1249120_44176037_58635902.wav"
clip, sample_rate = librosa.load(filename, sr=None)
fig = plt.figure(figsize=[5,5])
S = librosa.feature.melspectrogram(y=clip, sr=sample_rate)
librosa.display.specshow(librosa.power_to_db(S, ref=np.max))

In [None]:
from glob import glob

In [None]:
!cat /etc/issue


In [None]:
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestClassifier  # Choose an appropriate model
import librosa
import librosa.display
import matplotlib.pyplot as plt

# Function to create mel spectrograms (Assuming you have this defined)
def create_melspectrogram(filename, name):
    """Return the flattened log-mel spectrogram of an audio file.

    Args:
        filename: Path of the audio file, loaded with ``librosa.load``
            defaults.
        name: Unused; kept so the call signature stays unchanged.

    Returns:
        1-D array: the 128-band mel spectrogram in dB (relative to its own
        maximum), flattened. Its length depends on the clip duration.
    """
    signal, sample_rate = librosa.load(filename)
    mel_db = librosa.power_to_db(
        librosa.feature.melspectrogram(y=signal, sr=sample_rate, n_mels=128),
        ref=np.max,
    )
    return mel_db.flatten()

# Define directories
data_dir_train = np.array(glob("/content/drive/MyDrive/Colab Notebooks/Data/Medical Speech, Transcription, and Intent/recordings/train/*"))
data_dir_test = np.array(glob("/content/drive/MyDrive/Colab Notebooks/Data/Medical Speech, Transcription, and Intent/recordings/test/*"))
data_dir_val = np.array(glob("/content/drive/MyDrive/Colab Notebooks/Data/Medical Speech, Transcription, and Intent/recordings/validate/*"))

# True class of every recording, looked up in the `overview` metadata table.
# This replaces the placeholder that gave every file the same dummy label,
# which left the classifier nothing to learn.
# NOTE(review): assumes overview['file_name'] holds the bare file name
# (e.g. "123_456_789.wav") — confirm against the CSV.
label_by_file = dict(zip(overview['file_name'], overview['prompt']))

# Create mel spectrograms and extract features
features = []
labels = []

for file in tqdm(data_dir_train):
    base_name = file.split('/')[-1]
    if base_name not in label_by_file:
        # No metadata row for this file, so it cannot be labelled.
        continue
    filename, name = file, base_name.split('.')[0]
    features.append(create_melspectrogram(filename, name))
    labels.append(label_by_file[base_name])

# Repeat for test and validation sets if necessary

# One row per recording. Clips differ in length, so shorter rows are padded
# by pandas with NaN; fill those with -80 dB.
# NOTE(review): -80 is taken to be the floor of librosa.power_to_db under its
# default top_db with ref=np.max — confirm.
# A dedicated name is used so the text DataFrame `df` from the start of the
# notebook is not overwritten.
audio_df = pd.DataFrame(features).fillna(-80.0)
audio_df['label'] = labels

# Check the shape of the DataFrame
print("DataFrame shape:", audio_df.shape)

# Ensure there are features and labels before proceeding
if audio_df.empty or audio_df['label'].isnull().all():
    raise ValueError("No data available for training. Check your feature extraction process.")

# Split the dataset into features and target
X = audio_df.drop('label', axis=1)
y = audio_df['label']

# Check if there are enough samples
if len(X) == 0 or len(y) == 0:
    raise ValueError("Feature or target variable is empty.")

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train a classifier
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Generate classification report
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

# Calculate and print MAE, MSE, RMSE on integer class codes.
# Both vectors are encoded with ONE shared mapping. Factorising y_test and
# y_pred separately numbers classes by order of first appearance in each
# vector, so equal codes did not mean equal classes.
# The codes are nominal, so these errors only indicate how often predictions
# differ; they carry no notion of distance between classes.
class_codes = {label: code for code, label in enumerate(sorted(set(y_test) | set(y_pred)))}
y_test_numeric = np.array([class_codes[label] for label in y_test])
y_pred_numeric = np.array([class_codes[label] for label in y_pred])

mae = mean_absolute_error(y_test_numeric, y_pred_numeric)
mse = mean_squared_error(y_test_numeric, y_pred_numeric)
rmse = np.sqrt(mse)

# Print metrics
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
