In [1]:
# !pip install fasttext
# !pip install --upgrade scikit-learn

In [2]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [3]:
# Load the training split and keep only the phrase text, the intent name and the numeric label.
# NOTE(review): this absolute, user-specific path only resolves on the author's machine;
# consider deriving it from a configurable data directory.
train_data = pd.read_csv("/Users/kunalindore/Library/CloudStorage/OneDrive-NortheasternUniversity/Capstone/Multi-Modal-Intent-Recognition-in-Healthcare/metadata/train_metadata.csv")
# .copy() turns the column subset into an independent frame, so the in-place rename and
# column assignment performed in later cells cannot raise pandas' SettingWithCopyWarning.
train_data = train_data[["phrase", "medical intent", "label"]].copy()
train_data.head()


Unnamed: 0,phrase,medical intent,label
0,When I carry heavy things I feel like breaking...,Dermatological Issues,1
1,there is too much pain when i move my arm,Abdominal Pain and Weakness,0
2,My son had his lip pierced and it is swollen a...,Wound Trauma,5
3,My muscles in my lower back are aching,Wound Trauma,5
4,I have muscle pain in my left leg,Musculoskeletal Pain,3


In [4]:
# Load the test split and keep only the phrase text, the intent name and the numeric label.
# NOTE(review): this absolute, user-specific path only resolves on the author's machine;
# consider deriving it from a configurable data directory.
test_data = pd.read_csv("/Users/kunalindore/Library/CloudStorage/OneDrive-NortheasternUniversity/Capstone/Multi-Modal-Intent-Recognition-in-Healthcare/metadata/test_metadata.csv")
# .copy() turns the column subset into an independent frame, so the in-place rename and
# column assignment performed in later cells cannot raise pandas' SettingWithCopyWarning.
test_data = test_data[["phrase", "medical intent", "label"]].copy()
test_data.head()


Unnamed: 0,phrase,medical intent,label
0,I have a sharp pain in my lower stomach.,Abdominal Pain and Weakness,0
1,Don't cry,Musculoskeletal Pain,3
2,When I get out of bed in the morning my body f...,Abdominal Pain and Weakness,0
3,i have a great pain in my thorax from heart in...,Abdominal Pain and Weakness,0
4,I have a hard muscle pain since i went to the gym,Musculoskeletal Pain,3


In [6]:
# Load the validation split and keep only the phrase text, the intent name and the numeric label.
# NOTE(review): this absolute, user-specific path only resolves on the author's machine;
# consider deriving it from a configurable data directory.
validate_data = pd.read_csv("/Users/kunalindore/Library/CloudStorage/OneDrive-NortheasternUniversity/Capstone/Multi-Modal-Intent-Recognition-in-Healthcare/metadata/validation_metadata.csv")
# .copy() turns the column subset into an independent frame, so the in-place rename and
# column assignment performed in later cells cannot raise pandas' SettingWithCopyWarning.
validate_data = validate_data[["phrase", "medical intent", "label"]].copy()
validate_data.head()


Unnamed: 0,phrase,medical intent,label
0,I read a book for along time and when I finish...,Head and Neck Discomfort,2
1,My hair is falling out in huge amount,Dermatological Issues,1
2,i feel pain in my stomach,Abdominal Pain and Weakness,0
3,"my child has cough all night, she can't sleep",Respiratory and Ear Issues,4
4,I feel dizzy when I set in-front of my laptop ...,Head and Neck Discomfort,2


In [7]:
# Class balance of the training split: number of phrases per 'medical intent' value
train_intents = train_data['medical intent']
train_class_counts = train_intents.value_counts()
print(train_class_counts)

Musculoskeletal Pain           1075
Respiratory and Ear Issues      939
Head and Neck Discomfort        927
Abdominal Pain and Weakness     891
Dermatological Issues           764
Wound Trauma                    664
Name: medical intent, dtype: int64


In [8]:
# Class balance of the test split: number of phrases per 'medical intent' value
test_intents = test_data['medical intent']
test_class_counts = test_intents.value_counts()
print(test_class_counts)

Musculoskeletal Pain           77
Abdominal Pain and Weakness    72
Respiratory and Ear Issues     57
Head and Neck Discomfort       52
Dermatological Issues          45
Wound Trauma                   42
Name: medical intent, dtype: int64


In [9]:
# Class balance of the validation split: number of phrases per 'medical intent' value
validate_intents = validate_data['medical intent']
validate_class_counts = validate_intents.value_counts()
print(validate_class_counts)

Musculoskeletal Pain           73
Head and Neck Discomfort       63
Respiratory and Ear Issues     63
Abdominal Pain and Weakness    60
Dermatological Issues          45
Wound Trauma                   38
Name: medical intent, dtype: int64


In [10]:
# Preprocess text
# Lazily-filled cache so the stop-word list and lemmatizer are built once,
# not once per row when preprocess_text is used with Series.apply.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return ``(stop_words, lemmatizer)``, building them on first use."""
    if not _PREPROCESS_RESOURCES:
        stop_words = frozenset(stopwords.words('english'))
        lemma = WordNetLemmatizer()
        # Store both together so a failure above never leaves a half-filled cache.
        _PREPROCESS_RESOURCES.update(stop_words=stop_words, lemma=lemma)
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemma']


def preprocess_text(text):
    """Normalise a phrase for TF-IDF vectorisation.

    Steps, in order: drop every character that is not an ASCII letter or
    whitespace, lower-case, tokenize with NLTK's ``word_tokenize``, remove
    English stop words, and lemmatize the remaining tokens with
    ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        Raw phrase. Any non-string value (for example the NaN that pandas
        produces for an empty CSV cell) is treated as an empty phrase.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces; ``''`` when
        nothing survives or the input is not a string.
    """
    # re.sub raises TypeError on non-strings; a missing phrase has no tokens.
    if not isinstance(text, str):
        return ''

    stop_words, lemma = _get_preprocess_resources()

    # Punctuation (including apostrophes) and digits are removed before
    # tokenization, so the tokens below consist of letters only.
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    tokens = word_tokenize(cleaned_text)

    # Filter and lemmatize in a single pass over the tokens.
    return ' '.join(lemma.lemmatize(word) for word in tokens if word not in stop_words)

In [11]:
# Give all three splits the same column names: "medical intent" -> "intent", "phrase" -> "audio_phrase"
column_renames = {"medical intent": "intent", "phrase": "audio_phrase"}
for split_frame in (train_data, test_data, validate_data):
    # In place, so train_data / test_data / validate_data keep referring to the renamed frames
    split_frame.rename(columns=column_renames, inplace=True)

In [12]:
# Clean every phrase of each split with preprocess_text, overwriting the raw text column
for split_frame in (train_data, test_data, validate_data):
    split_frame['audio_phrase'] = split_frame['audio_phrase'].apply(preprocess_text)

In [13]:
# Spot-check the first rows of the training split after renaming and text preprocessing
train_data.head()

Unnamed: 0,audio_phrase,intent,label
0,carry heavy thing feel like breaking back,Dermatological Issues,1
1,much pain move arm,Abdominal Pain and Weakness,0
2,son lip pierced swollen skin inside lip grey l...,Wound Trauma,5
3,muscle lower back aching,Wound Trauma,5
4,muscle pain left leg,Musculoskeletal Pain,3


In [15]:
# One representative row per numeric label (the first occurrence in the training split)
unique_dataset = train_data.drop_duplicates(subset=['label'])

# label id -> intent name, in the order the labels first appear
label_to_intent_mapping = {
    label: intent
    for label, intent in zip(unique_dataset['label'], unique_dataset['intent'])
}

# Same mapping ordered by ascending label id, plus its inverse (intent name -> label id)
id2label = {label: label_to_intent_mapping[label] for label in sorted(label_to_intent_mapping)}
label2id = {intent: label for label, intent in id2label.items()}

In [16]:
# Show both lookup tables, separated by a 100-character rule
print(label2id, '-'*100, id2label, sep='\n')

{'Abdominal Pain and Weakness': 0, 'Dermatological Issues': 1, 'Head and Neck Discomfort': 2, 'Musculoskeletal Pain': 3, 'Respiratory and Ear Issues': 4, 'Wound Trauma': 5}
----------------------------------------------------------------------------------------------------
{0: 'Abdominal Pain and Weakness', 1: 'Dermatological Issues', 2: 'Head and Neck Discomfort', 3: 'Musculoskeletal Pain', 4: 'Respiratory and Ear Issues', 5: 'Wound Trauma'}


In [19]:
# TF-IDF features: the vocabulary and IDF weights are learned from the training split only;
# the test and validation splits are transformed with that fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer(smooth_idf=True, sublinear_tf=True)

X_train, y_train = tfidf_vectorizer.fit_transform(train_data['audio_phrase']), train_data['label']
X_test, y_test = tfidf_vectorizer.transform(test_data['audio_phrase']), test_data['label']
X_val, y_val = tfidf_vectorizer.transform(validate_data['audio_phrase']), validate_data['label']

In [20]:
# Baseline: random forest with default hyperparameters and a fixed seed
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train, y_train)

# Predict on every split
y_pred_train = random_forest.predict(X_train)
y_pred_test = random_forest.predict(X_test)
y_pred_val = random_forest.predict(X_val)

# Print one classification report per split, in train / test / validation order
split_reports = (
    ("Train data classification report:", y_train, y_pred_train),
    ("Test data classification report:", y_test, y_pred_test),
    ("Validation data classification report:", y_val, y_pred_val),
)
for report_title, y_true, y_pred in split_reports:
    print(report_title)
    print(classification_report(y_true, y_pred))

Train data classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       891
           1       1.00      1.00      1.00       764
           2       1.00      1.00      1.00       927
           3       1.00      1.00      1.00      1075
           4       1.00      1.00      1.00       939
           5       1.00      1.00      1.00       664

    accuracy                           1.00      5260
   macro avg       1.00      1.00      1.00      5260
weighted avg       1.00      1.00      1.00      5260

Test data classification report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        72
           1       1.00      1.00      1.00        45
           2       1.00      1.00      1.00        52
           3       1.00      0.99      0.99        77
           4       1.00      1.00      1.00        57
           5       1.00      1.00      1.00        42

    accura

In [21]:
# Implement Random Forest
# Re-creates the default-parameter forest that is handed to GridSearchCV in the next cell.
# NOTE(review): this fit repeats the one already done in the baseline cell with the same
# seed and data; GridSearchCV is documented to clone its estimator, so the search itself
# presumably does not need this instance to be fitted - confirm before removing.
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train, y_train)

In [22]:
# Hyperparameter grid for the random forest: 3 x 3 x 3 x 3 = 81 candidate settings
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
# 10-fold cross-validation over every candidate (810 fits plus the final refit).
# n_jobs=-1 spreads the independent fits over all CPU cores; the estimator carries a
# fixed random_state, so the selected parameters do not depend on the degree of parallelism.
grid_search = GridSearchCV(estimator=random_forest, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train, y_train)

In [23]:
# Estimator selected by the grid search; this is the model evaluated and saved below
best_random_forest = grid_search.best_estimator_

In [24]:
# Hyperparameter combination chosen by the grid search
best_params = grid_search.best_params_
print("Best parameters:", best_params)

Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}


In [25]:
# Evaluate the tuned model: predict on every split
y_pred_train = best_random_forest.predict(X_train)
y_pred_test = best_random_forest.predict(X_test)
y_pred_val = best_random_forest.predict(X_val)

# Print one classification report per split, in train / test / validation order
split_reports = (
    ("Train data classification report:", y_train, y_pred_train),
    ("Test data classification report:", y_test, y_pred_test),
    ("Validation data classification report:", y_val, y_pred_val),
)
for report_title, y_true, y_pred in split_reports:
    print(report_title)
    print(classification_report(y_true, y_pred))

Train data classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       891
           1       1.00      1.00      1.00       764
           2       1.00      1.00      1.00       927
           3       1.00      1.00      1.00      1075
           4       1.00      1.00      1.00       939
           5       1.00      1.00      1.00       664

    accuracy                           1.00      5260
   macro avg       1.00      1.00      1.00      5260
weighted avg       1.00      1.00      1.00      5260

Test data classification report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        72
           1       1.00      1.00      1.00        45
           2       1.00      1.00      1.00        52
           3       1.00      0.99      0.99        77
           4       1.00      1.00      1.00        57
           5       1.00      1.00      1.00        42

    accura

In [27]:
# !pip install joblib

In [28]:
import joblib

# Persist the tuned classifier together with the TF-IDF vectorizer it was trained on;
# both files are needed to turn a new phrase into a prediction.
# NOTE(review): this absolute, user-specific directory only resolves on the author's machine;
# consider deriving it from a configurable project root.
MODEL_DIR = '/Users/kunalindore/Library/CloudStorage/OneDrive-NortheasternUniversity/Capstone/Multi-Modal-Intent-Recognition-in-Healthcare/project/models'

# Use one pickle protocol for both artifacts so whatever runtime can load the model can
# also load its vectorizer (previously only the model was pinned to protocol 4).
PICKLE_PROTOCOL = 4

joblib.dump(best_random_forest, MODEL_DIR + '/random_forest_model_v3.pkl', protocol=PICKLE_PROTOCOL)
joblib.dump(tfidf_vectorizer, MODEL_DIR + '/tfidf_vectorizer_v3.pkl', protocol=PICKLE_PROTOCOL)


['/Users/kunalindore/Library/CloudStorage/OneDrive-NortheasternUniversity/Capstone/Multi-Modal-Intent-Recognition-in-Healthcare/project/models/tfidf_vectorizer_v3.pkl']