In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report




ModuleNotFoundError: No module named 'pandas'

In [None]:
# Load the training split and keep only the utterance text and its intent label.
# NOTE(review): the path is absolute and machine-specific — consider a configurable data directory.
train_csv_path = "/Users/kunalindore/Library/CloudStorage/OneDrive-NortheasternUniversity/Capstone/Multi-Modal-Intent-Recognition-in-Healthcare/project/data/metadata_train.csv"
train_columns = ["phrase", "merged_prompt"]
train_data = pd.read_csv(train_csv_path)[train_columns]
train_data.head()


Unnamed: 0,phrase,merged_prompt
0,when i remember her i feel down,Emotional and mental health
1,when i carry heavy things i feel like breaking...,Hair and skin issues
2,there is too much pain when i move my arm,Chest pain
3,my son had his lip pierced and it is swollen a...,Wound and injury
4,my muscles in my lower back are aching,Wound and injury


In [None]:
# Load the test split and keep only the utterance text and its intent label.
# NOTE(review): the path is absolute and machine-specific — consider a configurable data directory.
test_csv_path = "/Users/kunalindore/Library/CloudStorage/OneDrive-NortheasternUniversity/Capstone/Multi-Modal-Intent-Recognition-in-Healthcare/project/data/metadata_test.csv"
test_columns = ["phrase", "merged_prompt"]
test_data = pd.read_csv(test_csv_path)[test_columns]
test_data.head()


Unnamed: 0,phrase,merged_prompt
0,i have a sharp pain in my lower stomach,Digestive issues
1,dont cry,Muscle and joint pain
2,when i get out of bed in the morning my body f...,General weakness
3,i have a great pain in my thorax from heart in...,Chest pain
4,i have a hard muscle pain since i went to the gym,Muscle and joint pain


In [None]:
# Load the validation split and keep only the utterance text and its intent label.
# NOTE(review): the path is absolute and machine-specific — consider a configurable data directory.
validate_csv_path = "/Users/kunalindore/Library/CloudStorage/OneDrive-NortheasternUniversity/Capstone/Multi-Modal-Intent-Recognition-in-Healthcare/project/data/metadata_validate.csv"
validate_columns = ["phrase", "merged_prompt"]
validate_data = pd.read_csv(validate_csv_path)[validate_columns]
validate_data.head()


Unnamed: 0,phrase,merged_prompt
0,i read a book for along time and when i finish...,Headache
1,my hair is falling out in huge amount,Hair and skin issues
2,my foot is hurting so much,Leg and foot pain
3,i feel pain in the lower back,"Neck, back or spinal issues"
4,i feel pain in my stomach,Digestive issues


In [None]:
# Class balance of the training split: rows per merged_prompt label, most frequent first.
train_labels = train_data['merged_prompt']
train_class_counts = train_labels.value_counts()
print(train_class_counts)

Hair and skin issues           764
Wound and injury               664
Muscle and joint pain          526
Leg and foot pain              472
Respiratory issue              470
Sensory issues                 458
Neck, back or spinal issues    451
Shoulder pain                  278
Dizziness and vertigo          256
Chest pain                     231
Headache                       231
Digestive issues               230
Feeling cold/hot               230
General weakness               215
Internal pain                  215
Emotional and mental health    204
Name: merged_prompt, dtype: int64


In [None]:
# Class balance of the test split: rows per merged_prompt label, most frequent first.
test_labels = test_data['merged_prompt']
test_class_counts = test_labels.value_counts()
print(test_class_counts)

Hair and skin issues           45
Wound and injury               42
Muscle and joint pain          39
Sensory issues                 34
Leg and foot pain              29
Neck, back or spinal issues    28
Chest pain                     25
Shoulder pain                  21
Internal pain                  21
Respiratory issue              19
Feeling cold/hot               18
Digestive issues               16
Headache                       16
General weakness               10
Emotional and mental health    10
Dizziness and vertigo           8
Name: merged_prompt, dtype: int64


In [None]:
# Class balance of the validation split: rows per merged_prompt label, most frequent first.
validate_labels = validate_data['merged_prompt']
validate_class_counts = validate_labels.value_counts()
print(validate_class_counts)

Hair and skin issues           45
Wound and injury               38
Respiratory issue              37
Muscle and joint pain          35
Neck, back or spinal issues    31
Leg and foot pain              27
Sensory issues                 24
Shoulder pain                  21
Dizziness and vertigo          19
Emotional and mental health    17
Chest pain                     17
Headache                       16
General weakness               16
Digestive issues               15
Feeling cold/hot               15
Internal pain                  12
Name: merged_prompt, dtype: int64


In [None]:
# Preprocess text
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use only."""
    if not _TEXT_RESOURCES:
        # Loading the stop-word list and constructing the lemmatizer is
        # loop-invariant work; doing it once avoids repeating it for every row.
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise one utterance for TF-IDF vectorisation.

    Steps: drop every character that is not an ASCII letter or whitespace,
    lowercase, tokenize, remove English stop words, and lemmatize each
    remaining token.

    Parameters
    ----------
    text : str
        Raw phrase. Missing or non-string values (e.g. ``None`` or the NaN that
        pandas uses for empty cells) are treated as an empty phrase instead of
        raising ``TypeError``.

    Returns
    -------
    str
        Space-joined lemmatized tokens; ``''`` when nothing is left.
    """
    if not isinstance(text, str):
        return ''

    cleaned_text = _NON_LETTER_PATTERN.sub('', text).lower()
    if not cleaned_text.strip():
        # Nothing but digits/punctuation/whitespace: same result as the full
        # pipeline, without touching the NLTK resources.
        return ''

    stop_words, lemma = _get_text_resources()
    # Tokens contain no whitespace, so filtering and lemmatizing in a single
    # pass is equivalent to joining, re-splitting and lemmatizing.
    return ' '.join(
        lemma.lemmatize(word)
        for word in word_tokenize(cleaned_text)
        if word not in stop_words
    )

In [None]:
# Rename columns
# One shared mapping keeps the three splits in sync. rename() returns a new
# frame, so no frame is mutated in place and the cell stays safe to re-run.
column_renames = {"merged_prompt": "intent", "phrase": "audio_phrase"}
train_data = train_data.rename(columns=column_renames)
test_data = test_data.rename(columns=column_renames)
validate_data = validate_data.rename(columns=column_renames)

In [None]:
# Clean the phrase column of every split with the same preprocessing function.
for split_frame in (train_data, test_data, validate_data):
    split_frame['audio_phrase'] = split_frame['audio_phrase'].apply(preprocess_text)

In [None]:
# Perform label encoding
# Fit the encoder on the training labels ONLY and reuse that mapping for the
# held-out splits. Calling fit_transform on each split refits the encoder, so
# the integer ids agree across splits only when every split happens to contain
# exactly the same set of classes, and `label_encoder` is left fitted on the
# validation labels rather than the training labels.
# transform() raises ValueError if a held-out split contains a label that was
# not seen during fit, which surfaces the problem instead of mis-numbering it.
label_encoder = LabelEncoder()
train_data['intent'] = label_encoder.fit_transform(train_data['intent'])
test_data['intent'] = label_encoder.transform(test_data['intent'])
validate_data['intent'] = label_encoder.transform(validate_data['intent'])


In [None]:
# Map each encoded integer id back to its class name, in encoder order.
label_mapping = dict(enumerate(label_encoder.classes_))
print("Label mapping:", label_mapping)


Label mapping: {0: 'Chest pain', 1: 'Digestive issues', 2: 'Dizziness and vertigo', 3: 'Emotional and mental health', 4: 'Feeling cold/hot', 5: 'General weakness', 6: 'Hair and skin issues', 7: 'Headache', 8: 'Internal pain', 9: 'Leg and foot pain', 10: 'Muscle and joint pain', 11: 'Neck, back or spinal issues', 12: 'Respiratory issue', 13: 'Sensory issues', 14: 'Shoulder pain', 15: 'Wound and injury'}


In [None]:
# Fit the TF-IDF vectorizer on the training phrases only, then project the
# held-out splits with that same fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, smooth_idf=True)
X_train = tfidf_vectorizer.fit_transform(train_data['audio_phrase'])
X_test, X_val = (
    tfidf_vectorizer.transform(split['audio_phrase'])
    for split in (test_data, validate_data)
)

# Targets are the encoded intent column of each split.
y_train, y_test, y_val = (
    split['intent'] for split in (train_data, test_data, validate_data)
)

In [None]:
# Baseline: a default-parameter random forest trained on the TF-IDF features.
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train, y_train)

# Predict all three splits first, then print one report per split
# in train / test / validation order.
baseline_splits = [
    ("Train data classification report:", X_train, y_train),
    ("Test data classification report:", X_test, y_test),
    ("Validation data classification report:", X_val, y_val),
]
y_pred_train, y_pred_test, y_pred_val = [
    random_forest.predict(split_features)
    for _heading, split_features, _targets in baseline_splits
]

for (heading, _features, y_true), y_hat in zip(
    baseline_splits, (y_pred_train, y_pred_test, y_pred_val)
):
    print(heading)
    print(classification_report(y_true, y_hat))

Train data classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       231
           1       0.97      1.00      0.98       230
           2       1.00      1.00      1.00       256
           3       1.00      1.00      1.00       204
           4       1.00      1.00      1.00       230
           5       1.00      1.00      1.00       215
           6       1.00      1.00      1.00       764
           7       1.00      1.00      1.00       231
           8       1.00      0.96      0.98       215
           9       1.00      1.00      1.00       472
          10       1.00      1.00      1.00       526
          11       1.00      1.00      1.00       451
          12       1.00      1.00      1.00       470
          13       1.00      1.00      1.00       458
          14       1.00      1.00      1.00       278
          15       1.00      1.00      1.00       664

    accuracy                           1.00   

In [None]:
# Implement Random Forest
# Re-creates and refits a forest with the same settings (random_state=42) as
# the earlier baseline evaluation cell. The bare fit(...) call is the cell's
# last expression, so the fitted estimator's repr is what the cell displays.
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [None]:
# Define parameter grid for GridSearchCV
# 3 x 3 x 3 x 3 = 81 parameter combinations, each scored with 10-fold
# cross-validation on the training split (810 forest fits).
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
# n_jobs=-1 spreads the fits over all available processors instead of running
# them one after another. The estimator's random_state is fixed, so the
# selected parameters do not depend on the number of workers.
grid_search = GridSearchCV(
    estimator=random_forest,
    param_grid=param_grid,
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=RandomForestClassifier(random_state=42),
             param_grid={'max_depth': [None, 10, 20],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [50, 100, 200]})

In [None]:
# Keep a handle on the estimator selected by the grid search; the evaluation
# cell further down predicts with it.
best_random_forest = grid_search.best_estimator_

In [None]:
# Record and show the hyperparameter combination chosen by the grid search.
best_params = grid_search.best_params_
print("Best parameters:", best_params)

Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [None]:
# Score the tuned forest on every split (train, test, validation, in that order) ...
y_pred_train, y_pred_test, y_pred_val = map(
    best_random_forest.predict, (X_train, X_test, X_val)
)

# ... then print a per-class precision/recall/F1 report for each split.
tuned_reports = (
    ("Train data classification report:", y_train, y_pred_train),
    ("Test data classification report:", y_test, y_pred_test),
    ("Validation data classification report:", y_val, y_pred_val),
)
for heading, y_true, y_hat in tuned_reports:
    print(heading, classification_report(y_true, y_hat), sep="\n")

Train data classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       231
           1       0.97      1.00      0.98       230
           2       1.00      1.00      1.00       256
           3       1.00      1.00      1.00       204
           4       1.00      1.00      1.00       230
           5       1.00      1.00      1.00       215
           6       1.00      1.00      1.00       764
           7       1.00      1.00      1.00       231
           8       1.00      0.96      0.98       215
           9       1.00      1.00      1.00       472
          10       1.00      1.00      1.00       526
          11       1.00      1.00      1.00       451
          12       1.00      1.00      1.00       470
          13       1.00      1.00      1.00       458
          14       1.00      1.00      1.00       278
          15       1.00      1.00      1.00       664

    accuracy                           1.00   

In [None]:
# !pip install joblib



In [None]:
# import joblib
# joblib.dump(best_random_forest, '/Users/kunalindore/Library/CloudStorage/OneDrive-NortheasternUniversity/Capstone/Multi-Modal-Intent-Recognition-in-Healthcare/project/models/random_forest_model.pkl',protocol=4)
# joblib.dump(tfidf_vectorizer, '/Users/kunalindore/Library/CloudStorage/OneDrive-NortheasternUniversity/Capstone/Multi-Modal-Intent-Recognition-in-Healthcare/project/models/tfidf_vectorizer.pkl')
# joblib.dump(label_encoder, '/Users/kunalindore/Library/CloudStorage/OneDrive-NortheasternUniversity/Capstone/Multi-Modal-Intent-Recognition-in-Healthcare/project/models/label_encoder.pkl')

['/Users/kunalindore/Library/CloudStorage/OneDrive-NortheasternUniversity/Capstone/Multi-Modal-Intent-Recognition-in-Healthcare/project/models/label_encoder.pkl']