In [36]:
pip install nltk scikit-learn imbalanced-learn


Collecting imbalanced-learn
  Downloading imbalanced_learn-0.12.0-py3-none-any.whl.metadata (8.2 kB)
Downloading imbalanced_learn-0.12.0-py3-none-any.whl (257 kB)
   ---------------------------------------- 0.0/257.7 kB ? eta -:--:--
   ---- ---------------------------------- 30.7/257.7 kB 660.6 kB/s eta 0:00:01
   ------------------- -------------------- 122.9/257.7 kB 1.2 MB/s eta 0:00:01
   ---------------------------------------- 257.7/257.7 kB 2.3 MB/s eta 0:00:00
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.12.0


In [1]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shrey\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shrey\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shrey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
train_metadata = pd.read_csv('./data/metadata_train.csv')
val_metadata = pd.read_csv('./data/metadata_validate.csv')
test_metadata = pd.read_csv('./data/metadata_test.csv')

In [3]:
train_metadata = train_metadata[['phrase', 'merged_prompt']].rename(columns={'phrase':'audio', 'merged_prompt':'intent'})
val_metadata = val_metadata[['phrase', 'merged_prompt']].rename(columns={'phrase':'audio', 'merged_prompt':'intent'})
test_metadata = test_metadata[['phrase', 'merged_prompt']].rename(columns={'phrase':'audio', 'merged_prompt':'intent'})

In [4]:
train_metadata.head()

Unnamed: 0,audio,intent
0,when i remember her i feel down,Emotional and mental health
1,when i carry heavy things i feel like breaking...,Hair and skin issues
2,there is too much pain when i move my arm,Chest pain
3,my son had his lip pierced and it is swollen a...,Wound and injury
4,my muscles in my lower back are aching,Wound and injury


In [5]:
train_metadata['intent'].unique()

array(['Emotional and mental health', 'Hair and skin issues',
       'Chest pain', 'Wound and injury', 'Leg and foot pain',
       'Shoulder pain', nan, 'General weakness', 'Dizziness and vertigo',
       'Neck, back or spinal issues', 'Internal pain', 'Sensory issues',
       'Muscle and joint pain', 'Respiratory issue', 'Digestive issues',
       'Feeling cold/hot'], dtype=object)

In [6]:
train_metadata['intent'].nunique()

15

In [7]:
train_metadata['intent'].value_counts()

intent
Hair and skin issues           764
Wound and injury               664
Muscle and joint pain          526
Leg and foot pain              472
Sensory issues                 458
Neck, back or spinal issues    451
Shoulder pain                  278
Respiratory issue              266
Dizziness and vertigo          256
Chest pain                     231
Digestive issues               230
Feeling cold/hot               230
General weakness               215
Internal pain                  215
Emotional and mental health    204
Name: count, dtype: int64

In [8]:
def sample_data(data):
    min_class_count = min(data['intent'].value_counts())
    grouped = data.groupby('intent')
    data = grouped.apply(lambda x: x.sample(min_class_count))
    data = data.reset_index(drop=True)
    return data

In [9]:
def preprocess_text(text):
    cleaned_text = re.sub(r'[^a-zA-Z\s]','', text) 
    cleaned_text = cleaned_text.lower()
    stop_words = set(stopwords.words('english'))
    Lemma = WordNetLemmatizer()
    tokens = word_tokenize(cleaned_text)
    cleaned_text = ' '.join([word for word in tokens if word not in stop_words])
    cleaned_text = ' '.join([Lemma.lemmatize(word) for word in cleaned_text.split()])
    return cleaned_text

In [10]:
sampled_train_metadata = sample_data(train_metadata).sample(frac=1).reset_index(drop=True)
sampled_val_metadata = sample_data(val_metadata).sample(frac=1).reset_index(drop=True)
sampled_test_metadata = sample_data(test_metadata).sample(frac=1).reset_index(drop=True)

  data = grouped.apply(lambda x: x.sample(min_class_count))
  data = grouped.apply(lambda x: x.sample(min_class_count))
  data = grouped.apply(lambda x: x.sample(min_class_count))


In [11]:
sampled_train_metadata['audio'] = sampled_train_metadata['audio'].apply(preprocess_text)
sampled_val_metadata['audio'] = sampled_val_metadata['audio'].apply(preprocess_text)
sampled_test_metadata['audio'] = sampled_test_metadata['audio'].apply(preprocess_text)

In [12]:
train_metadata['audio'] = train_metadata['audio'].apply(preprocess_text)
val_metadata['audio'] = val_metadata['audio'].apply(preprocess_text)
test_metadata['audio'] = test_metadata['audio'].apply(preprocess_text)

In [13]:
sampled_train_metadata.head()

Unnamed: 0,audio,intent
0,feel dizzy set infront laptop hour two possibl...,Dizziness and vertigo
1,im hearing well problem ear,Sensory issues
2,hair falling huge amount,Hair and skin issues
3,pain internal,Internal pain
4,dont know im constantly sad,Emotional and mental health


In [14]:
train_metadata.head()

Unnamed: 0,audio,intent
0,remember feel,Emotional and mental health
1,carry heavy thing feel like breaking back,Hair and skin issues
2,much pain move arm,Chest pain
3,son lip pierced swollen skin inside lip grey l...,Wound and injury
4,muscle lower back aching,Wound and injury


In [15]:
X_train_sample = sampled_train_metadata['audio']
X_val_sample = sampled_val_metadata['audio']
X_test_sample = sampled_test_metadata['audio']

X_train = train_metadata['audio']
X_val = val_metadata['audio']
X_test = test_metadata['audio']

In [16]:
from sklearn.preprocessing import LabelEncoder

lr = LabelEncoder()
y_train_sample = lr.fit_transform(sampled_train_metadata['intent'])
y_val_sample = lr.transform(sampled_val_metadata['intent'])
y_test_sample = lr.transform(sampled_test_metadata['intent'])

lr = LabelEncoder()
y_train = lr.fit_transform(train_metadata['intent'])
y_val = lr.transform(val_metadata['intent'])
y_test = lr.transform(test_metadata['intent'])

In [17]:
tfidf = TfidfVectorizer()
X_train_sample_tfidf = tfidf.fit_transform(X_train_sample)
X_val_sample_tfidf = tfidf.transform(X_val_sample)
X_test_sample_tfidf = tfidf.transform(X_test_sample)

tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)
X_test_tfidf = tfidf.transform(X_test)

#### Naive Bayes

##### Need to sample data to deal with class imbalance

In [20]:
from sklearn.naive_bayes import MultinomialNB

nb_clf = MultinomialNB()
nb_clf.fit(X_train_sample_tfidf, y_train_sample)

In [21]:
y_val_pred = nb_clf.predict(X_val_sample_tfidf)
print('Validation Accuracy:', round(accuracy_score(y_val_sample, y_val_pred), 4))

Validation Accuracy: 0.9778


In [22]:
y_test_pred = nb_clf.predict(X_test_sample_tfidf)
print('Test Accuracy:', round(accuracy_score(y_test_sample, y_test_pred), 4))

Test Accuracy: 0.9619


#### SMOTE : Synthetic Minority Over-sampling Technique

##### Generates synthetic samples for the minority class, which helps in balancing the class distribution

In [35]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_resampled_tfidf, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)

In [36]:
from collections import Counter

Counter(y_train_resampled)

Counter({3: 764,
         6: 764,
         0: 764,
         14: 764,
         8: 764,
         13: 764,
         15: 764,
         5: 764,
         2: 764,
         10: 764,
         7: 764,
         12: 764,
         9: 764,
         11: 764,
         1: 764,
         4: 764})

In [37]:
nb_clf = MultinomialNB()
nb_clf.fit(X_train_resampled_tfidf, y_train_resampled)

In [38]:
y_val_pred = nb_clf.predict(X_val_tfidf)
print('Validation Accuracy:', round(accuracy_score(y_val, y_val_pred), 4))

Validation Accuracy: 0.987


In [40]:
y_test_pred = nb_clf.predict(X_test_tfidf)
print('Test Accuracy:', round(accuracy_score(y_test, y_test_pred), 4))

Test Accuracy: 0.979


#### Random Forest

##### No need to sample data, since RF specifically deals with imbalanced data

In [42]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=50, random_state=42)
rf_clf.fit(X_train_tfidf, y_train)

In [43]:
y_val_pred = rf_clf.predict(X_val_tfidf)
print('Validation Accuracy:', round(accuracy_score(y_val, y_val_pred), 4))

Validation Accuracy: 0.9974


In [44]:
y_test_pred = rf_clf.predict(X_test_tfidf)
print('Test Accuracy:', round(accuracy_score(y_test, y_test_pred), 4))

Test Accuracy: 0.9974
