# Classification with text and audio features
Updates: 
* 03/11 - reorganize the code and rename some variables, 
  change `X` to `x_text`, `X_train` to `x_txt_train`, `X_test` to `x_txt_test`, `Y` to `y_text`


In [3]:
!pip install soundfile

Collecting soundfile
  Downloading https://files.pythonhosted.org/packages/eb/f2/3cbbbf3b96fb9fa91582c438b574cff3f45b29c772f94c400e2c99ef5db9/SoundFile-0.10.3.post1-py2.py3-none-any.whl
Installing collected packages: soundfile
Successfully installed soundfile-0.10.3.post1


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import librosa
import soundfile
import os, glob, pickle
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

from sklearn.utils import class_weight
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

import pickle

## ref: https://data-flair.training/blogs/python-mini-project-speech-emotion-recognition/
#DataFlair - Extract features (mfcc, chroma, mel) from a sound file
def extract_feature(file_name, is_mfcc, is_chroma, is_mel):
    
    X, sample_rate = librosa.load(file_name, sr=None, mono=True)
    
    if is_chroma:
        stft=np.abs(librosa.stft(X))
    result=np.array([])
    if is_mfcc:
        mfccs=np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
        result=np.hstack((result, mfccs))
    if is_chroma:
        chroma=np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
        result=np.hstack((result, chroma))
    if is_mel:
        mel=np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
        result=np.hstack((result, mel))
    return result


#DataFlair - Load the data and extract features for each sound file
def load_data(test_size=0.2):
    x = []
    audio_list = sorted(os.listdir(train_dir))
    # gen = (x for x in audio_list)
    
    gen = (u for u in train_df[['Dialogue_ID', 'Utterance_ID']].values)
    for dia_utt in gen:
        dia_id = dia_utt[0]
        utt_id = dia_utt[1]
        file_name = f"dia{dia_id}_utt{utt_id}.wav"
        
        feature=extract_feature(train_dir + file_name, is_mfcc=True, is_chroma=True, is_mel=True)
        x.append(feature)
    
    labels_1d = np.reshape(labels_matrix, max_dia * max_utt)
    labels_1d = labels_1d[~np.equal(labels_1d, 99)]
    y = list(labels_1d)
    
    return x, y

# for file_name in gen:
#     # file_name=os.path.basename(file) ## in the form of dia*_utt*.wav
#     print(file_name)
#     ## get ids for dialogue 
#     temp = file_name.split("_")
#     dia_id = int(temp[0][3:])
#     utt_id = int(temp[1].split(".")[0][3:])

#     # emotion=labels_matrix[dia_id, utt_id]
#     # if emotion not in emotions:
#     #     continue
#     feature=extract_feature(train_dir + file_name, is_mfcc=True, is_chroma=True, is_mel=True)
#     x_matrix[dia_id, utt_id] = feature

In [0]:
max_dia = 1039
max_utt = 24
is_reload = False ## please set it to False if you are using features saved in pickle file 
test_size = 0.25
total = 9988

## Extract audio features

In [0]:
#train_dir = "../data/train/"
emotions = ['neutral', 'surprise', 'fear', 'sadness', 'joy', 'disgust', 'anger']
emo2int = dict(zip(emotions, range(len(emotions))))
emo2int['empty'] = 99

train_df = pd.read_csv("/content/drive/Shared drives/Mastery Course/Code/train_sent_emo.csv")
## dia125_utt_3 not working
remove_index = train_df[(train_df['Dialogue_ID'] == 125) & (train_df['Utterance_ID'] == 3)].index
train_df = train_df.drop(index=remove_index)

label_pivot = pd.pivot(data=train_df, columns="Utterance_ID", index="Dialogue_ID", values="Emotion").fillna("empty")
label_pivot.loc[60] = ["empty"] * label_pivot.shape[1] ## for some reasons, there is no dialogue 60.
label_pivot = label_pivot.sort_index()
label_pivot = label_pivot.applymap(lambda x: emo2int[x])
labels_matrix = label_pivot.values

In [5]:
train_df.iloc[1163:1166]

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime
1163,1225,"Hey, if mommy can have a wife, daddy can have ...",Ross,neutral,neutral,125,1,4,18,"00:02:35,613","00:02:38,323"
1164,1226,"Ohh, its time to go.",Emily,neutral,neutral,125,2,4,18,"00:02:40,869","00:02:42,119"
1166,1228,"Huh, what can we do in 17 minutes?",Ross,neutral,neutral,125,4,4,18,"00:02:48,209","00:02:51,295"


In [0]:
#DataFlair - Split the dataset

if is_reload:
    x_audio, y = load_data(test_size=test_size)

    with open("/content/drive/Shared drives/Mastery Course/data/train_audio.pickle", 'wb') as pickle_out:
        pickle.dump(x, pickle_out)
    with open("/content/drive/Shared drives/Mastery Course/data/train_labels.pickle", 'wb') as pickle_out:
        pickle.dump(y, pickle_out)
else:
    with open("/content/drive/Shared drives/Mastery Course/data/train_audio.pickle", 'rb') as pickle_in:
        x_audio = pickle.load(pickle_in)
    with open("/content/drive/Shared drives/Mastery Course/data/train_labels.pickle", 'rb') as pickle_in:
        y = pickle.load(pickle_in)
## x_audio was named as x before

## Try some classifiers

In [0]:
## Multi-layer Perceptron with only audio
x_au_train,x_test,y_train,y_test = train_test_split(np.array(x_audio), y, test_size=test_size, random_state=9)
model=MLPClassifier(alpha=0.01, batch_size=256, epsilon=1e-08, hidden_layer_sizes=(300,), learning_rate='adaptive', max_iter=500)
model.fit(x_au_train,y_train)
y_pred=model.predict(x_test)

In [8]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.49      0.84      0.62      1152
           1       0.17      0.12      0.14       308
           2       0.12      0.02      0.03        62
           3       0.00      0.00      0.00       185
           4       0.26      0.09      0.13       439
           5       0.00      0.00      0.00        64
           6       0.30      0.15      0.20       287

    accuracy                           0.44      2497
   macro avg       0.19      0.17      0.16      2497
weighted avg       0.33      0.44      0.35      2497



In [9]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.49      0.84      0.62      1152
           1       0.17      0.12      0.14       308
           2       0.12      0.02      0.03        62
           3       0.00      0.00      0.00       185
           4       0.26      0.09      0.13       439
           5       0.00      0.00      0.00        64
           6       0.30      0.15      0.20       287

    accuracy                           0.44      2497
   macro avg       0.19      0.17      0.16      2497
weighted avg       0.33      0.44      0.35      2497



## Extract text features 
(code from EmotionBaseline.ipynb)

In [0]:
def cleaning(element):
    final_list = []
    """ Clean up text. Tokenize, lowercase, and remove punctuation and stopwords """
    #print("Running cleaner")
    # Remove punctuation, symbols (#) and stopwords
    element = str(element).lower()
    new_string = ''
    for char in element:
        if char not in punctuation_list:
            new_string += char
    all_wrds = new_string.split(" ")
    for word in all_wrds:
        if word not in stop_words_list:
            final_list.append(word)
    final_string = ' '.join(final_list)
    return final_string

def tokenize(str_arg):
    words = str_arg.split()
    return words

In [0]:
# !python -m spacy download en

In [0]:
import spacy
import string

nlp = spacy.load('en')
stop_words_list = list(spacy.lang.en.STOP_WORDS)
punctuation_list = list(string.punctuation)
punctuation_list.extend('\r')
punctuation_list.extend('\n')
punctuation_list.extend(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'])

In [13]:
train_df['cleaned_Utterance'] = train_df['Utterance'].apply(cleaning)
train_df['tokens'] = train_df['cleaned_Utterance'].apply(tokenize)

train_df.index = range(total)
train_df.iloc[1163:1166]

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime,cleaned_Utterance,tokens
1163,1225,"Hey, if mommy can have a wife, daddy can have ...",Ross,neutral,neutral,125,1,4,18,"00:02:35,613","00:02:38,323",hey mommy wife daddy bra,"[hey, mommy, wife, daddy, bra]"
1164,1226,"Ohh, its time to go.",Emily,neutral,neutral,125,2,4,18,"00:02:40,869","00:02:42,119",ohh its time,"[ohh, its, time]"
1165,1228,"Huh, what can we do in 17 minutes?",Ross,neutral,neutral,125,4,4,18,"00:02:48,209","00:02:51,295",huh minutes,"[huh, minutes]"


In [0]:
cleaned_utt = np.reshape(train_df['cleaned_Utterance'].values, (total,1))

In [15]:
y[-10:]

[1, 0, 0, 5, 5, 0, 0, 1, 0, 4]

In [16]:
train_df['Emotion'][-10:]

9978    surprise
9979     neutral
9980     neutral
9981     disgust
9982     disgust
9983     neutral
9984     neutral
9985    surprise
9986     neutral
9987         joy
Name: Emotion, dtype: object

In [0]:
x_text = train_df['cleaned_Utterance'] # it was named as X
y_text = train_df['Emotion'] # was named as Y
x_txt_train, x_txt_test, y_txt_train, y_txt_test = train_test_split(x_text, y_text, test_size = test_size, random_state = 42)

In [18]:
## IMPORTANT
[i for i in range(total + 1) if i not in y_text.index]

[9988]

In [19]:
len(x_audio)

9988

In [20]:
len(x_text)

9988

In [0]:
# train_index = [i if i < 1165 else i - 1 for i in x_txt_train.index ]
# aud_train = np.array(x)[train_index]
# test_index = [i if i < 1165 else i - 1 for i in x_txt_test.index ]
# aud_test = np.array(x)[test_index]

aud_train = np.array(x_audio)[x_txt_train.index]
aud_test = np.array(x_audio)[x_txt_test.index]


In [22]:
## check whether they are aligned
train_df.loc[x_txt_train.index[:10]]['Emotion']

2022     neutral
9873         joy
2542    surprise
2034        fear
2732        fear
8653     neutral
7349    surprise
1219     neutral
6292     neutral
4149         joy
Name: Emotion, dtype: object

In [23]:
y_txt_train[:10]

2022     neutral
9873         joy
2542    surprise
2034        fear
2732        fear
8653     neutral
7349    surprise
1219     neutral
6292     neutral
4149         joy
Name: Emotion, dtype: object

In [24]:
train_df.loc[x_text[1160:1170].index]['Emotion']

1160     neutral
1161         joy
1162         joy
1163     neutral
1164     neutral
1165     neutral
1166     neutral
1167     neutral
1168     neutral
1169    surprise
Name: Emotion, dtype: object

In [25]:
y_text[1160:1170]

1160     neutral
1161         joy
1162         joy
1163     neutral
1164     neutral
1165     neutral
1166     neutral
1167     neutral
1168     neutral
1169    surprise
Name: Emotion, dtype: object

In [0]:
class_weights = class_weight.compute_class_weight('balanced',np.unique(y_train),y_train)

In [0]:
count_vect = CountVectorizer()
x_txt_train_counts = count_vect.fit_transform(x_txt_train)
x_txt_test_counts = count_vect.transform(x_txt_test)

In [28]:
x_txt_train_counts.shape

(7491, 4657)

In [29]:
aud_train.shape

(7491, 180)

In [0]:
from scipy.sparse import coo_matrix, hstack

In [0]:
x_mix_train = hstack((x_txt_train_counts, aud_train))
x_mix_test = hstack((x_txt_test_counts, aud_test))


In [32]:
log_clf_wgt = LogisticRegression(class_weight='balanced').fit(x_mix_train, y_txt_train)

y_log_pred = log_clf_wgt.predict(x_mix_test)
accuracy = accuracy_score(np.array(y_txt_test), y_log_pred)
# Combined report with all above metrics
print(classification_report(y_txt_test, y_log_pred))

              precision    recall  f1-score   support

       anger       0.24      0.29      0.26       312
     disgust       0.06      0.35      0.10        65
        fear       0.02      0.12      0.04        57
         joy       0.24      0.17      0.20       422
     neutral       0.58      0.22      0.32      1152
     sadness       0.11      0.28      0.15       160
    surprise       0.21      0.17      0.19       329

    accuracy                           0.22      2497
   macro avg       0.21      0.23      0.18      2497
weighted avg       0.37      0.22      0.25      2497



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [0]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier

In [36]:
#log_clf_wgt = LogisticRegression(class_weight='balanced').fit(x_mix_train, y_txt_train)
gb_clf2 = GradientBoostingClassifier(n_estimators=20, learning_rate=0.5, max_features=2, max_depth=2, random_state=0)
gb_clf2.fit(x_mix_train, y_txt_train)
predictions = gb_clf2.predict(x_mix_test)
accuracy = accuracy_score(np.array(y_txt_test), y_log_pred)
# Combined report with all above metrics
print(classification_report(y_txt_test, y_log_pred))

              precision    recall  f1-score   support

       anger       0.24      0.29      0.26       312
     disgust       0.06      0.35      0.10        65
        fear       0.02      0.12      0.04        57
         joy       0.24      0.17      0.20       422
     neutral       0.58      0.22      0.32      1152
     sadness       0.11      0.28      0.15       160
    surprise       0.21      0.17      0.19       329

    accuracy                           0.22      2497
   macro avg       0.21      0.23      0.18      2497
weighted avg       0.37      0.22      0.25      2497



## Logistic with Grid Search

In [0]:
from sklearn.model_selection import GridSearchCV

In [38]:
param_grid = {'penalty' : ['l1', 'l2'],
    'class_weight': ['balanced', None],
    'C' : [0.5, 0.8, 0.9, 1, 1.2, 1.5, 2, 5],
    'solver' : ['liblinear']}

# Create grid search object
logit = LogisticRegression(random_state=1)
log_clf = GridSearchCV(logit, param_grid = param_grid, cv = 5, verbose=2, n_jobs=-1)

# Fit on data
grid_cv =log_clf.fit(x_mix_train, y_txt_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed: 69.4min finished


In [39]:
pd.DataFrame(grid_cv.cv_results_).sort_values(['rank_test_score']).head(6)
# best score may lie between C=[0.1, 10], l1 or l2, need deeper search

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_penalty,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,19.633643,2.581675,0.007324,0.000106,0.5,,l1,liblinear,"{'C': 0.5, 'class_weight': None, 'penalty': 'l...",0.521681,0.514019,0.521362,0.52737,0.497997,0.516486,0.01017,1
3,50.701747,3.987026,0.007521,0.000261,0.5,,l2,liblinear,"{'C': 0.5, 'class_weight': None, 'penalty': 'l...",0.521681,0.512016,0.523364,0.523364,0.49733,0.515551,0.010042,2
7,58.526701,2.942081,0.00881,0.001016,0.8,,l2,liblinear,"{'C': 0.8, 'class_weight': None, 'penalty': 'l...",0.520347,0.503338,0.526035,0.518692,0.497997,0.513282,0.010717,3
6,26.623937,5.539791,0.007419,0.000132,0.8,,l1,liblinear,"{'C': 0.8, 'class_weight': None, 'penalty': 'l...",0.514343,0.507343,0.5247,0.519359,0.490654,0.51128,0.011795,4
11,59.130403,3.893428,0.00767,0.000191,0.9,,l2,liblinear,"{'C': 0.9, 'class_weight': None, 'penalty': 'l...",0.517678,0.49733,0.5247,0.519359,0.49733,0.511279,0.011623,5
10,30.495706,2.204232,0.007302,0.000181,0.9,,l1,liblinear,"{'C': 0.9, 'class_weight': None, 'penalty': 'l...",0.512342,0.504673,0.526702,0.519359,0.491989,0.511013,0.011997,6


In [0]:
best_logit = LogisticRegression(random_state=1, **grid_cv.best_params_)
best_logit.fit(x_mix_train, y_txt_train)
best_logit_unwgt = best_logit.predict(x_mix_test)

In [41]:
print(classification_report(y_txt_test, best_logit_unwgt))

              precision    recall  f1-score   support

       anger       0.36      0.14      0.20       312
     disgust       0.50      0.02      0.03        65
        fear       0.17      0.02      0.03        57
         joy       0.46      0.21      0.29       422
     neutral       0.52      0.92      0.66      1152
     sadness       0.32      0.07      0.12       160
    surprise       0.65      0.19      0.29       329

    accuracy                           0.51      2497
   macro avg       0.43      0.22      0.23      2497
weighted avg       0.49      0.51      0.43      2497



In [0]:
model2=MLPClassifier(alpha=0.01, batch_size=256, epsilon=1e-08, hidden_layer_sizes=(300,), learning_rate='adaptive', max_iter=500)
model2.fit(x_mix_train, y_txt_train)
y_pred=model2.predict(x_mix_test)


In [45]:
print(classification_report(y_txt_test, y_pred))

              precision    recall  f1-score   support

       anger       0.25      0.30      0.27       312
     disgust       0.07      0.03      0.04        65
        fear       0.10      0.09      0.09        57
         joy       0.32      0.23      0.27       422
     neutral       0.60      0.54      0.57      1152
     sadness       0.17      0.11      0.13       160
    surprise       0.23      0.40      0.30       329

    accuracy                           0.39      2497
   macro avg       0.25      0.24      0.24      2497
weighted avg       0.41      0.39      0.39      2497



In [0]:
## using logistic on only audio data
log_clf_wgt2 = LogisticRegression(class_weight='balanced').fit(x_train, y_train)
y_pred_log2 = log_clf_wgt2.predict(x_test)
print(classification_report(y_test, y_pred_log2))

In [0]:
sample_rate = 44100
this_audio = train_dir + "dia39_utt16.wav"
X, sr = librosa.load(this_audio, sr=None, mono=True)

In [0]:
stft=np.abs(librosa.stft(X))
stft.shape

In [0]:
result=np.array([])
mfccs=np.mean(librosa.feature.mfcc(y=X, sr=sr, n_mfcc=40).T, axis=0)
result=np.hstack((result, mfccs))

In [0]:
result