In [1]:
# Re-import edited modules automatically before each cell executes, so changes
# to the local `dialogue_classification` package are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline

import sys

import lightgbm as lgb
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score

sys.path.append('DialogueActClassification')

from dialogue_classification.dataset_utils import load_swda_corpus_data
from dialogue_classification.train_test_split import train_set_idx, valid_set_idx, test_set_idx

## Prepare data

In [3]:
# Load the SwDA corpus through the project helper. It returns four values;
# later cells use `talks` (iterated as (utterances, labels) pairs) and
# `talk_names` (matched against the predefined split id lists).
# NOTE(review): `tag_indices` / `tag_occurances` are not used further in this
# notebook; presumably a tag -> index mapping and per-tag counts -- confirm in
# dataset_utils.
talks, talk_names, tag_indices, tag_occurances = load_swda_corpus_data('swda/swda/swda')

Loading SwDA Corpus...

Found 43 different utterance tags.

Loaded SwDA Corpus.


In [4]:
# Sanity check: number of loaded conversations and number of distinct tags.
len(talks), len(tag_indices)

(1155, 43)

In [5]:
import re

def preprocess(text):
    """Normalize a single utterance string.

    Steps, in order:
      1. lowercase the text;
      2. delete commas and apostrophes;
      3. surround every '.', '?' and '!' with a space on each side;
      4. collapse each run of whitespace into a single space.

    The result is not stripped, so a single leading or trailing space can
    remain (e.g. "uh-huh." becomes "uh-huh . ").

    Args:
        text (str): raw utterance text.

    Returns:
        str: the normalized utterance.
    """
    text = text.lower()
    text = re.sub(r"[,']+", '', text)
    # Raw-string patterns avoid the invalid-escape warnings that '\.' / '\?' /
    # '\s' trigger in ordinary string literals. One pass handles all three
    # punctuation marks; the replacement never introduces new ones, so this is
    # equivalent to substituting them one after another.
    text = re.sub(r'([.?!])', r' \1 ', text)
    return re.sub(r'\s+', ' ', text)

In [6]:
# Build two parallel lists with one entry per conversation in `talks`:
# the utterances re-joined into plain strings, and the labels as loaded.
texts = []
labels = []
for utterances, tags in talks:
    # Every utterance must come with exactly one label.
    assert len(utterances) == len(tags)
    texts.append([' '.join(tokens) for tokens in utterances])
    labels.append(tags)

In [7]:
# Normalize every utterance while keeping the nested
# conversation -> utterances layout.
texts = [list(map(preprocess, conversation)) for conversation in texts]

In [8]:
# One boolean mask per split, aligned with `talk_names`: True where the
# conversation name appears in that split's predefined id list.
train_idx, valid_idx, test_idx = (
    np.isin(talk_names, split_ids)
    for split_ids in (train_set_idx, valid_set_idx, test_set_idx)
)

In [9]:
# Convert to NumPy arrays so the boolean split masks can index them.
# The per-conversation lists are ragged (the original call emitted a warning
# on this line), so dtype=object is requested explicitly: implicit creation of
# ragged arrays is deprecated and raises ValueError on NumPy >= 1.24.
texts = np.array(texts, dtype=object)
labels = np.array(labels, dtype=object)

# Each resulting element is still a per-conversation list; flattening to
# utterance level happens in a later cell.
train_texts, train_labels = texts[train_idx], labels[train_idx]
valid_texts, valid_labels = texts[valid_idx], labels[valid_idx]
test_texts, test_labels = texts[test_idx], labels[test_idx]

  texts, labels = np.array(texts), np.array(labels)


In [10]:
# Number of conversations in each split (train, valid, test).
len(train_texts), len(valid_texts), len(test_texts)

(1003, 112, 19)

In [11]:
# flatten
train_texts = sum(train_texts.tolist(), [])
valid_texts = sum(valid_texts.tolist(), [])
test_texts = sum(test_texts.tolist(), [])

train_labels = np.array(sum(train_labels.tolist(), []))
valid_labels = np.array(sum(valid_labels.tolist(), []))
test_labels = np.array(sum(test_labels.tolist(), []))

In [12]:
# Peek at the first ten preprocessed training utterances.
train_texts[:10]

['so ive been concerned about crime lately . ',
 'uh-huh . ',
 'uh its really scary to listen to the news every night and --',
 'uh-huh . ',
 '-- to hear about all the problems . ',
 'i wondered if you were taking any special precautions in your neighborhood ? ',
 'well i i think we have a neighborhood watch <laughter> . ',
 'uh-huh . ',
 'i think . ',
 '<laughter> . ']

In [13]:
# Labels aligned with the ten utterances shown above (integer tag ids).
train_labels[:10]

array([38, 11, 39, 11,  1, 37, 25, 11, 38, 42])

## TF-IDF features

In [14]:
# TF-IDF over word 1- to 3-grams. Terms occurring in more than half of the
# utterances (max_df) or in fewer than 50 of them (min_df) are discarded, and
# the vocabulary is capped at 1000 features.
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_df=0.5, min_df=50, max_features=1000)

# Learn the vocabulary / IDF weights from the training split only, and
# vectorize it in the same pass (fit_transform is equivalent to fit followed
# by transform, without analysing the training corpus twice).
X_train = vectorizer.fit_transform(train_texts)
X_valid = vectorizer.transform(valid_texts)
X_test = vectorizer.transform(test_texts)

In [15]:
# (number of training utterances, number of TF-IDF features)
X_train.shape

(193325, 1000)

## Train & Eval

In [16]:
# Gradient-boosted multiclass classifier on the TF-IDF features.
# n_estimators is only an upper bound: early stopping on the validation split
# decides the actual number of boosting rounds.
model = LGBMClassifier(
    n_estimators=1000,
    objective='multiclass',
    n_jobs=-1,
    random_state=42,
    learning_rate=0.02,
)

# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds=` / `verbose=` arguments of fit() were deprecated in
# LightGBM 3.3 and removed in 4.0, where they raise TypeError.
# Requires `import lightgbm as lgb` from the imports cell.
model.fit(
    X_train,
    train_labels,
    eval_metric='multi_logloss',
    eval_set=[(X_valid, valid_labels)],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),  # stop after 50 rounds without improvement
        lgb.log_evaluation(period=50),           # print the validation loss every 50 rounds
    ],
)



[50]	valid_0's multi_logloss: 1.35092
[100]	valid_0's multi_logloss: 1.21192
[150]	valid_0's multi_logloss: 1.16525
[200]	valid_0's multi_logloss: 1.14457
[250]	valid_0's multi_logloss: 1.13271
[300]	valid_0's multi_logloss: 1.12595
[350]	valid_0's multi_logloss: 1.12164
[400]	valid_0's multi_logloss: 1.11919
[450]	valid_0's multi_logloss: 1.11772
[500]	valid_0's multi_logloss: 1.11716
[550]	valid_0's multi_logloss: 1.11734


In [18]:
# Micro-averaged F1 of the trained model on the held-out test split.
# NOTE(review): with exactly one label per utterance, micro-F1 should coincide
# with plain accuracy; consider also reporting macro-F1 if per-class
# performance matters.
f1_score(test_labels, model.predict(X_test), average='micro')

0.6389011962782455