### Load Data

In [1]:
import pandas as pd
import numpy as np
import sqlite3
import re
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, f1_score

from tqdm import tqdm
tqdm.pandas(desc='progress-bar')
from sklearn import utils
from gensim.models import Doc2Vec as d2v
import gensim
from gensim.models.doc2vec import TaggedDocument
import pprint
from imblearn.over_sampling import SMOTE

import pickle
from pathlib import Path

  from pandas import Panel
Using TensorFlow backend.


In [2]:
connection = sqlite3.connect('messages_info.db')
query = 'SELECT * FROM data'

data = pd.read_sql(query, connection)

### Preprocessing

In [3]:
def clean_string(string):
    delete_punctuation = re.compile('[^a-zA-Z0-9]')
    clean_string = re.sub(delete_punctuation, ' ', string)
    return clean_string

In [4]:
def label_messages(column, label_type):
    labeled = []
    for i, cell in enumerate(column):
        label = label_type + '_' + str(i)
        labeled.append(TaggedDocument(cell.split(), [label]))
    return labeled

In [5]:
data.message = data.message.map(clean_string)

X_train, X_test, y_train, y_test = train_test_split(data.message, data.loc[:, 'related':], train_size=0.75,
                                                    random_state=42)

X_train = label_messages(X_train, 'train')
X_test = label_messages(X_test, 'test')

In [6]:
drop_list = []

for col in y_train.columns:
    y_train[col] = pd.to_numeric(y_train[col], errors='raise')
    if y_train[col].sum() == 0:
        drop_list.append(col)

for col in y_test.columns:
    y_test[col] = pd.to_numeric(y_test[col], errors='raise')

y_test.drop(drop_list, inplace=True, axis=1)
y_train.drop(drop_list, inplace=True, axis=1)

In [7]:
# Parameters for Doc2Vec
dm=0
vector_size = 300
negative=5
min_count=1
alpha=0.065
epochs = 30

model_dbow = d2v(dm=dm, vector_size=vector_size, negative=negative, min_count=min_count, alpha=alpha, min_alpha=alpha)
model_dbow.build_vocab([x for x in tqdm (X_train + X_test)])

for epoch in range(epochs):
    model_dbow.train(utils.shuffle([x for x in tqdm(X_train + X_test)]), total_examples=len(X_train + X_test), epochs=1)
    model_dbow.alpha = 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26216/26216 [00:00<00:00, 3286739.61it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26216/26216 [00:00<00:00, 3735870.41it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26216/26216 [00:00<00:00, 4382013.86it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26216/26216 [00:00<00:00, 4357873.88it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26216/26216 [00:00<00:00, 4356837.85it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26216/26216 [00:00<00:00, 4381140.87it/s]
100%|█████

In [8]:
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(corpus_size):
        prefix = vectors_type + '_' + str(i)
        vectors[i] = model.docvecs[prefix]
    return vectors

In [9]:
train_vectors = get_vectors(model_dbow, len(X_train), 300, 'train')
test_vectors = get_vectors(model_dbow, len(X_test), 300, 'test')

In [10]:
train_dict = dict([(col, (train_vectors, y_train[col].values)) for col in y_train.columns])
test_dict = dict([(col, (test_vectors, y_test[col].values)) for col in y_test.columns])

train_dict_oversampled = dict(
    [
        (col,
         SMOTE(sampling_strategy='not majority').fit_resample
         (train_vectors, y_train[col].values)
        ) for col in y_train.columns
    ]
)

### Machine Learning

In [11]:
# Initialize Models
logreg = LogisticRegression(solver='lbfgs', multi_class='auto', n_jobs=10)
svc = SVC(gamma='auto')
ada_boost = AdaBoostClassifier()
grad_boost = GradientBoostingClassifier()

models = {
    'Logistic Regression':logreg,
    'SVC':svc,
    'Adaptive Boosting':ada_boost,
    'Gradient Boosting':grad_boost
}

In [12]:
path = './data/models/'
results = {}

for model in models:
    f1_weighted = [] 
    f1_macro = []
    for category in train_dict_oversampled:
        if Path(path+model+'_'+category).is_file():
            with open(path+model+'_'+category, 'rb'):
                current_model = pickle.load(f)
                y_hat = current_model.predict(test_dict[category][0])
                f1_weighted.append(f1_score(test_dict[category][1], y_hat, average='weighted'))
                f1_macro.append(f1_score(test_dict[category][1], y_hat, average='macro'))
        else:        
            print('Fit category {} for {}'.format(category, model), flush=True, end='\r')
            current_model = models[model]
            current_model.fit(train_dict_oversampled[category][0], train_dict_oversampled[category][1])
            y_hat = current_model.predict(test_dict[category][0])
            f1_weighted.append(f1_score(test_dict[category][0], y_hat, average='weighted'))
            f1_macro.append(f1_score(test_dict[category][0], y_hat, average='macro'))
            with open(path+model+'_'+category, 'wb') as f:
                pickle.dump(current_model, f)
    results[model] = (np.mean(f1_weighted), np.mean(f1_macro))
    
result_df = pd.DataFrame.from_dict(results, orient='index', columns=['Weighted F1-Score', 'Macro F1-Score'])

Fit category related for Logistic Regression

ValueError: Classification metrics can't handle a mix of continuous-multioutput and binary targets