# Tabular + Text Classification Demo

In [2]:
# Setup logging and imports

# `utils` is a local helper module that is not shown in this notebook.
# NOTE(review): get_tunables and pprint are not used in the cells visible here;
# they may be used further down — confirm before removing.
from utils import get_tunables, pprint, setup

# Deliberately called before the mlblocks / mlprimitives imports below —
# presumably so the logging configuration is in place when those packages
# are imported. TODO confirm against utils.setup.
setup()

from mlblocks import MLPipeline
from mlprimitives.datasets import load_personae

In [3]:
# Fully-qualified primitive names, bound once so the list and the
# hyperparameter dict below cannot drift apart through a typo.
text_cleaner = 'mlprimitives.custom.text.TextCleaner'
string_vectorizer = 'mlprimitives.custom.feature_extraction.StringVectorizer'
random_forest = 'sklearn.ensemble.RandomForestClassifier'

# Listed in the order they are handed to MLPipeline.
primitives = [text_cleaner, string_vectorizer, random_forest]

# Per-primitive settings, keyed by primitive name. The vectorizer gets no
# overrides here.
# NOTE(review): 'nl' is presumably the language code for Dutch (the dataset
# description mentions Dutch texts) — confirm the format TextCleaner expects.
# NOTE(review): no random_state is given for the forest, so results may vary
# between runs unless a seed is set elsewhere.
hyperparameters = {
    text_cleaner: dict(column='text', language='nl'),
    random_forest: dict(n_jobs=-1, n_estimators=100),
}

In [4]:
# Load the Personae dataset object. Its describe(), data, target,
# get_splits() and score() members are used in the cells that follow.
dataset = load_personae()

In [5]:
# Show the dataset's own description text.
dataset.describe()

Personae Dataset.

    The data of this dataset is a 2d numpy array vector containing 145 entries
    that include texts written by Dutch users in Twitter, with some additional
    information about the author, and the target is a 1d numpy binary integer
    array indicating whether the author was extrovert or not.
    


In [6]:
# Preview the first rows of the feature table; per the recorded output it
# holds author/text metadata columns plus the free-text `text` column.
dataset.data.head()

Unnamed: 0,text_language,author_gender,author_language,author_region,text
0,Dutch,male,Dutch,A,' Artificial Life ' : documentaire over kunstm...
1,Dutch,female,Dutch,A,Kunstmatig leven Men start de film met de vraa...
2,Dutch,male,Dutch,A,De film over artificiële intelligentie die we ...
3,Dutch,female,Dutch,B,' Artificial Life ' is de titel van een docume...
4,Dutch,female,Dutch,A,Tijdens het college van 14 november bekeken we...


In [7]:
# Display the target array (0/1 labels in the recorded output).
dataset.target

array([0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0])

In [8]:
# Build the pipeline from the `primitives` list, passing the per-primitive
# settings in `hyperparameters` as init_params. Both names must already be
# defined by an earlier cell.
pipeline = MLPipeline(primitives, init_params=hyperparameters)

In [9]:
# Split the dataset into train and test partitions.
# The literal 1 is presumably the number of splits requested — TODO confirm
# against the Dataset.get_splits signature.
# NOTE(review): no random seed is set in the visible cells; if the split is
# shuffled, the rows and score recorded below may not reproduce on re-run.
X_train, X_test, y_train, y_test = dataset.get_splits(1)

In [10]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,text_language,author_gender,author_language,author_region,text
47,Dutch,female,Dutch,A,The Artificial Life In deze ietwat verwarrende...
54,Dutch,female,Dutch,A,Kunstmatig leven ( Engels : artificial life ) ...
112,Dutch,male,Dutch,A,Wat zijn de fundamenten van leven en intellige...
79,Dutch,female,Dutch,A,Artificieel leven Er is al veel over geschreve...
86,Dutch,female,Dutch,OV,De film ging over het onderzoek naar automatic...


In [11]:
# Fit the pipeline on the training partition.
pipeline.fit(X_train, y_train)

In [12]:
# Predict on the held-out test features.
predictions = pipeline.predict(X_test)

In [13]:
# Score the predictions with the dataset's own score method (the metric it
# uses is not visible in this notebook).
# NOTE(review): the recorded result is below 0.5 on a binary target; if the
# metric is accuracy this is roughly chance level — worth investigating
# before presenting it as a demo result.
dataset.score(y_test, predictions)

0.4864864864864865