# Baseline

This notebook implements a baseline: TF-IDF features are extracted from the text and then used to train one binary logistic-regression classifier per category, so each business can be assigned to several categories.

In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from scipy.sparse import hstack

Load train and test sets

In [2]:
# Fixed list of business categories; one binary classifier is trained per entry.
labels = [
    'Active Life', 'Arts & Entertainment', 'Automotive', 'Beauty & Spas',
    'Education', 'Event Planning & Services', 'Financial Services', 'Food',
    'Health & Medical', 'Home Services', 'Hotels & Travel', 'Local Flavor',
    'Local Services', 'Mass Media', 'Nightlife', 'Pets', 'Professional Services',
    'Public Services & Government', 'Real Estate', 'Religious Organizations',
    'Restaurants', 'Shopping'
]

# na_filter=False keeps empty fields as empty strings instead of NaN, so the
# text and category columns stay plain strings.
train_df = pd.read_csv('data/train.csv', na_filter=False)
test_df = pd.read_csv('data/test.csv', na_filter=False)

train_texts = train_df['sequence']
test_texts = test_df['sequence']

# Multi-hot encode the ', '-separated category strings, then align the columns
# to `labels`. get_dummies only creates columns for categories that occur in
# the given split, so without the reindex a category missing from one split
# would raise a KeyError on lookup. After it, both frames have the same
# columns in the same order and a missing category is an all-zero column.
train_labels = (
    train_df['categories']
    .str.get_dummies(sep=', ')
    .reindex(columns=labels, fill_value=0)
)
test_labels = (
    test_df['categories']
    .str.get_dummies(sep=', ')
    .reindex(columns=labels, fill_value=0)
)

Feature extraction at both the character and the word level

In [3]:
# Word-level TF-IDF over whitespace-separated tokens (unigrams only).
# str.split behaves like `lambda x: x.split()` but keeps the fitted vectorizer
# picklable. token_pattern=None tells scikit-learn the default regex is
# intentionally unused, which avoids its "token_pattern will not be used"
# warning.
word_vectorizer = TfidfVectorizer(
    tokenizer=str.split,
    token_pattern=None,
    strip_accents='unicode',
    ngram_range=(1, 1),
    sublinear_tf=True,
    max_features=10000)

train_word_features = word_vectorizer.fit_transform(train_texts)
test_word_features = word_vectorizer.transform(test_texts)

# Character-level TF-IDF over 3- and 4-grams. No tokenizer is passed here:
# with analyzer='char' scikit-learn ignores it (and warns), so the extracted
# features are unchanged.
char_vectorizer = TfidfVectorizer(
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(3, 4),
    max_df=0.85,
    sublinear_tf=True,
    max_features=20000)

train_char_features = char_vectorizer.fit_transform(train_texts)
test_char_features = char_vectorizer.transform(test_texts)

# Concatenate char and word features column-wise. CSR is requested up front
# because hstack returns COO by default, which the estimator would otherwise
# have to convert on every fit/predict call.
train_features = hstack([train_char_features, train_word_features], format='csr')
test_features = hstack([test_char_features, test_word_features], format='csr')

Classification via Logistic Regression

In [4]:
# The 'sag' solver shuffles the training samples, so a fixed seed is needed
# for the scores below to be reproducible across runs.
SEED = 42

# test_preds[i, j] holds the 0/1 prediction of label j for test row i.
test_preds = np.zeros((len(test_df), len(labels)))
scores = []

# One independent binary classifier per category (one-vs-rest).
for label_idx, label_name in enumerate(labels):
    train_target = train_labels[label_name]
    test_target = test_labels[label_name]

    clf = LogisticRegression(solver='sag', random_state=SEED)
    clf.fit(train_features, train_target)
    preds = clf.predict(test_features)
    test_preds[:, label_idx] = preds

    # NOTE(review): per-label accuracy can look high for rare categories
    # simply because most rows are negatives -- consider also reporting F1.
    score = accuracy_score(test_target, preds)
    scores.append(score)
    print('Test score for class {} is {:.4f}'.format(label_name, score))

print('Mean test score is {:.4f}'.format(np.mean(scores)))

Test score for class Active Life is 0.9839
Test score for class Arts & Entertainment is 0.9799
Test score for class Automotive is 0.9819
Test score for class Beauty & Spas is 0.9833
Test score for class Education is 0.9878
Test score for class Event Planning & Services is 0.9650
Test score for class Financial Services is 0.9946
Test score for class Food is 0.9294
Test score for class Health & Medical is 0.9801
Test score for class Home Services is 0.9715
Test score for class Hotels & Travel is 0.9878
Test score for class Local Flavor is 0.9922
Test score for class Local Services is 0.9592
Test score for class Mass Media is 0.9983
Test score for class Nightlife is 0.9807
Test score for class Pets is 0.9931
Test score for class Professional Services is 0.9743
Test score for class Public Services & Government is 0.9957
Test score for class Real Estate is 0.9899
Test score for class Religious Organizations is 0.9989
Test score for class Restaurants is 0.9740
Test score for class Shopping i