In [2]:
import os

import numpy as np
import pandas as pd
from dawg import IntDAWG
from scipy import sparse

In [7]:
# Load the training table (learn.csv) and the table to be predicted (exam.csv)
# from the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('learn.csv', 'exam.csv'))

In [8]:
# Show the (rows, columns) shape of the training frame.
train.shape

(718608, 5)

In [9]:
# Show the (rows, columns) shape of the exam frame.
test.shape

(250000, 4)

In [11]:
# Alphabets used to generate the trigram vocabulary: upper-case Cyrillic and
# Latin letters, each extended with apostrophe, dot and hyphen.
accepted_chars_rus = "АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'.-"
accepted_chars_lat = "ABCDEFGHIJKLMNOPQRSTUVWXYZ'.-"

# The 25 most frequent COUNTRYISO values of the training set, prefixed with 'iso_'.
top_iso = ['iso_' + code for code in list(train['COUNTRYISO'].value_counts().keys())[:25]]

# Every 3-character combination within one alphabet (Latin first, then Cyrillic),
# followed by the country features. A feature's position in this list is its
# column index.
# NOTE(review): "'", "." and "-" belong to both alphabets, so trigrams made only
# of those characters are listed twice; which index the DAWG keeps for them is
# not visible here -- confirm.
features = [
    first + second + third
    for alphabet in (accepted_chars_lat, accepted_chars_rus)
    for first in alphabet
    for second in alphabet
    for third in alphabet
]
features += top_iso

# (feature, column index) pairs and the lookup structure built from them.
features_pairs = [(name, position) for position, name in enumerate(features)]
features_dawg = IntDAWG(features_pairs)

# Single characters of both alphabets followed by the country features.
feature_list = [*accepted_chars_rus, *accepted_chars_lat, *top_iso]


def trigrams(a):
    """Return an iterator over the consecutive, overlapping 3-item tuples of `a`."""
    return zip(a, a[1:], a[2:])

In [12]:
def construct_datasets(row):
    """Encode one record as a 1 x len(features) sparse indicator vector.

    A column is set to 1.0 when the matching character trigram occurs at least
    once in ``row['FULLNAME']``, or when ``'iso_' + row['COUNTRYISO']`` is a
    known feature. Trigrams and country codes that are not keys of
    ``features_dawg`` are ignored.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series passed by ``DataFrame.apply(axis=1)``)
        Must provide the keys 'FULLNAME' and 'COUNTRYISO'. A value that is not
        a string (for example NaN for an empty CSV cell) contributes no
        features instead of raising TypeError.

    Returns
    -------
    scipy.sparse.csr_matrix
        float64 matrix of shape ``(1, len(features))``.
    """
    # A set removes repeated trigrams, so every active column is stored once.
    feat_ids = set()

    name = row['FULLNAME']
    if isinstance(name, str):
        for trigram in trigrams(name):
            trigram = ''.join(trigram)
            if trigram in features_dawg:
                feat_ids.add(features_dawg[trigram])

    country = row['COUNTRYISO']
    if isinstance(country, str):
        iso = 'iso_' + country
        if iso in features_dawg:
            feat_ids.add(features_dawg[iso])

    # Column indices in ascending order; explicit dtypes keep the empty case
    # well-defined (an empty Python list would otherwise become a float array).
    indices = np.array(sorted(feat_ids), dtype=np.int64)
    data = np.ones(len(indices), dtype=np.float64)
    indptr = np.array([0, len(indices)])
    return sparse.csr_matrix((data, indices, indptr), shape=(1, len(features)))

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with default settings, applied to the FULLNAME column.
vectorizer = TfidfVectorizer()

# The vectorizer is fitted on the training names only and then reused for the exam names.
# NOTE: `Y` holds the exam-set TF-IDF features, not labels.
X = vectorizer.fit_transform(train['FULLNAME'])
Y = vectorizer.transform(test['FULLNAME'])

In [14]:
%%time
# Encode every record with construct_datasets (one call per row) and keep the
# resulting 1-row sparse matrices in a new 'sparse_feats' column of each frame.
train['sparse_feats'] = train.apply(construct_datasets, axis=1)
test['sparse_feats'] = test.apply(construct_datasets, axis=1)

CPU times: user 5min 34s, sys: 1.39 s, total: 5min 35s
Wall time: 5min 35s


In [15]:
%%time
# Stack the per-row sparse vectors into one matrix per dataset; the training
# targets are taken from the 'label' column.
X_train = sparse.vstack(train['sparse_feats'].values)
y_train = train['label'].values
X_test = sparse.vstack(test['sparse_feats'].values)

CPU times: user 17.2 s, sys: 144 ms, total: 17.3 s
Wall time: 17.3 s


In [16]:
from scipy.sparse import csr_matrix, hstack

# Final design matrices in CSR format: the trigram/country indicator columns
# followed by the TF-IDF columns.
s_train = csr_matrix(hstack([X_train, X]))
s_test = csr_matrix(hstack([X_test, Y]))

In [20]:
%%time
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(C=100, solver = 'lbfgs', multi_class = 'multinomial', max_iter = 1000, n_jobs = -1, random_state = 322).fit(s_train, y_train)

CPU times: user 1.51 s, sys: 417 ms, total: 1.93 s
Wall time: 16min 22s


In [21]:
from sklearn.metrics import recall_score, precision_score, f1_score,  classification_report

# Per-class precision / recall / F1 of the fitted model.
# NOTE: predictions are made on s_train, the same data the model was fitted on,
# so these scores describe the fit to the training set, not held-out performance.
print('F1:',classification_report(y_train, model.predict(s_train)))
# Disabled single-metric variants. They reference X_train, whereas the model was
# fitted on the stacked s_train, so they would need updating before being re-enabled.
#print('Recall:', recall_score(y_train, model.predict(X_train)))
#print('Precision:', precision_score(y_train, model.predict(X_train),average = 'macro'))

F1:              precision    recall  f1-score   support

          0       1.00      1.00      1.00    579823
          1       1.00      1.00      1.00    109430
          2       1.00      1.00      1.00     29355

avg / total       1.00      1.00      1.00    718608



In [24]:
# Predicted class labels for the exam set. Despite its name, `y_test` holds
# model predictions, not ground-truth labels.
y_test = model.predict(s_test)

In [25]:
# Sum of the predicted labels. With the classes 0/1/2 shown in the training
# report this equals count(1) + 2 * count(2), so it is only a rough check that
# non-zero classes are being predicted.
sum(y_test)

56750

In [26]:
# Write the submission file: the header line 'lol' followed by one predicted
# label per line (no trailing newline after the last label).
submission_path = 'predictions/big_data_submission.csv'
# Create the output directory first, so a fresh checkout does not fail with
# FileNotFoundError after the long training step.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
with open(submission_path, 'w') as f:
    f.write('lol'+'\n')
    f.write('\n'.join(map(str, y_test)))

In [27]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in 0.23.
# Prefer the standalone joblib package and fall back to the vendored copy on
# older installations that do not ship it separately.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
from sklearn import model_selection

# Persist the fitted model to disk; joblib.dump returns the list of written files.
filename = 'finalized_model.sav'
joblib.dump(model, filename)

['finalized_model.sav']