In [1]:
import numpy as np
import pandas as pd
import collections

In [2]:
data = pd.read_csv('linear_train.txt', header=None)

В качестве признаков возьмём последоватьельности букв в слове

In [7]:
def build_chargrams(word, gram_range):
    grams = []
    for l in range(gram_range[0], gram_range[1]):
        for i in range(len(word)-l+1):
            grams.append(word[i:i+l])
    return grams

In [8]:
grams = [build_chargrams(word.decode('utf-8'), (1, 3)) for word in data[0].values + '$']

In [9]:
all_grams = []
for i in range(len(grams)):
    all_grams += grams[i]

In [10]:
vocab_size = 1000

In [11]:
def build_dataset(words, n_words):
    """Process raw inputs into a dataset."""
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
        unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

In [12]:
dataset, count, dictionary, reverse_dictionary = build_dataset(all_grams, vocab_size)

In [13]:
def word2gramvec(gram, dictionary):
    vec_length = len(dictionary.keys())
    vec = np.zeros(vec_length)
    for lex in gram:
        if lex in dictionary.keys():
            vec[dictionary[lex]]+=1
        else:
            vec[dictionary['UNK']]+=1
    return vec

In [18]:
dataset = np.array([word2gramvec(gram, dictionary) for gram in grams])

In [19]:
upper = [1 if val.decode('utf-8')[0].isupper() else -1 for val in data[0].values]

In [20]:
X = np.hstack((dataset, np.reshape(upper, (len(dataset), 1))))
y = data[1]

In [21]:
from sklearn.cross_validation import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

clf = LogisticRegression(C=0.2)
cross_val_score(clf, X, y, scoring='roc_auc', cv=6, n_jobs=-1)



array([ 0.8039781 ,  0.65020047,  0.61742965,  0.67796756,  0.78553644,
        0.84539512])

In [16]:
cross_val_score(clf, X, y, scoring='roc_auc', cv=6, n_jobs=-1)

array([ 0.82687295,  0.69422441,  0.66193649,  0.71204365,  0.80348999,
        0.85914938])

Linear Model

In [17]:
clf.fit(X, y)

LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [25]:
def process_test_set():
    test_data = pd.read_csv('linear_test.txt', header=None)
    grams = [build_chargrams(word.decode('utf-8'), (1, 4)) for word in test_data[0].values + '$']
    dataset = np.array([word2gramvec(gram, dictionary) for gram in grams])
    print(grams)
    dataset = np.hstack((dataset,
                         np.reshape([1 if word.decode('utf-8')[0].isupper() else -1 for word in test_data[0].values], 
                                    (len(test_data), 1))))
    return dataset

In [19]:
X_test_set = process_test_set()

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [None]:
del X_train

In [None]:
del X_test

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

clf = LogisticRegression(C=0.5)
clf.fit(X, y)

LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [32]:
del data

In [33]:
del X
del y

In [20]:
y_pred = clf.predict_proba(X_test_set)[:, 1]

In [36]:
with open("resul1.txt", "w") as f:
    f.write("Id,Answer\n")
    for i in range(len(y_pred)):
        f.write('{},{}\n'.format(i, str(float(y_pred[i]))))