In [24]:
import pandas as pd
train_data = pd.read_csv ('linear_train.txt', names=['name','is_surname'])

In [25]:
train_data.head(10)

Unnamed: 0,name,is_surname
0,Аалтонен,1
1,Аар,0
2,Аарон,0
3,ААРОН,0
4,Аарона,0
5,Аарона,1
6,Аароне,0
7,Ааронов,0
8,Аахена,0
9,Абабков,1


Разделим наши слова на n-граммы:

In [26]:
def local_bigrmm(string):
    bigramm = []
    for i in range(len(string)-3):
        bigramm.append(string[i:i+4])
    return ' '.join(bigramm)

Применим к нашей выборке:

In [27]:
data_for_train = train_data['name']
data_for_train = data_for_train.apply(local_bigrmm)
target = train_data['is_surname']
print(data_for_train.shape)
print(target.shape)

(101408,)
(101408,)


In [28]:
from sklearn.model_selection import train_test_split
X_train_, X_test_, y_train, y_test = train_test_split(data_for_train, target, test_size=0.3, random_state=42, stratify = target)

In [29]:
X_train_.shape

(70985,)

Выделим фичи:

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(X_train_)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [31]:
X_train = vectorizer.transform(X_train_)
X_test = vectorizer.transform(X_test_)

In [32]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(70985, 39298)
(70985,)
(30423, 39298)
(30423,)


Используем лог. регрессию:

In [33]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(X_train, y_train)
prediction = clf.predict_proba(X_test)

In [34]:
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_test, prediction[:,1]))

0.8701057634556061


Посмотрим качество на XGBClassifier:

In [35]:
from xgboost import XGBClassifier
clf = XGBClassifier()
clf.fit(X_train, y_train)
prediction = clf.predict(X_test)

In [36]:
from sklearn.metrics import accuracy_score
print(roc_auc_score(y_test, prediction))

0.5514448191446935


Посмотрим качество на RandomForestClassifier:

In [37]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
prediction = clf.predict_proba(X_test)

In [40]:
X_test.shape

(30423, 39298)

In [41]:
print(roc_auc_score(y_test, prediction[:,1]))

0.8679051430267486


На KNeighborsClassifier:

In [503]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier()
neigh.fit(X_train, y_train)
prediction = neigh.predict(X_test)

In [None]:
print(roc_auc_score(y_test, prediction))

Тестовая выборка:

In [10]:
import pandas as pd
test_data = pd.read_csv ('linear_test.txt', names=['name','is_surname'])

In [11]:
data_for_test = test_data['name']
data_for_test = data_for_test.apply(local_bigrmm)
target_test = test_data['is_surname']
print(data_for_test.shape)
print(target_test.shape)

(188920,)
(188920,)


In [12]:
data_for_train = train_data['name']
data_for_train = data_for_train.apply(local_bigrmm)
target_train = train_data['is_surname']
print(data_for_train.shape)
print(target_train.shape)

(101408,)
(101408,)


In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(data_for_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [14]:
X_train = vectorizer.transform(data_for_train)
X_test = vectorizer.transform(data_for_test)

In [15]:
print(X_train.shape)
print(target_train.shape)

(101408, 51460)
(101408,)


In [529]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(X_train, target_train)


Используем RandomForestClassifier:

In [16]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(X_train, target_train)
#prediction = clf.predict_proba(X_test)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [17]:
prediction = clf.predict_proba(X_test)

In [19]:
import numpy as np 
indexes_ = np.arange(prediction.shape[0])

In [20]:
print(indexes_.shape)
print(prediction[:,1].shape)

(188920,)
(188920,)


In [21]:
answ = pd.DataFrame(data = indexes_, columns=["Id"])
answ['Answer'] = prediction[:,1]
answ.head(10)

Unnamed: 0,Id,Answer
0,0,0.201658
1,1,0.190878
2,2,0.190878
3,3,0.190878
4,4,0.046121
5,5,0.080833
6,6,0.0425
7,7,0.18
8,8,0.18
9,9,0.122446


In [22]:
answ.to_csv("answer.txt", index =False)