# CFT2018 contest

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack, csr_matrix

import gc
import os
import pickle

In [2]:
# Directory that holds the contest CSV files and receives the pickled outputs.
PATH_TO_DATA = 'D:/Py/DataFrames/CFT_Contest(Datasouls)/'

In [3]:
# Read the train and test tables from the data directory.
train_csv_path = os.path.join(PATH_TO_DATA, 'train.csv')
test_csv_path = os.path.join(PATH_TO_DATA, 'test.csv')
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

# Keep the target column as a plain array next to the frame.
y_train = train['target'].values
print(train.shape)

(1991104, 5)


---

In [4]:
# Preview the first three rows of the training frame.
train.head(3)

Unnamed: 0,id,fullname,country,target,fullname_true
0,0,AKHMEDOV YGURIY,РОССИЯ,1,AKHMEDOV YURIY
1,1,ФОЗИЛОВ РАМИЛЬ ГУЛЛОВИЧ,РОССИЯ,1,ФОЗИЛОВ РАМИЛЬ ГУЛОВИЧ
2,2,ГОИБОВ АХЛИДДИН ШАМСУДИНОВИЧ,РОССИЯ,0,


In [5]:
# Preview the first three rows of the test frame.
test.head(3)

Unnamed: 0,id,fullname,country
0,0,ХУДАШКУРОВА ГУЛЗХОДА БЕРДИЕВНА,УЗБЕКИСТАН
1,1,СВЕЖЕТЬФЛОГИСТОН АРСЕН,РОССИЯ
2,2,ГУЛОМОВА СОЖИДА САНАЕВНА,УЗБЕКИСТАН


In [10]:
# Distinct country values per split, computed once. The previous version
# called `.unique()` inside the comprehension condition, i.e. once per
# candidate value, and then did a linear membership scan on the result.
train_countries = train['country'].unique()
test_countries = test['country'].unique()

# Sets give constant-time membership checks.
# NOTE(review): a missing value (NaN) present in both splits is matched by
# object identity in a set, whereas the array-based `in` never matched it;
# confirm whether `country` has missing values if exact counts matter.
train_country_set = set(train_countries)
test_country_set = set(test_countries)

# Values that occur in exactly one of the two splits, in order of first appearance.
only_countries_in_train = [v for v in train_countries if v not in test_country_set]
only_countries_in_test = [v for v in test_countries if v not in train_country_set]

In [15]:
# Report, per split, the number of distinct country values and how many of
# them do not occur in the other split.
country_stats = (
    ('All countires in train:', len(train['country'].unique())),
    ('Unique countires in train:', len(only_countries_in_train)),
    ('All countires in test:', len(test['country'].unique())),
    ('Unique countires in test:', len(only_countries_in_test)),
)
for stat_label, stat_value in country_stats:
    print(stat_label, stat_value)

All countires in train: 1281
Unique countires in train: 506
All countires in test: 1644
Unique countires in test: 869


### Feature engineering

In [4]:
# Empty containers for the hand-crafted features; columns are added in later cells.
train_features, test_features = pd.DataFrame(), pd.DataFrame()

In [5]:
# Upper-case alphabets recognised by get_alphabet (lower-case letters are
# deliberately not included, matching the original behaviour).
_RU_UPPERCASE = frozenset('АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ')
_EN_UPPERCASE = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
_KNOWN_LETTERS = _RU_UPPERCASE | _EN_UPPERCASE


def get_alphabet(FIO, language='ru'):
    """Describe which kinds of characters occur in a string.

    Parameters
    ----------
    FIO : str
        Text to inspect. Any non-string value (e.g. ``None`` or a float NaN
        coming from a missing cell) yields 0 instead of raising ``TypeError``.
    language : str, default 'ru'
        * ``'ru'``      -- return 1 if at least one upper-case Cyrillic letter
          is present, else 0.
        * ``'en'``      -- return 1 if at least one upper-case Latin letter
          is present, else 0.
        * ``'symbols'`` -- return the NUMBER of characters that are neither an
          upper-case Cyrillic/Latin letter nor a space (digits, punctuation
          and lower-case letters all count).
        Any other value returns 0.

    Returns
    -------
    int
        A 0/1 flag for ``'ru'`` / ``'en'``, a count for ``'symbols'``.
    """
    # Missing values reach this function as non-strings and cannot be iterated.
    if not isinstance(FIO, str):
        return 0

    if language == 'ru':
        return int(any(letter in _RU_UPPERCASE for letter in FIO))

    if language == 'en':
        return int(any(letter in _EN_UPPERCASE for letter in FIO))

    if language == 'symbols':
        return sum(
            1 for letter in FIO
            if letter != ' ' and letter not in _KNOWN_LETTERS
        )

    # Unknown mode: keep the original "nothing found" result.
    return 0

In [6]:
def _add_length_features(source, features):
    """Add length / word-count columns derived from `fullname` and `country`.

    `source` is the raw frame providing the `fullname` and `country` columns;
    `features` is the frame that receives the new columns (modified in place).

    Every cell is coerced with `str` before measuring, so the word counts
    handle non-string cells the same way the length columns already did
    (previously `x.split()` raised AttributeError on such cells).
    """
    features['fullname_len'] = source['fullname'].apply(lambda x: len(str(x)))
    features['fullname_num_words'] = source['fullname'].apply(lambda x: len(str(x).split()))
    features['country_len'] = source['country'].apply(lambda x: len(str(x)))
    features['country_num_words'] = source['country'].apply(lambda x: len(str(x).split()))
    # Ratio of word count to character count of the full name.
    features['fullname_num_words_vs_len'] = features['fullname_num_words'] / features['fullname_len']


# Same feature set for both splits.
_add_length_features(train, train_features)
_add_length_features(test, test_features)

In [7]:
# (feature column, source column, get_alphabet mode), in the order in which
# the columns are added to the feature frames.
alphabet_feature_specs = (
    ('ru_name', 'fullname', 'ru'),
    ('ru_country', 'country', 'ru'),
    ('en_name', 'fullname', 'en'),
    ('en_country', 'country', 'en'),
    ('symbols_in_name', 'fullname', 'symbols'),
)

# Apply the same specs to the train split first, then to the test split.
for raw_frame, feature_frame in ((train, train_features), (test, test_features)):
    for feature_name, source_column, mode in alphabet_feature_specs:
        # `mode=mode` binds the current mode to the lambda at definition time.
        feature_frame[feature_name] = raw_frame[source_column].apply(
            lambda x, mode=mode: get_alphabet(x, language=mode)
        )

In [9]:
# Preview the first three rows of the engineered training features.
train_features.head(3)

Unnamed: 0,fullname_len,fullname_num_words,country_len,country_num_words,fullname_num_words_vs_len,ru_name,ru_country,en_name,en_country,symbols_in_name
0,15,2,6,1,0.133333,0,1,1,0,0
1,23,3,6,1,0.130435,1,1,0,0,0
2,28,3,6,1,0.107143,1,1,0,0,0


In [10]:
# Preview the first three rows of the engineered test features.
test_features.head(3)

Unnamed: 0,fullname_len,fullname_num_words,country_len,country_num_words,fullname_num_words_vs_len,ru_name,ru_country,en_name,en_country,symbols_in_name
0,30,3,10,1,0.1,1,1,0,0,0
1,22,2,6,1,0.090909,1,1,0,0,0
2,24,3,10,1,0.125,1,1,0,0,0


In [4]:
%%time
# Word-level (unigram) TF-IDF over the full name, vocabulary capped at 100000 terms.
vectorizer_word = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    max_features=100000,
)

# Fit on the training names only, then reuse the fitted vocabulary for the test names.
train_word = vectorizer_word.fit_transform(train['fullname'])
test_word = vectorizer_word.transform(test['fullname'])

print(train_word.shape)

(1991104, 80000)
Wall time: 1min 22s


In [5]:
%%time
# Character-level TF-IDF over the full name: one vectorizer for trigrams,
# one for bigrams.
vectorizer_char = TfidfVectorizer(analyzer='char', ngram_range=(3, 3))
vectorizer_char_2 = TfidfVectorizer(analyzer='char', ngram_range=(2, 2))

# Trigrams: fit on train, transform test with the fitted vocabulary.
train_char = vectorizer_char.fit_transform(train['fullname'])
test_char = vectorizer_char.transform(test['fullname'])

# Bigrams: same procedure.
train_char_2 = vectorizer_char_2.fit_transform(train['fullname'])
test_char_2 = vectorizer_char_2.transform(test['fullname'])

for ngram_label, ngram_matrix in (("Триграммы:", train_char), ("Биграммы:", train_char_2)):
    print(ngram_label, ngram_matrix.shape)

Триграммы: (1991104, 55998)
Биграммы: (1991104, 2342)
Wall time: 7min 50s


In [6]:
%%time
# Character-trigram TF-IDF over the country column.
# A dedicated vectorizer is used here: refitting `vectorizer_char` on the
# country column would overwrite the vocabulary it learned from `fullname`,
# making that fitted vectorizer unusable for transforming names afterwards.
vectorizer_country_char = TfidfVectorizer(analyzer='char', ngram_range=(3, 3))

# Fit on the training countries only, then transform the test countries.
train_country_char = vectorizer_country_char.fit_transform(train.country)
test_country_char = vectorizer_country_char.transform(test.country)

print("Триграммы:", train_country_char.shape)

Триграммы: (1991104, 3074)
Wall time: 1min 7s


In [10]:
# Concatenate all training TF-IDF blocks column-wise and store the result as CSR.
train_blocks = [train_word, train_char, train_char_2, train_country_char]
s_train = csr_matrix(hstack(train_blocks))

# Drop every reference to the individual blocks so their memory can be reclaimed.
del train_blocks
del train_word, train_char, train_char_2, train_country_char
gc.collect()

0

In [11]:
# Concatenate all test TF-IDF blocks column-wise and store the result as CSR.
test_blocks = [test_word, test_char, test_char_2, test_country_char]
s_test = csr_matrix(hstack(test_blocks))

# Drop every reference to the individual blocks so their memory can be reclaimed.
del test_blocks
del test_word, test_char, test_char_2, test_country_char
gc.collect()

0

### Save

In [10]:
# Scale the hand-crafted features without centering (with_mean=False),
# fitting on train only, and convert the scaled results to CSR matrices.
# Note: `train_features` / `test_features` are rebound from DataFrames to
# sparse matrices here.
scaler = StandardScaler(with_mean=False)
train_features = csr_matrix(scaler.fit_transform(train_features))
test_features = csr_matrix(scaler.transform(test_features))

# Marker printed once the cell has finished.
print(1)

1


In [11]:
# [DUMP] stat features
# Pickle the scaled feature matrices (pickle protocol 2) into the data directory.
stat_dumps = (
    ('train_features.pkl', train_features),
    ('test_features.pkl', test_features),
)
for dump_name, dump_obj in stat_dumps:
    with open(os.path.join(PATH_TO_DATA, dump_name), 'wb') as dump_file:
        pickle.dump(dump_obj, dump_file, protocol=2)

In [12]:
# [DUMP] TF-IDF pickle files
# Pickle the stacked TF-IDF matrices (pickle protocol 2) into the data directory.
tfidf_dumps = (
    ('train_tfidf.pkl', s_train),
    ('test_tfidf.pkl', s_test),
)
for dump_name, dump_obj in tfidf_dumps:
    with open(os.path.join(PATH_TO_DATA, dump_name), 'wb') as dump_file:
        pickle.dump(dump_obj, dump_file, protocol=2)