# Загрузка и предобработка данных

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

Загрузим возраста

In [2]:
id_age_train = pd.read_csv('age_profile_train', sep = '\t', header = None)
id_age_train.columns = ['Id', 'age']
id_age_train.head()

Unnamed: 0,Id,age
0,000000013CB5719C0000A2C90002C101,53
1,00000001442BE24000001B7D00F50801,48
2,00000001448580F800003F1B31FB0901,28
3,0000000145BDB2FF000157971645E901,44
4,000000014602771F0000DB9359714C01,48


Построим словарь id->возраст

In [3]:
age_by_id = dict(id_age_train.values)
print age_by_id[id_age_train['Id'].values[0]]

53


Далее загрузим датафреймы с title и url

In [4]:
train_titles = pd.read_csv('title_unify_train', sep = '\t', header = None)
train_titles.columns = ['Id', 'title', 'cnt']
train_titles.head()

Unnamed: 0,Id,title,cnt
0,000000014B6D41C13D777E8314725401,коляна лента прикол,1
1,0000000150707ACB8A82451C0307BC01,candi410 rambler ru входящая рамблер-почта,1
2,0000000150707ACB8A82451C0307BC01,cosmopolitan витамин волос для женщина журнал ...,1
3,0000000150707ACB8A82451C0307BC01,realbox бокс для интернет-магазин страница тов...,1
4,0000000150707ACB8A82451C0307BC01,realbox бокс для интернет-магазин товар экипир...,2


In [5]:
test_titles = pd.read_csv('title_unify_test', sep = '\t', header = None)
test_titles.columns = ['Id', 'title', 'cnt']
test_titles.head()

Unnamed: 0,Id,title,cnt
0,0000000151790DCC1E8322AF0B6FA701,20-километровый амур китай мост недвижимость п...,2
1,0000000151790DCC1E8322AF0B6FA701,24-х 34-х до договор неделя новость предложить...,1
2,0000000151790DCC1E8322AF0B6FA701,3xl armour compress heatgear long size sleev s...,1
3,0000000151790DCC1E8322AF0B6FA701,4-е ca зачёт медальный место новость ои-2012 п...,1
4,0000000151790DCC1E8322AF0B6FA701,4-колёсный moi-bebik oregon oscar ru коляска к...,1


In [6]:
test_urls = pd.read_csv('url_domain_test', sep = '\t', header = None)
test_urls.columns = ['Id', 'url', 'cnt']
test_urls.head()

Unnamed: 0,Id,url,cnt
0,0000000151004FF4ADD746DA10685A01,afisha.ru,2
1,0000000151004FF4ADD746DA10685A01,aif.ru,1
2,0000000151004FF4ADD746DA10685A01,aimfar.solution.weborama.fr,1
3,0000000151004FF4ADD746DA10685A01,alkotest.ru,1
4,0000000151004FF4ADD746DA10685A01,aptekamos.ru,1


In [7]:
train_urls = pd.read_csv('url_domain_train', sep = '\t', header = None)
train_urls.columns = ['Id', 'url', 'cnt']
train_urls.head()

Unnamed: 0,Id,url,cnt
0,000000014B60815F65B38258011B6C01,login.rutracker.org,1
1,000000014B60815F65B38258011B6C01,rutracker.org,4
2,000000014C03DA2A47AC433A0C755201,admin.tour-spb.net,1
3,000000014C03DA2A47AC433A0C755201,czinfo.ru,1
4,000000014C03DA2A47AC433A0C755201,forumsostav.ru,1


Объединим train и test в один датафрейм

In [8]:
train_size = id_age_train.shape[0]
print train_size

118679


In [9]:
train_test_data = pd.DataFrame(np.vstack([train_titles.values, test_titles.values]))
train_test_data.columns = ['Id', 'title', 'cnt']
train_test_data.head()

Unnamed: 0,Id,title,cnt
0,000000014B6D41C13D777E8314725401,коляна лента прикол,1
1,0000000150707ACB8A82451C0307BC01,candi410 rambler ru входящая рамблер-почта,1
2,0000000150707ACB8A82451C0307BC01,cosmopolitan витамин волос для женщина журнал ...,1
3,0000000150707ACB8A82451C0307BC01,realbox бокс для интернет-магазин страница тов...,1
4,0000000150707ACB8A82451C0307BC01,realbox бокс для интернет-магазин товар экипир...,2


Построим по данному датафрейму sparse matrix по всем словам из title, используя TfidfVectorizer

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features = 10000)

In [11]:
%%time
groupbyed = train_test_data.groupby('Id')['title'].apply(lambda x: x.tolist())
print type(groupbyed), groupbyed.shape

<class 'pandas.core.series.Series'> (134116,)
CPU times: user 23.1 s, sys: 252 ms, total: 23.4 s
Wall time: 23.6 s


In [12]:
groupbyed1 = zip(groupbyed.keys(), groupbyed.values)
print len(groupbyed1), len(groupbyed1[0])

134116 2


In [13]:
groupbyed1.sort(key = lambda x: x[0] in age_by_id, reverse = True)
for i in range(len(groupbyed1)):
    if groupbyed1[i][0] not in age_by_id:
        print i
        real_train_size = i
        break

114156


In [14]:
%%time
vectorized_data = vectorizer.fit_transform([''.join(x[1]) for x in groupbyed1])
print vectorized_data.shape

(134116, 10000)
CPU times: user 2min 37s, sys: 4.73 s, total: 2min 42s
Wall time: 2min 45s


Если слово "хуй" не вошло в список слов, по которым будем обучаться, значит, надо брать побольше :-)

In [35]:
features = vectorizer.vocabulary_
print features[u"хуй"]

KeyError: u'\u0445\u0443\u0439'

### Обучение моделей

Создадим массивы X_train и y_train

In [26]:
y_train = [age_by_id[groupbyed1[i][0]] for i in range(real_train_size)]
print y_train[:10]

[53, 48, 28, 44, 48, 36, 33, 41, 51, 32]


In [27]:
X_train = vectorized_data[:real_train_size]
print X_train.shape, len(y_train)

(114156, 10000) 114156


### Линейная регрессия

Обучим линейную регрессию на трейне с возрастами

In [37]:
from sklearn.linear_model import LinearRegression
model = LinearRegression(n_jobs = 2)

In [41]:
%%time
model.fit(vectorized_data[:real_train_size], ages)

CPU times: user 4min 51s, sys: 5.84 s, total: 4min 56s
Wall time: 3min 3s


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=2, normalize=False)

Посмотрим на ее rmse на трейне, сделав кросс-валидацию

In [39]:
def rmse(x, y):
    return np.sqrt(np.mean((x - y) ** 2))

In [40]:
%%time
from sklearn.cross_validation import cross_val_score
print cross_val_score(model, vectorized_data[:real_train_size], ages, cv = 3, scoring = 'mean_squared_error')

[-160.26869196 -164.52824504 -151.38372678]


### Xgboost

In [19]:
import xgboost

In [32]:
params = {'max_depth':3, 'booster':'gbtree', 'eval_metric':'rmse', 'eta':0.1}
numround = 1000

In [33]:
%%time
Xdatatrain = xgboost.DMatrix(data = X_train, label = y_train)
Xdatatest = xgboost.DMatrix(data = X_train, label = y_train)

plst = list(params.items())
watchlist = [(Xdatatrain, 'train'), (Xdatatest, 'eval')]            

bst = xgboost.train(plst, Xdatatrain, numround, evals = watchlist, verbose_eval = 5)

[0]	train-rmse:34.4588	eval-rmse:34.4588
[5]	train-rmse:22.6094	eval-rmse:22.6094
[10]	train-rmse:16.5608	eval-rmse:16.5608
[15]	train-rmse:13.8185	eval-rmse:13.8185
[20]	train-rmse:12.7006	eval-rmse:12.7006
[25]	train-rmse:12.2662	eval-rmse:12.2662
[30]	train-rmse:12.0928	eval-rmse:12.0928
[35]	train-rmse:12.0151	eval-rmse:12.0151
[40]	train-rmse:11.9757	eval-rmse:11.9757
[45]	train-rmse:11.9482	eval-rmse:11.9482
[50]	train-rmse:11.9274	eval-rmse:11.9274
[55]	train-rmse:11.909	eval-rmse:11.909
[60]	train-rmse:11.8913	eval-rmse:11.8913
[65]	train-rmse:11.8765	eval-rmse:11.8765
[70]	train-rmse:11.8627	eval-rmse:11.8627
[75]	train-rmse:11.8493	eval-rmse:11.8493
[80]	train-rmse:11.8355	eval-rmse:11.8355
[85]	train-rmse:11.8225	eval-rmse:11.8225
[90]	train-rmse:11.8112	eval-rmse:11.8112
[95]	train-rmse:11.8002	eval-rmse:11.8002
[100]	train-rmse:11.7897	eval-rmse:11.7897
[105]	train-rmse:11.7784	eval-rmse:11.7784
[110]	train-rmse:11.7686	eval-rmse:11.7686
[115]	train-rmse:11.7578	eval-rmse:

### Отправка решения

In [35]:
answer_ages = bst.predict(xgboost.DMatrix(vectorized_data[real_train_size:]))

In [36]:
Ids = [x[0] for x in groupbyed1]
print Ids[:10]

['000000013CB5719C0000A2C90002C101', '00000001442BE24000001B7D00F50801', '00000001448580F800003F1B31FB0901', '0000000145BDB2FF000157971645E901', '000000014602771F0000DB9359714C01', '0000000147B2D6F311DB5C4201B7FB01', '0000000147C68954150168D701A8B801', '0000000147EB76D738CD80750C879701', '00000001482AAFB69FA5228008AC2A01', '0000000148390BB56A6B22BB178D3901']


In [37]:
test_urls = pd.read_csv('url_domain_test', sep = '\t', header = None)
test_urls.columns = ['Id', 'url', 'cnt']

In [38]:
test_urls.head()

Unnamed: 0,Id,url,cnt
0,0000000151004FF4ADD746DA10685A01,afisha.ru,2
1,0000000151004FF4ADD746DA10685A01,aif.ru,1
2,0000000151004FF4ADD746DA10685A01,aimfar.solution.weborama.fr,1
3,0000000151004FF4ADD746DA10685A01,alkotest.ru,1
4,0000000151004FF4ADD746DA10685A01,aptekamos.ru,1


In [39]:
missed_ids = set(list(test_urls['Id'].values)) - set(list(groupbyed.keys()))
print missed_ids

set(['060458FC4F26763C00000F979D2A7E01', '060E6C2B503325370000837052DE5001', '0000004654A10B824C165A466C168201', '05F9DAFE4E0C341900008CF2B767AF01', '000000465705245378FF65D832218001', '06134823531743F900004AC41E17E101', '0000004656F1981B1AA7519B68928301', '05F9DAFE4D3838F50000024A4318F701', '000000465704F25A438B05242F511001', '05F9DAFE4E09AA770000DDF2A270B501', '000000465684B22F9C5E2F5092605901', '0000042E5705589889C7137F36D41301', '00000046546F3F721C92663B86D4EE01', '000000465705212A890F65E12F997E01', '00000046569323B3B9B443367DC16401', '0000004656C8659C34EC56F3481DC101', '0000004657049E1AA4A453852830FA01', '05F9DAFE4E254F2B0000DDF205C6D101', '0000004654B561B02F4A0174E0EC2001'])


In [40]:
missed = np.array(zip(missed_ids, [0] * len(missed_ids)))
print missed.shape

(19, 2)


In [41]:
answer_df = pd.DataFrame(columns = ['Id', 'age'])
answer_df['Id'] = Ids[real_train_size:]
answer_df['age'] = answer_ages
print answer_df.shape
answer_df = pd.DataFrame(np.vstack((answer_df.values, missed)))
answer_df.columns = ['Id', 'age']
answer_df.head()

(19960, 2)


Unnamed: 0,Id,age
0,000000014A02348E701552980349FF01,40.5711
1,000000014A10EA183BF8594A0B2AB201,38.2735
2,000000014A4FE5C33A929D4C26943601,33.9577
3,000000014B7BB9957784A9BC0AC9F401,32.1867
4,000000014C7749F896D82C2B01E8B801,34.6329


In [42]:
print answer_df.shape

(19979, 2)


In [43]:
answer_df.to_csv('solution.csv', index = False)

In [69]:
id0 = Ids[real_train_size + 1]
print vectorized_data[real_train_size + 1]
print vectorizer.get_feature_names()[1523]

  (0, 533)	0.0470028843892
  (0, 7228)	0.0477088452249
  (0, 3750)	0.0459974805676
  (0, 2339)	0.0344471938693
  (0, 3154)	0.122357831641
  (0, 2461)	0.038999018059
  (0, 9957)	0.0440494433438
  (0, 6844)	0.043960981021
  (0, 8412)	0.0422027539728
  (0, 6968)	0.0433953080819
  (0, 2597)	0.0864037071474
  (0, 9898)	0.0407235175451
  (0, 2799)	0.0446476479461
  (0, 9820)	0.0427779624276
  (0, 7014)	0.0398504898525
  (0, 5829)	0.0406693646755
  (0, 6917)	0.0347509246408
  (0, 1705)	0.0365961215288
  (0, 8758)	0.0319800170987
  (0, 3898)	0.0371487659375
  (0, 4884)	0.034069150246
  (0, 8854)	0.0452128290607
  (0, 615)	0.1350573691
  (0, 9102)	0.0316911341205
  (0, 8869)	0.0376171812824
  :	:
  (0, 5877)	0.0233678799944
  (0, 507)	0.0109365797567
  (0, 3585)	0.0521836072955
  (0, 6231)	0.0185706532271
  (0, 6244)	0.0267958198026
  (0, 6108)	0.0210565388452
  (0, 7380)	0.0617804282619
  (0, 6908)	0.0461661153099
  (0, 8273)	0.0185636829541
  (0, 7385)	0.0780481338836
  (0, 5376)	0.0261717766