# Загрузка и предобработка данных

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

Загрузим возраста

In [2]:
id_age_train = pd.read_csv('age_profile_train', sep = '\t', header = None)
id_age_train.columns = ['Id', 'age']
id_age_train.head()

Unnamed: 0,Id,age
0,000000013CB5719C0000A2C90002C101,53
1,00000001442BE24000001B7D00F50801,48
2,00000001448580F800003F1B31FB0901,28
3,0000000145BDB2FF000157971645E901,44
4,000000014602771F0000DB9359714C01,48


Построим словарь id->возраст

In [3]:
age_by_id = dict(id_age_train.values)
print age_by_id[id_age_train['Id'].values[0]]

53


Далее загрузим датафреймы с url для train и test

In [73]:
train_urls = pd.read_csv('url_domain_train', sep = '\t', header = None)
train_urls.columns = ['Id', 'url', 'cnt']
train_urls.head()

Unnamed: 0,Id,url,cnt
0,000000014B60815F65B38258011B6C01,login.rutracker.org,1
1,000000014B60815F65B38258011B6C01,rutracker.org,4
2,000000014C03DA2A47AC433A0C755201,admin.tour-spb.net,1
3,000000014C03DA2A47AC433A0C755201,czinfo.ru,1
4,000000014C03DA2A47AC433A0C755201,forumsostav.ru,1


In [74]:
test_urls = pd.read_csv('url_domain_test', sep = '\t', header = None)
test_urls.columns = ['Id', 'url', 'cnt']
test_urls.head()

Unnamed: 0,Id,url,cnt
0,0000000151004FF4ADD746DA10685A01,afisha.ru,2
1,0000000151004FF4ADD746DA10685A01,aif.ru,1
2,0000000151004FF4ADD746DA10685A01,aimfar.solution.weborama.fr,1
3,0000000151004FF4ADD746DA10685A01,alkotest.ru,1
4,0000000151004FF4ADD746DA10685A01,aptekamos.ru,1


Обрежем адреса сайтов, оставив в них лишь самую длинную часть, отделенную точками, b размножим их по количеству посещений

In [59]:
def f(s):
    s1 = s.split('.')
    s1.sort(key = lambda w: len(w))
    return s1[-1]

In [60]:
def transform(urls, cnts):
    urls = [f(x) for x in urls]
    #uc = zip(urls, cnts)
    #urls = [(x[0] + ' ') * x[1] for x in uc]
    return np.array(urls)
t = transform(train_urls['url'], train_urls['cnt'])
print t.shape

(2046869,)


In [61]:
train_urls['url'] = t
train_urls.head()

Unnamed: 0,Id,url,cnt
0,000000014B60815F65B38258011B6C01,rutracker,1
1,000000014B60815F65B38258011B6C01,rutracker,4
2,000000014C03DA2A47AC433A0C755201,tour-spb,1
3,000000014C03DA2A47AC433A0C755201,czinfo,1
4,000000014C03DA2A47AC433A0C755201,forumsostav,1


In [62]:
test_urls['url'] = transform(test_urls['url'], test_urls['cnt'])
test_urls.head()

Unnamed: 0,Id,url,cnt
0,0000000151004FF4ADD746DA10685A01,afisha,2
1,0000000151004FF4ADD746DA10685A01,aif,1
2,0000000151004FF4ADD746DA10685A01,weborama,1
3,0000000151004FF4ADD746DA10685A01,alkotest,1
4,0000000151004FF4ADD746DA10685A01,aptekamos,1


Объединим train и test в один датафрейм

In [63]:
train_size = id_age_train.shape[0]
print train_size

118679


In [75]:
train_test_data = pd.DataFrame(np.vstack([train_urls.values, test_urls.values]))
train_test_data.columns = ['Id', 'url', 'cnt']
train_test_data.head()

Unnamed: 0,Id,url,cnt
0,000000014B60815F65B38258011B6C01,login.rutracker.org,1
1,000000014B60815F65B38258011B6C01,rutracker.org,4
2,000000014C03DA2A47AC433A0C755201,admin.tour-spb.net,1
3,000000014C03DA2A47AC433A0C755201,czinfo.ru,1
4,000000014C03DA2A47AC433A0C755201,forumsostav.ru,1


Построим по данному датафрейму sparse matrix по всем словам из url

In [65]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features = 10000)

In [76]:
groupbyed = train_test_data.groupby('Id')['url'].apply(lambda x: x.tolist())
print type(groupbyed), groupbyed.shape

<class 'pandas.core.series.Series'> (138577,)


In [77]:
groupbyed1 = zip(groupbyed.keys(), groupbyed.values)
print len(groupbyed1), len(groupbyed1[0])

138577 2


In [78]:
groupbyed1.sort(key = lambda x: x[0] in age_by_id, reverse = True)
for i in range(len(groupbyed1)):
    if groupbyed1[i][0] not in age_by_id:
        print i
        real_train_size = i
        break

118603


In [79]:
vectorized_data = vectorizer.fit_transform([''.join(x[1]) for x in groupbyed1])
print vectorized_data.shape

(138577, 10000)


### Обучение модели

Зададим возраста для объектов из трейна

In [80]:
ages = [age_by_id[groupbyed1[i][0]] for i in range(real_train_size)]
print ages[:10]

[53, 48, 28, 44, 48, 36, 33, 41, 51, 32]


Обучим линейную регрессию на трейне с возрастами

In [71]:
from sklearn.linear_model import LinearRegression
model = LinearRegression(n_jobs = 2)

In [82]:
%%time
model.fit(vectorized_data[:real_train_size], ages)

CPU times: user 1min, sys: 1.16 s, total: 1min 1s
Wall time: 31 s


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=2, normalize=False)

Посмотрим на ее rmse на трейне, сделав кросс-валидацию

In [27]:
def rmse(x, y):
    return np.sqrt(np.mean((x - y) ** 2))

In [81]:
from sklearn.cross_validation import cross_val_score
print cross_val_score(model, vectorized_data[:real_train_size], ages, cv = 3, scoring = 'mean_squared_error')

[-170.09654773 -156.58082811 -169.76434062]


### Отправка решения

In [83]:
answer_ages = model.predict(vectorized_data[real_train_size:])

In [84]:
Ids = [x[0] for x in groupbyed1]
print Ids[:10]

['000000013CB5719C0000A2C90002C101', '00000001442BE24000001B7D00F50801', '00000001448580F800003F1B31FB0901', '0000000145BDB2FF000157971645E901', '000000014602771F0000DB9359714C01', '0000000147B2D6F311DB5C4201B7FB01', '0000000147C68954150168D701A8B801', '0000000147EB76D738CD80750C879701', '00000001482AAFB69FA5228008AC2A01', '0000000148390BB56A6B22BB178D3901']


In [85]:
answer_df = pd.DataFrame(columns = ['Id', 'age'])
answer_df['Id'] = Ids[real_train_size:]
answer_df['age'] = answer_ages
answer_df.to_csv('solution.csv', index = False)

In [86]:
test_titles_id = pd.read_csv('title_unify_test', sep = '\t', header = None)
test_titles_id.columns = ['id', 'title', 'cnt']
test_titles_id.head()

Unnamed: 0,id,title,cnt
0,0000000151790DCC1E8322AF0B6FA701,20-километровый амур китай мост недвижимость п...,2
1,0000000151790DCC1E8322AF0B6FA701,24-х 34-х до договор неделя новость предложить...,1
2,0000000151790DCC1E8322AF0B6FA701,3xl armour compress heatgear long size sleev s...,1
3,0000000151790DCC1E8322AF0B6FA701,4-е ca зачёт медальный место новость ои-2012 п...,1
4,0000000151790DCC1E8322AF0B6FA701,4-колёсный moi-bebik oregon oscar ru коляска к...,1


In [87]:
missed_ids = set(test_titles_id['id']) - set(Ids)
print missed_ids

set(['000022D456544DE3771122F857A6F201', '06210D9556EFE0E30000054678B0D901', '00000046549F9C26429C4B0159BFEE01', '000000465684D839B64B58DA9701F701', '00000001545483F30F717F5A0721E401'])


In [88]:
missed_df = pd.DataFrame(zip(list(missed_ids), np.array([40] * len(missed_ids))))
missed_df.columns = ['Id', 'age']
answer_df = answer_df.append(missed_df, ignore_index = True)

In [89]:
print answer_df.shape

(19979, 2)


In [90]:
answer_df.to_csv('Solution.csv', index = False)

In [91]:
id0 = Ids[real_train_size + 1]
print vectorized_data[real_train_size + 1]
print vectorizer.get_feature_names()[1523]

  (0, 3365)	6
  (0, 5908)	1
  (0, 3438)	1
  (0, 4715)	1
  (0, 6499)	1
  (0, 7405)	1
  (0, 8727)	1
  (0, 3963)	1
  (0, 7250)	1
  (0, 6411)	1
  (0, 428)	1
  (0, 4658)	1
  (0, 1523)	1
  (0, 8138)	1
  (0, 9315)	1
  (0, 6370)	1
  (0, 3041)	1
  (0, 9353)	1
comvideo
