# 1.1Load Data

In [1]:
%matplotlib notebook
import os
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

In [2]:
homedir = os.environ['HOME']

super_data = pd.read_csv(homedir+'/data/mangaki-data-challenge/train.csv')
unsuper = pd.read_csv(homedir+'/data/mangaki-data-challenge/watched.csv')
title = pd.read_csv(homedir+'/data/mangaki-data-challenge-0908/titles.csv')
scraping = pd.read_csv(homedir+'/data/mangaki-data-challenge/scraping_data.csv', index_col=0)
test = pd.read_csv(homedir+'/data/mangaki-data-challenge/test.csv')

In [3]:
unsuper.head()

Unnamed: 0,user_id,work_id,rating
0,717,8025,dislike
1,1106,1027,neutral
2,1970,3949,neutral
3,1685,9815,like
4,1703,3482,like


In [4]:
title.head()

Unnamed: 0,work_id,title,category
0,0,Hayate the Combat Butler! Cuties,anime
1,1,Caffe Latte Rhapsody,manga
2,2,Dragon Ball,anime
3,3,Accel World,manga
4,4,Soukyuu no Fafner: Right of Left,anime


In [5]:
scraping.head()

Unnamed: 0,img,category,title
0,1,anime,Death Note
1,0,anime,Code Geass: Hangyaku no Lelouch
2,1,anime,Sword Art Online
3,1,anime,Naruto
4,1,anime,L'Attaque des Titans


# 1.2Merge and Sort Data

In [6]:
unsuper = pd.merge(unsuper, title, on="work_id")
unsuper = pd.merge(unsuper, scraping, on=['title', 'category'], how='left')

super_data = super_data.sort_values(by=["user_id", "work_id"])
unsuper = unsuper.sort_values(by=["user_id", "work_id"])

# 2.FE
## 2.1 fill Na, label encoding

In [7]:
# fill Na
unsuper['img'] = unsuper['img'].fillna(-1).astype(int)

# label encoding
label_dict = {'dislike':0, 'neutral':1, 'like':2, 'love':3}
unsuper['rating_id'] = unsuper['rating'].apply(lambda x: label_dict[x])

label_dict = {'album':0, 'anime':1, 'manga':2}
unsuper['category_id'] = unsuper['category'].apply(lambda x: label_dict[x])

unsuper.drop(['rating', 'category', 'title'], axis = 1, inplace=True)

unsuper.head()

Unnamed: 0,user_id,work_id,img,rating_id,category_id
52695,0,79,1,2,1
7431,0,104,1,2,1
67192,0,1701,1,2,1
27042,0,2739,1,2,1
75752,0,3562,1,2,1


## 2.2 FE with two feature quantities
2.2.1 watched work_id distribution for user_id

2.2.2 rating distribution for user_id

2.2.3 catecory distribution for user_id

2.2.4 rating distribution for work_id

2.2.5 Correspondence of work_id-img(scraping data), work_id-category_id

### 2.2.1 watched work_id distribution for user_id

In [8]:
pivot_wu_id = unsuper.pivot(index='user_id', columns='work_id', values='work_id').fillna(-1) > 0
pivot_wu_id.head()

work_id,0,1,2,3,4,5,6,7,9,10,...,9885,9886,9887,9889,9890,9891,9892,9893,9894,9896
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [9]:
from sklearn.manifold import TSNE
TSNE_pivot_wu_id = TSNE(n_components=3, perplexity=5).fit_transform(pivot_wu_id)

from sklearn.decomposition import PCA
PCA_pivot_wu_id = PCA(n_components=5).fit_transform(pivot_wu_id)

from sklearn import (decomposition, ensemble)

hasher = ensemble.RandomTreesEmbedding(n_estimators=200, random_state=0,
                                       max_depth=10)
X_transformed = hasher.fit_transform(pivot_wu_id)
pca = decomposition.TruncatedSVD(n_components=5)
RFE_pivot_wu_id = pca.fit_transform(X_transformed)

UW_ID = pd.DataFrame(np.concatenate([TSNE_pivot_wu_id, PCA_pivot_wu_id, RFE_pivot_wu_id], axis=1))
UW_ID.index = pivot_wu_id.index

### 2.2.2 rating distribution for user_id
### 2.2.3 catecory distribution for user_id

In [13]:
dum1 = pd.get_dummies(unsuper['rating_id'],prefix='rate_u')
dum2 = pd.get_dummies(unsuper['category_id'],prefix='cate_u')
user_rate_cate = dum1.join(dum2).join(unsuper['user_id'])

In [14]:
pivot_table_user_rate_cate = pd.pivot_table(data=user_rate_cate, index = 'user_id', aggfunc='sum')
pivot_table_user_rate_cate['Appear'] = unsuper['user_id'].value_counts().sort_index()

pivot_table_user_rate_cate.head()

Unnamed: 0_level_0,cate_u_0,cate_u_1,cate_u_2,rate_u_0,rate_u_1,rate_u_2,rate_u_3,Appear
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.0,19.0,0.0,1.0,1.0,17.0,0.0,19
1,0.0,87.0,0.0,10.0,6.0,33.0,38.0,87
2,0.0,49.0,0.0,2.0,2.0,45.0,0.0,49
3,0.0,402.0,0.0,23.0,84.0,295.0,0.0,402
4,0.0,64.0,9.0,6.0,6.0,35.0,26.0,73


In [15]:
pivot_table_user_rate_cate.describe()

Unnamed: 0,cate_u_0,cate_u_1,cate_u_2,rate_u_0,rate_u_1,rate_u_2,rate_u_3,Appear
count,1963.0,1963.0,1963.0,1963.0,1963.0,1963.0,1963.0,1963.0
mean,0.001019,82.695874,18.663271,11.928681,22.5135,61.148242,5.76974,101.360163
std,0.031911,124.480018,53.439432,32.350259,55.326436,84.713837,14.07121,144.862936
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,0.0,12.0,0.0,0.0,1.0,10.5,0.0,18.0
50%,0.0,35.0,1.0,2.0,6.0,29.0,0.0,47.0
75%,0.0,97.0,15.5,10.0,21.0,77.0,5.0,122.0
max,1.0,1132.0,948.0,560.0,1119.0,818.0,202.0,1513.0


In [16]:
alpha_cate = np.array([0.0,82.7,18.7])*0.05
alpha_rate = np.array([11.9,22.5,61.1,5.8])*0.05

alpha = np.concatenate([alpha_cate, alpha_rate])

In [17]:
Pivot_User_rate_cate = pivot_table_user_rate_cate.drop('Appear', axis=1).apply(lambda x:x+alpha, axis=1)

Appear = pivot_table_user_rate_cate['Appear']

for i in ['cate_u_0','cate_u_1','cate_u_2']:
    Pivot_User_rate_cate[i] = Pivot_User_rate_cate[i] / (Appear+np.sum(alpha_cate))

for i in ['rate_u_0','rate_u_1','rate_u_2', 'rate_u_3']:
    Pivot_User_rate_cate[i] = Pivot_User_rate_cate[i] / (Appear+np.sum(alpha_rate))

Pivot_User_rate_cate.head()

Unnamed: 0_level_0,cate_u_0,cate_u_1,cate_u_2,rate_u_0,rate_u_1,rate_u_2,rate_u_3
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.0,0.961155,0.038845,0.066279,0.088303,0.833368,0.012051
1,0.0,0.989845,0.010155,0.115082,0.077391,0.391625,0.415902
2,0.0,0.982708,0.017292,0.047998,0.057801,0.888838,0.005364
3,0.0,0.997703,0.002297,0.057964,0.209119,0.732205,0.000712
4,0.0,0.872742,0.127258,0.084481,0.09127,0.487478,0.336771


### 2.2.4 rating distribution for work_id

In [18]:
dum1 = pd.get_dummies(unsuper['rating_id'],prefix='rate_w')
work_rate = dum1.join(unsuper['work_id'])

work_rate.head()

Unnamed: 0,rate_w_0,rate_w_1,rate_w_2,rate_w_3,work_id
52695,0,0,1,0,79
7431,0,0,1,0,104
67192,0,0,1,0,1701
27042,0,0,1,0,2739
75752,0,0,1,0,3562


In [19]:
pivot_table_work_rate = pd.pivot_table(data=work_rate, index='work_id', aggfunc='sum')
pivot_table_work_rate['Appear'] = unsuper['work_id'].value_counts().sort_index()

pivot_table_work_rate.head()

Unnamed: 0_level_0,rate_w_0,rate_w_1,rate_w_2,rate_w_3,Appear
work_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2.0,2.0,2.0,0.0,6
1,0.0,0.0,1.0,0.0,1
2,50.0,85.0,373.0,41.0,549
3,0.0,1.0,4.0,0.0,5
4,2.0,3.0,8.0,1.0,14


In [20]:
pivot_table_work_rate.describe()

Unnamed: 0,rate_w_0,rate_w_1,rate_w_2,rate_w_3,Appear
count,8584.0,8584.0,8584.0,8584.0,8584.0
mean,2.727866,5.148416,13.983458,1.319432,23.179171
std,8.586941,13.725428,48.580373,6.743016,73.527325
min,0.0,0.0,0.0,0.0,1.0
25%,0.0,0.0,1.0,0.0,1.0
50%,1.0,1.0,2.0,0.0,3.0
75%,2.0,4.0,7.0,0.0,13.0
max,226.0,284.0,1050.0,178.0,1408.0


In [21]:
alpha_rate = np.array([2.727866,5.148416,13.983458,1.319432]) * 0.05

Pivot_Work_rate = pivot_table_work_rate.drop('Appear', axis=1).apply(lambda x:x+alpha_rate, axis=1)

Appear = pivot_table_work_rate['Appear']

for i in ['rate_w_0','rate_w_1','rate_w_2', 'rate_w_3']:
    Pivot_Work_rate[i] = Pivot_Work_rate[i] / (Appear+np.sum(alpha_rate))

Pivot_Work_rate.head()

Unnamed: 0_level_0,rate_w_0,rate_w_1,rate_w_2,rate_w_3
work_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.298422,0.315328,0.377034,0.009215
1,0.063176,0.119234,0.787034,0.030557
2,0.091131,0.154969,0.679257,0.074644
3,0.022146,0.204161,0.762982,0.010711
4,0.140933,0.214884,0.573863,0.07032


### 2.2.5 Correspondence of work_id-img(scraping data), work_id-category_id

In [22]:
wid_img_sheet = unsuper[['work_id','img', 'category_id']].drop_duplicates().sort_values(by=['work_id'])
wid_img_sheet.index = wid_img_sheet['work_id']
wid_img_sheet.drop(['work_id'], axis=1, inplace=True)
wid_img_sheet.head()

Unnamed: 0_level_0,img,category_id
work_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,1
1,0,2
2,1,1
3,0,2
4,1,1


In [23]:
X1 = super_data['user_id'].apply(lambda x: UW_ID.ix[x] if x in UW_ID.index else  UW_ID.describe().iloc[1])
X2 = super_data['user_id'].apply(lambda x: Pivot_User_rate_cate.ix[x] if x in Pivot_User_rate_cate.index else  Pivot_User_rate_cate.describe().iloc[1])
X3 = super_data['work_id'].apply(lambda x: Pivot_Work_rate.ix[x] if x in Pivot_Work_rate.index else  Pivot_Work_rate.describe().iloc[1])
X4 = super_data['work_id'].apply(lambda x: wid_img_sheet.ix[x] if x in wid_img_sheet.index else  wid_img_sheet.describe().iloc[1])

X = pd.concat([X1, X2, X3, X4], axis=1)

y = super_data['rating']
y.name = 'rating'
train_data = pd.concat([X, y], axis=1)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  after removing the cwd from sys.path.


In [24]:
X1 = test['user_id'].apply(lambda x: UW_ID.ix[x] if x in UW_ID.index else  UW_ID.describe().iloc[1])
X2 = test['user_id'].apply(lambda x: Pivot_User_rate_cate.ix[x] if x in Pivot_User_rate_cate.index else  Pivot_User_rate_cate.describe().iloc[1])
X3 = test['work_id'].apply(lambda x: Pivot_Work_rate.ix[x] if x in Pivot_Work_rate.index else  Pivot_Work_rate.describe().iloc[1])
X4 = test['work_id'].apply(lambda x: wid_img_sheet.ix[x] if x in wid_img_sheet.index else  wid_img_sheet.describe().iloc[1])

test_data = pd.concat([X1, X2, X3, X4], axis=1)

In [25]:
train_data.to_csv("train_fe_finish.txt")

test_data.to_csv("test_fe_finish.txt")