# sampleをベースに特徴量を加えたりして，精度向上を目指す

In [None]:
import json
import csv
import random
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import scipy as sc
# random.seed() returns None, so assigning its result left `seed` as None.
# Keep the integer itself in `seed` because it is passed on as `random_state`
# to train_test_split, which needs an int to make the split reproducible.
seed = 12345
random.seed(seed)

from collections import Counter
import pickle

In [None]:
# Read the training and test tables from the current working directory.
df, df_test = (pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv'))

# 記事IDの他に，記事の長さ，スクロールの長さ，記事閲覧時間を使用する
# そのままでは扱いづらいので０〜１００ -> 0 100~200 -> 1 のように新たにカテゴライズする

In [None]:
def _bucketize_lengths(column, width):
    """Re-encode bracketed number-list strings as integer bins.

    Each entry of `column` is a string such as '[12.0, 340.5]'. Every number
    v in it is replaced by int(v // width) and the result is written back in
    the same '[a, b]' textual form.
    """
    encoded = []
    for raw in column:
        numbers = map(float, raw.replace('[', '').replace(']', '').split(','))
        bins = [str(int(number // width)) for number in numbers]
        encoded.append('[' + ', '.join(bins) + ']')
    return encoded


# Bin widths per source column: session length 1, scroll length 25,
# article length 50. The binned text is stored in '<column>_alt'.
for source, width in (('session_length', 1), ('scroll_length', 25), ('article_length', 50)):
    df[source + '_alt'] = _bucketize_lengths(df[source].values, width)

In [None]:
# テストデータも同様に処理する
def _bin_test_lengths(column, width):
    """Re-encode bracketed number-list strings as integer bins.

    Each entry of `column` is a string such as '[12.0, 340.5]'. Every number
    v in it becomes int(v // width), and the result is formatted as '[a, b]'
    so that json.loads in preprocess_x can parse it.
    """
    encoded = []
    for raw in column:
        body = raw.replace('[', '').replace(']', '').strip()
        if not body:
            # An empty list has nothing to bin; keep it as an empty list.
            encoded.append('[]')
            continue
        bins = [str(int(float(token) // width)) for token in body.split(',')]
        encoded.append('[' + ', '.join(bins) + ']')
    return encoded


# The vectorizers are fitted on the training '<column>_alt' features, so the
# test columns must be binned with the same widths as the training cell
# (session 1, scroll 25, article 50) and the same integer formatting.
# The previous test-side code used widths 10 / 250 / 500 with clipping, which
# made the same token mean a different value range in train and test.
for source, width in (('session_length', 1), ('scroll_length', 25), ('article_length', 50)):
    df_test[source + '_alt'] = _bin_test_lengths(df_test[source].values, width)

In [None]:
# Defined as a function so the same transform can be reused for every split.
def preprocess_x(df, vectorizer, with_fit=False,name='article_ids'):
    """Vectorize one JSON-encoded list column of `df`.

    Args:
        df: DataFrame whose column `name` holds JSON list strings.
        vectorizer: object exposing fit() and transform(); it receives the
            list of parsed rows.
        with_fit: when True, fit `vectorizer` on the parsed rows first.
        name: column to read.

    Returns:
        Whatever vectorizer.transform() returns for the parsed rows
        (callers in this file call .toarray() on it).
    """
    progress = tqdm(df.iterrows(), desc='transform' + name, total=len(df))
    parsed_rows = [json.loads(record[name]) for _, record in progress]

    if with_fit:
        vectorizer.fit(parsed_rows)

    return vectorizer.transform(parsed_rows)

In [None]:
# Train / validation split.
# NOTE(review): `seed` must be an int for this split to be reproducible.
feature_num = 20
train, valid = train_test_split(df, test_size=0.2, shuffle=True, random_state=seed)

# Targets: the age bucket of every user (computed once; they do not depend on
# which feature column is being vectorized).
train_y = np.asarray(train['age_range'])
valid_y = np.asarray(valid['age_range'])

# Article IDs.
# `vectorizer` used to be referenced here before any assignment, which raises
# NameError on a fresh run. 5000 features lines up with Input(shape=(5000,))
# in the model cells -- NOTE(review): confirm this is the intended vocabulary
# size (the matrix is narrower if fewer distinct article IDs exist).
vectorizer = CountVectorizer(analyzer=lambda x: x, max_features=5000)
train_x = preprocess_x(train, vectorizer, with_fit=True)
valid_x = preprocess_x(valid, vectorizer, with_fit=False)
test_x = preprocess_x(df_test, vectorizer, with_fit=False)

# Session (viewing) time bins.
vectorizer = CountVectorizer(analyzer=lambda x: x, max_features=feature_num)
train_x1 = preprocess_x(train, vectorizer, with_fit=True, name='session_length_alt')
valid_x1 = preprocess_x(valid, vectorizer, with_fit=False, name='session_length_alt')
test_x1 = preprocess_x(df_test, vectorizer, with_fit=False, name='session_length_alt')

# Scroll length bins.
vectorizer = CountVectorizer(analyzer=lambda x: x, max_features=feature_num)
train_x2 = preprocess_x(train, vectorizer, with_fit=True, name='scroll_length_alt')
valid_x2 = preprocess_x(valid, vectorizer, with_fit=False, name='scroll_length_alt')
test_x2 = preprocess_x(df_test, vectorizer, with_fit=False, name='scroll_length_alt')

# Article length bins.
vectorizer = CountVectorizer(analyzer=lambda x: x, max_features=feature_num)
train_x3 = preprocess_x(train, vectorizer, with_fit=True, name='article_length_alt')
valid_x3 = preprocess_x(valid, vectorizer, with_fit=False, name='article_length_alt')
test_x3 = preprocess_x(df_test, vectorizer, with_fit=False, name='article_length_alt')

# 方針1 age gender でマルチタスク
# 方針2 ageでシングルタスク
# 入力データもx,x1,x2,x3を色々組み合わせて試す

In [None]:
# Build the gender labels for the train and validation splits.
train_y1, valid_y1 = (np.asarray(split['gender_range']) for split in (train, valid))

In [None]:
import keras
import keras.backend as K
from keras.layers import *

In [None]:
#multi task model

def m(units=512):
    """Build and compile the two-headed (age + gender) Keras model.

    Inputs, in order: one 5000-wide vector and three 20-wide vectors.
    Outputs, in order: 'age' (6-way softmax) and 'gender' (sigmoid).

    Args:
        units: width of the first dense layer; later layers use units//2
            and units//4.

    Returns:
        The compiled keras.Model.
    """
    # Main trunk on the 5000-wide input: three Dense+Dropout stages.
    article_in = Input(shape=(5000,))
    trunk = article_in
    for width in (units, units//2, units//4):
        trunk = Dense(width, activation='relu')(trunk)
        trunk = Dropout(0.5)(trunk)

    # Three 20-wide side inputs, each projected to 16 units.
    aux_ins = [Input(shape=(20,)) for _ in range(3)]
    aux_feats = [Dense(16, activation='relu')(aux) for aux in aux_ins]

    # Shared head over the trunk and all side features.
    merged = concatenate([trunk] + aux_feats)
    head = Dense(units//4, activation='relu')(merged)
    head = Dropout(0.5)(head)
    gender_out = Dense(1, activation='sigmoid', name='gender')(head)
    age_out = Dense(6, activation='softmax', name='age')(head)

    model = keras.Model([article_in] + aux_ins, [age_out, gender_out])
    model.compile(
        loss={
            'age': 'sparse_categorical_crossentropy',
            'gender': 'binary_crossentropy',
        },
        optimizer=keras.optimizers.Adam(1e-04),
        metrics=['accuracy'],
        # The gender loss is weighted at a quarter of the age loss.
        loss_weights={'age': 1, 'gender': 0.25},
    )
    return model


####
#学習
####

# Imported here because this cell runs before the evaluation cell that
# otherwise provides confusion_matrix; without it the loop ends in NameError.
from sklearn.metrics import confusion_matrix

# Train one fresh multi-task model per batch size and report validation
# metrics. After the loop, `model` / `hist` / `pred` refer to the last run.
for batch_size in [32, 64, 128]:
    print('batch_size', batch_size)
    model = m(units=1024)

    # Densify the validation features once per run instead of three times.
    valid_inputs = [valid_x.toarray(), valid_x1.toarray(), valid_x2.toarray(), valid_x3.toarray()]
    valid_targets = [valid_y, valid_y1]

    hist = model.fit(
        [train_x.toarray(), train_x1.toarray(), train_x2.toarray(), train_x3.toarray()],
        [train_y, train_y1],
        epochs=50,
        batch_size=batch_size,
        verbose=0,
        # Stop when the age head's validation loss has not improved for 10 epochs.
        callbacks=[keras.callbacks.EarlyStopping(monitor='val_age_loss', verbose=2, patience=10)],
        validation_data=(valid_inputs, valid_targets),
    )

    print(model.evaluate(valid_inputs, valid_targets))
    # Output 0 is the age head; take the most probable class per user.
    age_probs = model.predict(valid_inputs)[0]
    pred = [np.argmax(probs) for probs in age_probs]
    # sklearn expects (y_true, y_pred): rows are true labels, columns predictions.
    print(confusion_matrix(valid_y, pred))

In [None]:
#評価
from sklearn.metrics import confusion_matrix

# Evaluate the current multi-task `model` on the validation split.
valid_inputs = [valid_x.toarray(), valid_x1.toarray(), valid_x2.toarray(), valid_x3.toarray()]
print(model.evaluate(valid_inputs, [valid_y, valid_y1]))
pred = model.predict(valid_inputs)
# Output 0 is the age head; take the most probable class per user.
pred = [np.argmax(probs) for probs in pred[0]]
# sklearn expects (y_true, y_pred): rows are true labels, columns predictions.
print(confusion_matrix(valid_y, pred))

In [None]:
#保存
# Predict on the *test* features. The previous code re-used `pred` from the
# validation cell (already a list of class indices), so `pred[0]` was a scalar
# and the rows written would not have belonged to the test users anyway.
test_inputs = [test_x.toarray(), test_x1.toarray(), test_x2.toarray(), test_x3.toarray()]
# Output 0 of the multi-task model is the age head.
age_probs = model.predict(test_inputs)[0]
pred = [np.argmax(probs) for probs in age_probs]

# newline='' is what the csv module requires to avoid blank rows on Windows.
with open('./submission9.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["user_id", "age_range"])
    # Pair by position so the output does not depend on df_test's index labels.
    writer.writerows(zip(df_test['user_id'], pred))

In [None]:
# 続いてsingle task model

# single task
def m(units=512, n_aux_inputs=1):
    """Build and compile the single-task (age only) Keras model.

    Inputs, in order: one 5000-wide vector followed by `n_aux_inputs`
    20-wide vectors. Output: 'age', a 6-way softmax.

    Args:
        units: width of the first dense layer; later layers use units//4.
        n_aux_inputs: how many 20-wide side inputs to attach. The default of
            1 reproduces the previous [i, i1] model; the extra Input/Dense
            layers that were built but never connected are no longer created.

    Returns:
        The compiled keras.Model (its summary is printed as a side effect).
    """
    # Main trunk on the 5000-wide input: two Dense+Dropout stages.
    article_in = Input(shape=(5000,))
    trunk = article_in
    for width in (units, units//4):
        trunk = Dense(width, activation='relu')(trunk)
        trunk = Dropout(0.5)(trunk)

    # Optional 20-wide side inputs, each projected to 16 units.
    aux_ins = [Input(shape=(20,)) for _ in range(n_aux_inputs)]
    aux_feats = [Dense(16, activation='relu')(aux) for aux in aux_ins]

    # concatenate needs at least two tensors, so skip it without side inputs.
    merged = concatenate([trunk] + aux_feats) if aux_feats else trunk
    head = Dense(units//4, activation='relu')(merged)
    head = Dropout(0.5)(head)
    age_out = Dense(6, activation='softmax', name='age')(head)

    model = keras.Model([article_in] + aux_ins, age_out)
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=keras.optimizers.Adam(1e-03),
        metrics=['accuracy'],
    )
    model.summary()
    return model

# Train a fresh single-task model per batch size. The loop variable is now
# actually used as the batch size (it was hard-coded to 32 before), so adding
# values to the list has the expected effect.
for batch_size in [32]:
    model = m(units=1024)

    # Densify the validation features once and reuse them for fit and evaluate.
    valid_inputs = [valid_x.toarray(), valid_x1.toarray()]

    hist = model.fit(
        [train_x.toarray(), train_x1.toarray()],
        train_y,
        epochs=100,
        batch_size=batch_size,
        verbose=2,
        # Stop when validation loss has not improved for 5 epochs.
        callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss', verbose=2, patience=5)],
        validation_data=(valid_inputs, valid_y),
    )
    print(model.evaluate(valid_inputs, valid_y))

In [None]:
#評価

# The single-task model is trained on [article IDs, session-length bins], so
# it must be evaluated with the same two inputs (it was given only one).
valid_inputs = [valid_x.toarray(), valid_x1.toarray()]
print(model.evaluate(valid_inputs, valid_y))
pred = model.predict(valid_inputs)
pred = [np.argmax(probs) for probs in pred]
# sklearn expects (y_true, y_pred): rows are true labels, columns predictions.
print(confusion_matrix(valid_y, pred))

# print(model.evaluate([valid_x.toarray(),valid_x1.toarray(),valid_x2.toarray(),valid_x3.toarray()],valid_y))
# pred = model.predict([valid_x.toarray(),valid_x1.toarray(),valid_x2.toarray(),valid_x3.toarray()])
# pred = [np.argmax(i) for i in pred]
# print(confusion_matrix(pred,valid_y))

In [None]:
# Feed the same feature pair the model was trained on (article IDs +
# session-length bins). test_x3 (article-length bins) has the same width, so
# using it ran silently but produced predictions from the wrong feature.
pred = model.predict([test_x.toarray(), test_x1.toarray()])
pred = [np.argmax(probs) for probs in pred]

# newline='' is what the csv module requires to avoid blank rows on Windows.
with open('./submission25.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["user_id", "age_range"])
    # Pair by position so the output does not depend on df_test's index labels.
    writer.writerows(zip(df_test['user_id'], pred))