In [1]:
import os
import gc
import glob
import random
import sys
# sys.path.append('../input/iterativestratification')

import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
from datetime import date, datetime

from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import KFold , StratifiedKFold, GroupKFold
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import RobustScaler, MinMaxScaler, MultiLabelBinarizer

from imblearn.under_sampling import RandomUnderSampler

# from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import lightgbm as lgb
import optuna.integration.lightgbm as op_lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from catboost import Pool
from gensim.models import word2vec
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
# from tabpfn import TabPFNClassifier
import warnings
warnings.filterwarnings('ignore')



In [2]:
# !pip install ptitprince
# import ptitprince as pt 
# from ptitprince import RainCloud

In [3]:
def seed_everything(seed):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    exports ``PYTHONHASHSEED`` to the process environment.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)


seed_everything(0)

In [4]:
def create_kfold_seen(df):
    """Add a 5-fold ``kfold`` column stratified on ``user_id``.

    Stratifying on the user keeps every user's rows spread across the folds,
    so validation users have also been seen in training.

    Args:
        df: Frame with a ``user_id`` column. It is modified in place.

    Returns:
        The same ``df`` object with an integer ``kfold`` column (0-4).
    """
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_of_row = np.full(len(df), -1, dtype="int64")
    for fold, (_, valid_pos) in enumerate(splitter.split(df, df["user_id"].values)):
        # split() yields *positional* row numbers, so the folds are collected
        # in a positional array rather than written through label-based .loc,
        # which is only correct when df has a default RangeIndex.
        fold_of_row[valid_pos] = fold
    df["kfold"] = fold_of_row
    return df

def create_kfold_unseen(df):
    """Add a 5-fold ``kfold`` column grouped by ``user_id``.

    All rows of a user land in the same fold, so validation users are never
    seen in training.

    Args:
        df: Frame with a ``user_id`` column. It is modified in place.

    Returns:
        The same ``df`` object with an integer ``kfold`` column (0-4).
    """
    splitter = GroupKFold(n_splits=5)
    fold_of_row = np.full(len(df), -1, dtype="int64")
    for fold, (_, valid_pos) in enumerate(splitter.split(df, df["user_id"], df["user_id"])):
        # split() yields *positional* row numbers, so the folds are collected
        # in a positional array rather than written through label-based .loc,
        # which is only correct when df has a default RangeIndex.
        fold_of_row[valid_pos] = fold
    df["kfold"] = fold_of_row
    return df


def stratified_and_group_kfold_split(train_df):
    """Assign folds that mix "seen user" and "unseen user" validation rows.

    Approach taken from
    https://www.guruguru.science/competitions/21/discussions/45ffc8a1-e37c-4b95-aac4-c4e338aa6a9b/

    20% of the users are drawn (using the global ``random`` state) and split
    with GroupKFold, so each of them sits entirely in one fold. The rows of
    the remaining users are split with StratifiedKFold on ``user_id``.

    Args:
        train_df: Frame with a ``user_id`` column. It is not modified.

    Returns:
        A new frame with the rows of the stratified users first, then the
        rows of the held-out users, a fresh RangeIndex and an integer
        ``fold`` column (0-4).
    """
    # Draw 20% of the users to play the role of users unseen during training.
    n_user = train_df["user_id"].nunique()
    unseen_users = random.sample(sorted(train_df["user_id"].unique()), k=n_user // 5)
    # Keep the membership mask local instead of writing a helper column into
    # the caller's frame.
    is_unseen = train_df["user_id"].isin(unseen_users)
    unseen_df = train_df[is_unseen].reset_index(drop=True)
    seen_df = train_df[~is_unseen].reset_index(drop=True)

    # Remaining 80% of the users: StratifiedKFold on user_id.
    seen_fold = np.full(len(seen_df), -1, dtype="int64")
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for fold_id, (_, valid_pos) in enumerate(skf.split(seen_df, seen_df["user_id"])):
        seen_fold[valid_pos] = fold_id
    seen_df["fold"] = seen_fold

    # Held-out 20% of the users: GroupKFold keeps each user in a single fold.
    unseen_fold = np.full(len(unseen_df), -1, dtype="int64")
    gkf = GroupKFold(n_splits=5)
    for fold_id, (_, valid_pos) in enumerate(gkf.split(unseen_df, unseen_df["user_id"], unseen_df["user_id"])):
        unseen_fold[valid_pos] = fold_id
    unseen_df["fold"] = unseen_fold

    # Both halves now carry an int64 "fold", so the concatenation stays integer.
    return pd.concat([seen_df, unseen_df], axis=0).reset_index(drop=True)

In [5]:
###Anime2Vec### 
def add_w2v_features(train_df, test_df, consider_score=True):
    """Append "Anime2Vec" embeddings learned from the train ratings.

    Each user's sequence of ``anime_id`` values is treated as a sentence and
    fed to gensim's Word2Vec. Two groups of 64 columns are then left-merged
    onto both frames:

    * ``anime_vec_user_factor_*``: mean vector of the anime the user has in
      ``train_df`` (joined on ``user_id``)
    * ``anime_vec_anime_factor_*``: the anime's own vector (joined on
      ``anime_id``)

    Args:
        train_df: Frame with ``user_id``, ``anime_id`` and, when
            ``consider_score`` is true, an integer ``score`` column.
        test_df: Frame with ``user_id`` and ``anime_id`` columns.
        consider_score: If true, each anime is repeated ``score`` times in
            its user's sentence (ratings are 1-10, so a title scored 10
            appears ten times).

    Returns:
        ``(train_df, test_df)`` as new frames carrying the extra columns.
        Ids that do not occur in ``train_df`` get NaN.

    Note:
        The shuffled sentence copies are drawn from the global ``random``
        state.
    """
    vector_size = 64
    known_anime = train_df['anime_id'].unique().tolist()
    anime_by_user = {
        uid: watched.tolist()
        for uid, watched in train_df.groupby('user_id')['anime_id']
    }

    if consider_score:
        # Score-weighted sentences: repeat every title `score` times.
        sentences = [
            [anime for anime, score in rated[['anime_id', 'score']].values for _ in range(score)]
            for _, rated in train_df.groupby('user_id')
        ]
    else:
        # Unweighted sentences: each title appears once.
        sentences = train_df.groupby('user_id')['anime_id'].apply(list).tolist()

    # Train on every sentence plus one shuffled copy of it.
    shuffled = [random.sample(sentence, len(sentence)) for sentence in sentences]
    model = word2vec.Word2Vec(
        sentences + shuffled,
        vector_size=vector_size,
        seed=0,
        min_count=1,
        workers=1,
    )

    # User vector = mean of the vectors of that user's anime.
    user_vectors = {
        uid: np.mean([model.wv[anime] for anime in watched], axis=0)
        for uid, watched in anime_by_user.items()
    }
    # Item vector = the embedding itself.
    anime_vectors = {anime: model.wv[anime] for anime in known_anime}

    user_factors_df = pd.DataFrame(user_vectors).T.reset_index()
    user_factors_df.columns = ["user_id"] + [f"anime_vec_user_factor_{i}" for i in range(vector_size)]
    item_factors_df = pd.DataFrame(anime_vectors).T.reset_index()
    item_factors_df.columns = ["anime_id"] + [f"anime_vec_anime_factor_{i}" for i in range(vector_size)]

    train_df = (
        train_df
        .merge(user_factors_df, on="user_id", how="left")
        .merge(item_factors_df, on="anime_id", how="left")
    )
    test_df = (
        test_df
        .merge(user_factors_df, on="user_id", how="left")
        .merge(item_factors_df, on="anime_id", how="left")
    )
    return train_df, test_df

def add_w2v_features_without_score(train_df, test_df, train_test_df):
    """Append score-free "Anime2Vec" user embeddings built from train + test.

    Each user's ``anime_id`` list in ``train_test_df`` is a Word2Vec sentence
    (plus one shuffled copy). The mean vector of a user's anime is merged
    onto both frames as ``anime_vec_wo_score_user_factor_0..63``.

    Only the per-user table is produced. The original also looked up a
    per-anime vector table that was never merged or returned; that dead work
    has been removed.

    Args:
        train_df: Frame with a ``user_id`` column.
        test_df: Frame with a ``user_id`` column.
        train_test_df: Frame with ``user_id`` and ``anime_id`` columns used
            to train the embedding.

    Returns:
        ``(train_df, test_df)`` as new frames carrying the extra columns.

    Note:
        The shuffled sentence copies are drawn from the global ``random``
        state.
    """
    vector_size = 64
    user_anime_list_dict = {
        user_id: anime_ids.tolist()
        for user_id, anime_ids in train_test_df.groupby('user_id')['anime_id']
    }
    title_sentence_list = train_test_df.groupby('user_id')['anime_id'].apply(list).tolist()

    # Train on every sentence plus one shuffled copy of it.
    shuffled_sentence_list = [random.sample(sentence, len(sentence)) for sentence in title_sentence_list]
    train_sentence_list = title_sentence_list + shuffled_sentence_list

    w2v_params = {
        "vector_size": vector_size,
        "seed": 0,
        "min_count": 1,
        "workers": 1
    }
    model = word2vec.Word2Vec(train_sentence_list, **w2v_params)

    # User vector = mean of the vectors of that user's anime.
    user_factors = {
        user_id: np.mean([model.wv[anime_id] for anime_id in user_anime_list], axis=0)
        for user_id, user_anime_list in user_anime_list_dict.items()
    }
    user_factors_df = pd.DataFrame(user_factors).T.reset_index()
    user_factors_df.columns = ["user_id"] + [f"anime_vec_wo_score_user_factor_{i}" for i in range(vector_size)]

    train_df = train_df.merge(user_factors_df, on="user_id", how="left")
    test_df = test_df.merge(user_factors_df, on="user_id", how="left")

    return train_df, test_df

In [6]:
###User2Vec### 
def add_w2v_user_features(train_df, test_df, consider_score=True):
    """Append "User2Vec" embeddings learned from the train ratings.

    The mirror image of the anime embedding: each anime's sequence of
    ``user_id`` values is a Word2Vec sentence. Two groups of 64 columns are
    left-merged onto both frames:

    * ``user_vec_anime_factor_*``: mean vector of the users that rated the
      anime in ``train_df`` (joined on ``anime_id``)
    * ``user_vec_user_factor_*``: the user's own vector (joined on
      ``user_id``)

    Args:
        train_df: Frame with ``user_id``, ``anime_id`` and, when
            ``consider_score`` is true, an integer ``score`` column.
        test_df: Frame with ``user_id`` and ``anime_id`` columns.
        consider_score: If true, each user is repeated ``score`` times in the
            sentence of the anime they rated.

    Returns:
        ``(train_df, test_df)`` as new frames carrying the extra columns.
        Ids that do not occur in ``train_df`` get NaN.

    Note:
        The shuffled sentence copies are drawn from the global ``random``
        state.
    """
    vector_size = 64
    known_users = train_df['user_id'].unique().tolist()
    users_by_anime = {
        anime: raters.tolist()
        for anime, raters in train_df.groupby('anime_id')['user_id']
    }

    if consider_score:
        # Score-weighted sentences: repeat every user `score` times.
        sentences = [
            [user for user, score in ratings[['user_id', 'score']].values for _ in range(score)]
            for _, ratings in train_df.groupby('anime_id')
        ]
    else:
        # Unweighted sentences: each user appears once.
        sentences = train_df.groupby('anime_id')['user_id'].apply(list).tolist()

    # Train on every sentence plus one shuffled copy of it.
    shuffled = [random.sample(sentence, len(sentence)) for sentence in sentences]
    model = word2vec.Word2Vec(
        sentences + shuffled,
        vector_size=vector_size,
        seed=0,
        min_count=1,
        workers=1,
    )

    # Anime vector = mean of the vectors of the users that rated it.
    anime_vectors = {
        anime: np.mean([model.wv[user] for user in raters], axis=0)
        for anime, raters in users_by_anime.items()
    }
    # User vector = the embedding itself.
    user_vectors = {user: model.wv[user] for user in known_users}

    anime_level_df = pd.DataFrame(anime_vectors).T.reset_index()
    anime_level_df.columns = ["anime_id"] + [f"user_vec_anime_factor_{i}" for i in range(vector_size)]
    user_level_df = pd.DataFrame(user_vectors).T.reset_index()
    user_level_df.columns = ["user_id"] + [f"user_vec_user_factor_{i}" for i in range(vector_size)]

    train_df = (
        train_df
        .merge(anime_level_df, on="anime_id", how="left")
        .merge(user_level_df, on="user_id", how="left")
    )
    test_df = (
        test_df
        .merge(anime_level_df, on="anime_id", how="left")
        .merge(user_level_df, on="user_id", how="left")
    )
    return train_df, test_df

def add_w2v_user_features_without_score(train_df, test_df, train_test_df):
    """Append score-free "User2Vec" anime embeddings built from train + test.

    Each anime's ``user_id`` list in ``train_test_df`` is a Word2Vec sentence
    (plus one shuffled copy). The mean vector of the users attached to an
    anime is merged onto both frames as
    ``user_vec_wo_score_anime_factor_0..63``.

    Only the per-anime table is produced. The original also looked up a
    per-user vector table that was never merged or returned; that dead work
    has been removed.

    Args:
        train_df: Frame with an ``anime_id`` column.
        test_df: Frame with an ``anime_id`` column.
        train_test_df: Frame with ``user_id`` and ``anime_id`` columns used
            to train the embedding.

    Returns:
        ``(train_df, test_df)`` as new frames carrying the extra columns.

    Note:
        The shuffled sentence copies are drawn from the global ``random``
        state.
    """
    vector_size = 64
    anime_user_list_dict = {
        anime_id: user_ids.tolist()
        for anime_id, user_ids in train_test_df.groupby('anime_id')['user_id']
    }
    user_sentence_list = train_test_df.groupby('anime_id')['user_id'].apply(list).tolist()

    # Train on every sentence plus one shuffled copy of it.
    shuffled_sentence_list = [random.sample(sentence, len(sentence)) for sentence in user_sentence_list]
    train_sentence_list = user_sentence_list + shuffled_sentence_list

    w2v_params = {
        "vector_size": vector_size,
        "seed": 0,
        "min_count": 1,
        "workers": 1
    }
    model = word2vec.Word2Vec(train_sentence_list, **w2v_params)

    # Anime vector = mean of the vectors of the users attached to it.
    anime_factors = {
        anime_id: np.mean([model.wv[user_id] for user_id in anime_user_list], axis=0)
        for anime_id, anime_user_list in anime_user_list_dict.items()
    }
    anime_factors_df = pd.DataFrame(anime_factors).T.reset_index()
    anime_factors_df.columns = ["anime_id"] + [f"user_vec_wo_score_anime_factor_{i}" for i in range(vector_size)]

    train_df = train_df.merge(anime_factors_df, on="anime_id", how="left")
    test_df = test_df.merge(anime_factors_df, on="anime_id", how="left")

    return train_df, test_df

In [7]:
###Etc2Vec### 
def add_w2v_genres_features(train_df, test_df, train_test_df, col,consider_score=True):
    """Append per-user Word2Vec embeddings of an arbitrary token column.

    Each user's sequence of ``col`` values is a Word2Vec sentence (plus one
    shuffled copy). The mean vector of a user's tokens is left-merged onto
    both frames on ``user_id``.

    Args:
        train_df: Frame with ``user_id``, ``col`` and, when ``consider_score``
            is true, an integer ``score`` column.
        test_df: Frame with a ``user_id`` column.
        train_test_df: Frame with ``user_id`` and ``col``; the sentence source
            when ``consider_score`` is false.
        col: Name of the column whose values are the tokens.
        consider_score: If true, sentences come from ``train_df`` with each
            token repeated ``score`` times and the columns are named
            ``user_vec_{col}_factor_*``. Otherwise sentences come from
            ``train_test_df`` and the columns are named
            ``user_vec_wo_score_{col}_factor_*``.

    Returns:
        ``(train_df, test_df)`` as new frames carrying the 64 extra columns.

    Note:
        The shuffled sentence copies are drawn from the global ``random``
        state.
    """
    vector_size = 64
    # Score-weighted embeddings can only use rows that carry a score.
    source_df = train_df if consider_score else train_test_df
    tokens_by_user = {
        uid: tokens.tolist()
        for uid, tokens in source_df.groupby('user_id')[col]
    }

    if consider_score:
        # Score-weighted sentences: repeat every token `score` times.
        sentences = [
            [token for token, score in rated[[col, 'score']].values for _ in range(score)]
            for _, rated in source_df.groupby('user_id')
        ]
    else:
        # Unweighted sentences: each token appears once per row.
        sentences = source_df.groupby('user_id')[col].apply(list).tolist()

    # Train on every sentence plus one shuffled copy of it.
    shuffled = [random.sample(sentence, len(sentence)) for sentence in sentences]
    model = word2vec.Word2Vec(
        sentences + shuffled,
        vector_size=vector_size,
        seed=0,
        min_count=1,
        workers=1,
    )

    # User vector = mean of the vectors of that user's tokens.
    user_vectors = {
        uid: np.mean([model.wv[token] for token in tokens], axis=0)
        for uid, tokens in tokens_by_user.items()
    }
    user_factors_df = pd.DataFrame(user_vectors).T.reset_index()

    if consider_score:
        factor_names = [f"user_vec_{col}_factor_{i}" for i in range(vector_size)]
    else:
        factor_names = [f"user_vec_wo_score_{col}_factor_{i}" for i in range(vector_size)]
    user_factors_df.columns = ["user_id"] + factor_names

    train_df = train_df.merge(user_factors_df, on="user_id", how="left")
    test_df = test_df.merge(user_factors_df, on="user_id", how="left")

    return train_df, test_df

In [11]:
def onehot(df, col):
    """Encode a ``", "``-separated multi-label string column.

    For ``genres`` and ``licensors`` the result holds one 0/1 indicator
    column per label (``ohe_{col}_{label}``). For any other column the
    indicator matrix is reduced to 15 TruncatedSVD components
    (``svd_{col}_0..14``). In both cases ``{col}_count_num`` gives the number
    of labels in each row.

    Args:
        df: Frame whose ``col`` column contains strings.
        col: Name of the column to encode.

    Returns:
        A new frame with one row per row of ``df``, sharing ``df.index`` so
        it can be concatenated column-wise onto ``df``.
    """
    # Split once and reuse the lists for both the indicators and the counts.
    label_lists = df[col].map(lambda x: x.split(", "))
    mlb = MultiLabelBinarizer()
    ohe_srs = mlb.fit_transform(label_lists.tolist())
    if col == "genres" or col == "licensors":
        col_df = pd.DataFrame(
            ohe_srs,
            index=df.index,
            columns=[f"ohe_{col}_{name}" for name in mlb.classes_],
        )
    else:
        components_num = 15
        svd = TruncatedSVD(n_components=components_num, random_state=0)
        svd_arr = svd.fit_transform(ohe_srs)
        col_df = pd.DataFrame(
            svd_arr,
            index=df.index,
            columns=[f"svd_{col}_{ix}" for ix in range(components_num)],
        )

    # Rows of col_df are in df's row order; assign the counts positionally so
    # they cannot be mis-aligned by index labels.
    col_df[f"{col}_count_num"] = label_lists.map(len).to_numpy()

    return col_df

def duration_transform(d):
    """Convert a textual duration into a number of minutes.

    The string is scanned for ``<n> hr``, ``<n> min`` and ``<n> sec`` parts,
    which are summed; any other text (e.g. ``per ep.``) is ignored.

    The original implementation read only the first character of strings
    without "min" and always treated it as hours, so multi-digit hours and
    any non-hour unit were mis-converted.

    Args:
        d: Duration text such as ``"24 min. per ep."``, ``"1 hr. 30 min."``
            or ``"Unknown"``.

    Returns:
        Minutes as an ``int``, or as a ``float`` when a seconds part is
        present. ``np.nan`` for ``"Unknown"``, for non-string input (e.g. a
        missing value) and for strings without any recognised unit.
    """
    import re

    if not isinstance(d, str) or d == "Unknown":
        return np.nan

    hours = re.search(r"(\d+)\s*hr", d)
    minutes = re.search(r"(\d+)\s*min", d)
    seconds = re.search(r"(\d+)\s*sec", d)
    if not (hours or minutes or seconds):
        return np.nan

    total = 0
    if hours:
        total += int(hours.group(1)) * 60
    if minutes:
        total += int(minutes.group(1))
    if seconds:
        # Sub-minute durations become a fraction of a minute.
        total += int(seconds.group(1)) / 60
    return total

def anime_feature_create(df):
    """Derive model-ready columns from the raw anime metadata table.

    Adds multi-label encodings of genres/producers/studios/licensors, label
    encodes ``type``, ``source``, ``rating`` and their ``type_and_source``
    combination, adds audience-status ratios, and converts ``episodes`` and
    ``duration`` to numbers.

    Args:
        df: Anime metadata frame. It is not modified: the first concatenation
            rebinds ``df`` to a new frame.

    Returns:
        A new frame with the original rows and the additional or converted
        columns.
    """
    # Multi-label string columns -> indicator / SVD columns plus label counts.
    for multi_label_col in ("genres", "producers", "studios", "licensors"):
        encoded = onehot(df, multi_label_col)
        df = pd.concat([df, encoded], axis=1)

    # Canonical genre string: labels sorted alphabetically.
    df["genres"] = df["genres"].map(lambda x: ", ".join(sorted(x.split(", "))))

    # Integer codes for the categorical columns; the combined type/source
    # string is kept as well as its encoded form.
    df["type_and_source"] = df["type"] + " and " + df["source"]
    encoder = LabelEncoder()
    for raw_col, encoded_col in (
        ("type", "type"),
        ("source", "source"),
        ("rating", "rating"),
        ("type_and_source", "type_and_source_le"),
    ):
        df[encoded_col] = encoder.fit_transform(df[raw_col])

    # 1 when the duration text contains "per" (a per-episode duration).
    df["is_contains_ep"] = df["duration"].str.contains("per") * 1

    # Share of the member count in each watch status.
    for status in ("watching", "completed", "on_hold", "dropped", "plan_to_watch"):
        df[f"{status}_per_member"] = df[status] / df["members"]

    # "Unknown" (or anything non-numeric) episode counts become NaN.
    episodes = df["episodes"].replace({"Unknown": np.nan})
    df["episodes"] = pd.to_numeric(episodes, errors='coerce')

    df["duration"] = df["duration"].apply(duration_transform)
    df["total_duration"] = df["duration"] * df["episodes"]

    return df

def user_similarity(train_df, train_test_df):
    """Build SVD-compressed user profiles from user x anime matrices.

    Two matrices with one row per user and one column per anime are built:

    * a 0/1 "looked" matrix from every (user, anime) pair in
      ``train_test_df``
    * a score matrix from ``train_df`` holding ``score`` for rated pairs and
      0 elsewhere

    Each matrix, and the user-by-user cosine-similarity matrix derived from
    it, is reduced to 32 components with TruncatedSVD.

    Args:
        train_df: Frame with ``user_id``, ``anime_id`` and ``score`` columns.
        train_test_df: Frame with ``user_id`` and ``anime_id`` columns.

    Returns:
        ``(looked_features, score_features)``. Each frame has a ``user_id``
        column plus 64 feature columns, named
        ``svd_user_anime_looked[_similarity]_*`` and
        ``svd_user_anime_score[_similarity]_*`` respectively.
    """
    components_num = 32
    svd = TruncatedSVD(n_components=components_num, random_state=0)

    def _svd_features(matrix, prefix):
        """Return SVD frames of `matrix` and of its row-wise cosine similarity."""
        # squareform() leaves the diagonal at 0, so a row's similarity to
        # itself is 0 here (kept as in the original feature definition).
        similarity = squareform(1 - pdist(matrix, 'cosine'))
        raw_df = pd.DataFrame(
            svd.fit_transform(matrix),
            columns=[f"{prefix}_{ix}" for ix in range(components_num)],
        )
        similarity_df = pd.DataFrame(
            svd.fit_transform(similarity),
            columns=[f"{prefix}_similarity_{ix}" for ix in range(components_num)],
        )
        return raw_df, similarity_df

    # --- which anime each user has looked at (train + test) ---
    looked = train_test_df.groupby("user_id")["anime_id"].apply(list).reset_index()
    looked_matrix = MultiLabelBinarizer().fit_transform(looked["anime_id"])
    looked_df, looked_sim_df = _svd_features(looked_matrix, "svd_user_anime_looked")
    looked_features = pd.concat([looked["user_id"], looked_df, looked_sim_df], axis=1)

    # --- how each user scored each anime (train only) ---
    # One vectorized scatter replaces a boolean pandas filter per (user,
    # anime) cell. Rows and columns are the sorted unique ids, the same order
    # the groupby / binarizer pair produced.
    user_codes, user_ids = pd.factorize(train_df["user_id"], sort=True)
    anime_codes, anime_ids = pd.factorize(train_df["anime_id"], sort=True)
    scores = train_df["score"].to_numpy()
    # Use the score dtype so that non-integer scores are not truncated.
    score_matrix = np.zeros((len(user_ids), len(anime_ids)), dtype=scores.dtype)
    # factorize() codes missing ids as -1; skip them rather than letting -1
    # address the last row or column.
    valid = (user_codes >= 0) & (anime_codes >= 0)
    # If a (user, anime) pair is duplicated, the last row wins.
    score_matrix[user_codes[valid], anime_codes[valid]] = scores[valid]

    score_df, score_sim_df = _svd_features(score_matrix, "svd_user_anime_score")
    score_features = pd.concat(
        [pd.Series(np.asarray(user_ids), name="user_id"), score_df, score_sim_df],
        axis=1,
    )

    return looked_features, score_features

def anime_similarity(train_df, train_test_df):
    """Build SVD-compressed anime profiles from anime x user matrices.

    Two matrices with one row per anime and one column per user are built:

    * a 0/1 "looked" matrix from every (anime, user) pair in
      ``train_test_df``
    * a score matrix from ``train_df`` holding ``score`` for rated pairs and
      0 elsewhere

    Each matrix, and the anime-by-anime cosine-similarity matrix derived from
    it, is reduced to 32 components with TruncatedSVD.

    Args:
        train_df: Frame with ``user_id``, ``anime_id`` and ``score`` columns.
        train_test_df: Frame with ``user_id`` and ``anime_id`` columns.

    Returns:
        ``(looked_features, score_features)``. Each frame has an ``anime_id``
        column plus 64 feature columns, named
        ``svd_anime_user_looked[_similarity]_*`` and
        ``svd_anime_user_score[_similarity]_*`` respectively.
    """
    components_num = 32
    svd = TruncatedSVD(n_components=components_num, random_state=0)

    def _svd_features(matrix, prefix):
        """Return SVD frames of `matrix` and of its row-wise cosine similarity."""
        # squareform() leaves the diagonal at 0, so a row's similarity to
        # itself is 0 here (kept as in the original feature definition).
        similarity = squareform(1 - pdist(matrix, 'cosine'))
        raw_df = pd.DataFrame(
            svd.fit_transform(matrix),
            columns=[f"{prefix}_{ix}" for ix in range(components_num)],
        )
        similarity_df = pd.DataFrame(
            svd.fit_transform(similarity),
            columns=[f"{prefix}_similarity_{ix}" for ix in range(components_num)],
        )
        return raw_df, similarity_df

    # --- which users have looked at each anime (train + test) ---
    looked = train_test_df.groupby("anime_id")["user_id"].apply(list).reset_index()
    looked_matrix = MultiLabelBinarizer().fit_transform(looked["user_id"])
    looked_df, looked_sim_df = _svd_features(looked_matrix, "svd_anime_user_looked")
    looked_features = pd.concat([looked["anime_id"], looked_df, looked_sim_df], axis=1)

    # --- how each anime was scored by each user (train only) ---
    # One vectorized scatter replaces a boolean pandas filter per (anime,
    # user) cell. Rows and columns are the sorted unique ids, the same order
    # the groupby / binarizer pair produced.
    anime_codes, anime_ids = pd.factorize(train_df["anime_id"], sort=True)
    user_codes, user_ids = pd.factorize(train_df["user_id"], sort=True)
    scores = train_df["score"].to_numpy()
    # Use the score dtype so that non-integer scores are not truncated.
    score_matrix = np.zeros((len(anime_ids), len(user_ids)), dtype=scores.dtype)
    # factorize() codes missing ids as -1; skip them rather than letting -1
    # address the last row or column.
    valid = (anime_codes >= 0) & (user_codes >= 0)
    # If an (anime, user) pair is duplicated, the last row wins.
    score_matrix[anime_codes[valid], user_codes[valid]] = scores[valid]

    score_df, score_sim_df = _svd_features(score_matrix, "svd_anime_user_score")
    score_features = pd.concat(
        [pd.Series(np.asarray(anime_ids), name="anime_id"), score_df, score_sim_df],
        axis=1,
    )

    return looked_features, score_features
    

def feature_create(train_df, test_df, train_test_df):
    """Attach aggregate and similarity features to the train and test frames.

    Every feature table is left-merged onto both frames, so an id missing
    from a table yields NaN.

    Args:
        train_df: Training rows, already joined with the anime metadata
            features.
        test_df: Test rows with the same feature columns.
        train_test_df: Concatenation of both, used for statistics that do not
            need the score.

    Returns:
        ``(train_df, test_df)`` as new frames carrying the extra columns.
    """
    summary_stats = ["mean", "median", "var"]

    def _merge_into_both(stats, key):
        """Left-merge `stats` (indexed or keyed by `key`) onto train and test."""
        nonlocal train_df, test_df
        train_df = train_df.merge(stats, how="left", on=key)
        test_df = test_df.merge(stats, how="left", on=key)

    # --- anime level: number of rows per anime across train + test ---
    rows_per_anime = (
        train_test_df.groupby("anime_id")["user_id"].agg(["count"]).add_prefix("user_id_")
    )
    _merge_into_both(rows_per_anime, "anime_id")
    train_test_df = train_test_df.merge(rows_per_anime, how="left", on="anime_id")

    # --- user level: number of anime rows per user across train + test ---
    rows_per_user = (
        train_test_df.groupby("user_id")["anime_id"].agg(["count"]).add_prefix("looked_anime_")
    )
    _merge_into_both(rows_per_user, "user_id")

    # Per-user summary of the audience-size columns of the anime the user has
    # in train; every status column also gets its per-member ratio summarised.
    audience_stats = []
    for c in ["members", "watching", "completed", "on_hold", "dropped", "plan_to_watch"]:
        audience_stats.append(
            train_df.groupby("user_id")[c].agg(summary_stats).add_prefix(f"per_user_{c}_")
        )
        if c != "members":
            audience_stats.append(
                train_df.groupby("user_id")[f"{c}_per_member"]
                .agg(summary_stats)
                .add_prefix(f"per_user_{c}_per_member_")
            )
    _merge_into_both(pd.concat(audience_stats, axis=1), "user_id")

    # Per-user summary of the per-anime row counts computed above.
    count_stats = (
        train_test_df.groupby("user_id")["user_id_count"]
        .agg(summary_stats)
        .add_prefix("user_id_count_")
    )
    _merge_into_both(count_stats, "user_id")

    # Per-user mean of every genre indicator column.
    ohe_genres_c = [c for c in train_test_df.columns if "ohe_genres" in c]
    genre_share = train_test_df.groupby("user_id")[ohe_genres_c].mean().add_prefix("user_mean_")
    _merge_into_both(genre_share, "user_id")

    # SVD embeddings of the user x anime matrices and their similarities.
    looked_features, score_features = user_similarity(train_df, train_test_df)
    _merge_into_both(looked_features, "user_id")
    _merge_into_both(score_features, "user_id")

    # The same embeddings with the roles of anime and user swapped.
    looked_features, score_features = anime_similarity(train_df, train_test_df)
    _merge_into_both(looked_features, "anime_id")
    _merge_into_both(score_features, "anime_id")

    return train_df, test_df

In [12]:
# Directory holding the competition CSVs; change this one constant to run the
# notebook outside Kaggle.
INPUT_DIR = "/kaggle/input/atmacup-15dataset"

train_df = pd.read_csv(os.path.join(INPUT_DIR, "train.csv"))
test_df = pd.read_csv(os.path.join(INPUT_DIR, "test.csv"))
anime_df = pd.read_csv(os.path.join(INPUT_DIR, "anime.csv"))

# Seen

In [13]:
# Feature pipeline. NOTE: this cell rebinds train_df / test_df with the merged
# results, so re-running it without reloading the CSVs merges the same
# columns a second time.

# 1) Anime metadata features, joined onto every train/test row by anime_id.
print("anime_features_create")
_anime_df = anime_feature_create(anime_df)
train_df = train_df.merge(_anime_df, how="left", on="anime_id")
test_df = test_df.merge(_anime_df, how="left", on="anime_id")
# 2) Anime2Vec embeddings: score-free (train + test) and score-weighted (train only).
print("anime_vec_create")
train_test_df = pd.concat([train_df, test_df]).reset_index(drop=True)
train_df, test_df = add_w2v_features_without_score(train_df, test_df, train_test_df)
train_df, test_df = add_w2v_features(train_df, test_df, consider_score=True)
# 3) User2Vec embeddings: score-weighted (train only) and score-free (train + test).
print("user_vec_create")
train_df, test_df = add_w2v_user_features(train_df, test_df, consider_score=True)
train_df, test_df = add_w2v_user_features_without_score(train_df, test_df, train_test_df)
# Disabled experiments: embeddings of other token columns. The first two calls
# omit the required `col` argument and would need it added before re-enabling.
# print("genres_vec_create")
# train_df, test_df = add_w2v_genres_features(train_df, test_df, train_test_df, consider_score=True)
# train_df, test_df = add_w2v_genres_features(train_df, test_df, train_test_df, consider_score=False)
# print("type_source_vec_create")
# train_df, test_df = add_w2v_genres_features(train_df, test_df, train_test_df, "type_and_source" ,consider_score=True)
# train_df, test_df = add_w2v_genres_features(train_df, test_df, train_test_df, "type_and_source" ,consider_score=False)
# 4) Aggregate and similarity features; train_test_df is rebuilt first so it
#    contains the columns added by the steps above.
print("features_create")
train_test_df = pd.concat([train_df, test_df]).reset_index(drop=True)
train_df, test_df = feature_create(train_df, test_df, train_test_df)

# The raw combined string is dropped; its label-encoded form
# (type_and_source_le) stays.
train_df = train_df.drop(["type_and_source"], axis=1)
test_df = test_df.drop(["type_and_source"], axis=1)

anime_features_create
anime_vec_create
user_vec_create
features_create


  0%|          | 0/1794 [00:00<?, ?it/s]

  0%|          | 0/1954 [00:00<?, ?it/s]

In [14]:
# Partition the test rows by whether their user also appears in train.
_is_seen_user = test_df["user_id"].isin(train_df["user_id"].unique())
test_seen_df = test_df[_is_seen_user].reset_index(drop=True)
test_unseen_df = test_df[~_is_seen_user].reset_index(drop=True)

In [15]:
# Persist the engineered train/test feature tables to the working directory
# (without the index column).
train_df.to_csv("train_df.csv", index=False)
test_df.to_csv("test_df.csv", index=False)
# The seen/unseen test subsets are currently not written out.
# test_seen_df.to_csv("test_seen_df.csv", index=False)
# test_unseen_df.to_csv("test_unseen_df.csv", index=False)

# UnSeen

# EDA

In [None]:
# fig, ax = plt.subplots(figsize=(12, 6))
# RainCloud(data=train_df, x="rating", y="score", ax=ax)

# fig.tight_layout()
# ax.grid()

In [None]:
# EDA scratch: rebuild the user x anime indicator matrix from train only
# (one row per user, one column per anime_id the binarizer has seen).
mlb = MultiLabelBinarizer()
user_anime_list = pd.DataFrame(train_df.groupby("user_id")["anime_id"].apply(list).reset_index())
matrix = mlb.fit_transform(user_anime_list["anime_id"])

In [None]:
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
# d = pdist(matrix, 'cosine')
# d = 1 - d
# d = squareform(d)