In [None]:
# Print the Linux distribution details of the runtime (IPython `!` shell escape).
!lsb_release -a

In [None]:
!pip install catboost

# Import

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

import lightgbm as lgb
from catboost import CatBoostRegressor,Pool
from catboost.eval.catboost_evaluation import *
from sklearn.model_selection import train_test_split
# import shap


from google.colab import drive
# Mount Google Drive; the CSV inputs used below are read from paths under /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Preprocessing Data

In [None]:
def generate_ap_id(df):
    """Parse the release time and multi-hot encode artist / composer ids.

    The input frame is modified in place for ``release_time``, ``year``,
    ``artist_id`` and ``composers_id``; the returned frame additionally
    carries one ``artist_<id>`` / ``composers_<id>`` count column per id.

    Args:
        df: DataFrame with ``release_time`` (parseable by ``pd.to_datetime``)
            and string columns ``artist_id`` / ``composers_id`` whose ids are
            separated by '.', ',' or spaces.

    Returns:
        A new DataFrame: ``df`` concatenated column-wise with the id columns.
    """
    df['release_time'] = pd.to_datetime(df['release_time'])
    df['year'] = df['release_time'].dt.year

    def _split_ids(raw_ids):
        """Turn '12.34,56' style strings into lists of id tokens."""
        # regex=False: '.' must be the literal dot, never the regex wildcard.
        return (raw_ids.str.replace('.', ' ', regex=False)
                       .str.replace(',', ' ', regex=False)
                       .str.split(' '))

    def _multi_hot(id_lists, prefix):
        """Count, per row, how often each id occurs; one column per id."""
        stacked = id_lists.apply(pd.Series).stack()
        # groupby(level=0) replaces the `.sum(level=0)` shortcut, which was
        # deprecated in pandas 1.3 and removed in 2.0.
        counts = pd.get_dummies(stacked).groupby(level=0, sort=False).sum()
        return counts.add_prefix(prefix)

    df['artist_id'] = _split_ids(df['artist_id'])
    df['composers_id'] = _split_ids(df['composers_id'])

    temp_artist = _multi_hot(df['artist_id'], 'artist_')
    temp_composer = _multi_hot(df['composers_id'], 'composers_')

    return pd.concat([df, temp_artist, temp_composer], axis=1)

def generate_main_df_features(df):
    """Derive title / artist / composer features on the output of ``generate_ap_id``.

    Adds (in place) the columns ``International_Musics``, ``is_self_compose``,
    ``lower_title``, ``beat``, ``lk``, ``remix``, ``artist_newest``,
    ``artist_oldest``, ``composer_newest`` and ``composer_oldest``; adjusts
    ``number_composers``; and strips the two placeholder composer ids from the
    ``composers_id`` lists.

    Args:
        df: DataFrame with ``title``, ``number_composers``, list-valued
            ``artist_id`` / ``composers_id`` and the ``composers_<id>``
            indicator columns.

    Returns:
        The same DataFrame, with the columns above added or updated.
    """
    # Placeholder composer ids: 100103 marks international music and 101978
    # marks "various composers" (as labelled by the original code).
    international_id = '100103'
    various_id = '101978'

    def _has_composer(composer_id):
        """Boolean mask of rows whose composer indicator column equals 1."""
        column = 'composers_' + composer_id
        if column in df.columns:
            return df[column] == 1
        # The id never occurs in this frame, so no row carries it.
        return pd.Series(False, index=df.index)

    # International music: flag the row and do not count the placeholder.
    international = _has_composer(international_id)
    df['International_Musics'] = international.astype(int)
    df.loc[international, 'number_composers'] = \
        df.loc[international, 'number_composers'] - 1

    # Self-composed: the artist id list equals the composer id list.
    df['is_self_compose'] = (df['artist_id'] == df['composers_id']).astype(int)

    # Various composers: the real count is unknown, use a fixed value of 10.
    df.loc[_has_composer(various_id), 'number_composers'] = 10

    # Keyword flags taken from the lower-cased title.
    df['lower_title'] = df['title'].str.lower()
    for column, keyword in (('beat', 'beat'), ('lk', 'liên khúc'), ('remix', 'remix')):
        df[column] = df['lower_title'].str.contains(keyword, na=False).astype(int)

    # Ids are digit strings: compare them as integers, otherwise '918' would
    # rank above '18241' (lexicographic order).
    df['artist_newest'] = df['artist_id'].apply(
        lambda ids: max(int(token) for token in ids)).astype(int)
    df['artist_oldest'] = df['artist_id'].apply(
        lambda ids: min(int(token) for token in ids)).astype(int)

    placeholders = {international_id, various_id}

    def _real_composers(ids):
        """Drop the placeholder ids; non-list values (e.g. NaN) pass through."""
        if not isinstance(ids, list):
            return ids
        return [token for token in ids if token not in placeholders]

    df['composers_id'] = df['composers_id'].apply(_real_composers)

    def _composer_extreme(ids, pick):
        """``pick`` (max/min) over the numeric composer ids; 0 when there are none."""
        if not isinstance(ids, list):
            return 0
        numeric = [int(token) for token in ids if token.strip().isdigit()]
        return pick(numeric) if numeric else 0

    df['composer_newest'] = df['composers_id'].apply(lambda ids: _composer_extreme(ids, max))
    df['composer_oldest'] = df['composers_id'].apply(lambda ids: _composer_extreme(ids, min))

    return df

def generate_metadata_features(df):
    """Derive album / lyric features from the audio-file metadata columns.

    Adds (in place) ``is_in_album``, ``lyric``, ``album_various_artist``,
    ``is_old_song`` and ``num_word``.

    Args:
        df: DataFrame with the columns ``album``, ``USLT::eng`` (lyrics text),
            ``albumartist``, ``year`` and ``date``.

    Returns:
        The same DataFrame with the feature columns added.
    """
    import re

    # 1 when the song has an album tag, 0 otherwise.
    df['is_in_album'] = df['album'].notna().astype(int)

    # 1 when lyrics are present, 0 otherwise.
    df['lyric'] = df['USLT::eng'].notna().astype(int)

    # 1 when the album is credited to "Various Artists".
    df['album_various_artist'] = (df['albumartist'] == 'Various Artists').astype(int)

    # Old song: the release year is later than the metadata `date`.
    # The flag goes into `is_old_song`; `year` itself must stay untouched.
    df['is_old_song'] = (df['year'] > df['date']).astype(int)

    line_breaks = re.compile('[\r\n]+')
    punctuation = re.compile(r'[+\-"():\\\/\^<>\.,;!?]')

    def countNumDifferentWord(lyric):
        """Number of distinct whitespace-separated words; 0 for missing lyrics."""
        if not isinstance(lyric, str):
            return 0
        # NOTE(review): line breaks are removed without inserting a space, so
        # the last word of a line is glued to the first word of the next one —
        # confirm this is intended.
        text = line_breaks.sub('', lyric.lower())
        text = punctuation.sub(' ', text)
        return len(set(text.split()))

    df['num_word'] = df['USLT::eng'].apply(countNumDifferentWord)

    return df

def transformData(df):
    """Run the three feature stages in order: ids, main features, metadata features."""
    with_ids = generate_ap_id(df)
    with_main_features = generate_main_df_features(with_ids)
    return generate_metadata_features(with_main_features)

In [None]:
PATH = r'/content/drive/My Drive/zaloai/finaldata/'

# --- Load the individual training tables --------------------------------
df = pd.read_csv(PATH + 'train_origin.csv')
df_mp3 = pd.read_csv(PATH + 'train_pyaudio.csv')

meta_columns = ['ID','audio_offset', 'filesize', 'duration','genre_encode']
df_meta = pd.read_csv(PATH + 'train_metadata.csv')[meta_columns]

df_full_train_meta = (
    pd.read_csv(PATH + 'minh_train_metadata.csv', index_col=0)
      .drop(columns = ['artist','composer','title'])
)
# The value 2917 in `date` is replaced by 2017.
df_full_train_meta.loc[df_full_train_meta['date'] == 2917,'date'] = 2017.0

df_train_origin = pd.read_csv(r'/content/drive/My Drive/ZaloAI/Dataset/train_info.tsv', sep = '\t')
df_train_artist_id = pd.read_csv(PATH + 'train_artist_id.csv')
df_train_rank = pd.read_csv(PATH + 'train_rank.csv')

# --- Left-join everything onto the original info table, keyed by ID ------
df = pd.merge(df_train_origin, df, on='ID', how='left')
for right_table in (df_mp3, df_meta, df_train_artist_id, df_full_train_meta, df_train_rank):
    df = pd.merge(df, right_table, on='ID', how='left')

In [None]:
# Feature engineering on the merged training frame.
# NOTE: `df` is rebound to the transformed frame, so re-running this cell feeds
# already-transformed data back into transformData.
df = transformData(df)

In [None]:
# Per-(album, release_time) label statistics, joined onto the training frame.
# NOTE(review): `list_label` presumably holds training labels per album; if it
# includes a row's own label, these statistics leak the target — confirm.
df_album = pd.read_csv('/content/drive/My Drive/zaloai/by_album_release_time.csv')


def parse_label_list(text):
    """Parse a '[a, b, c]' string into a list of ints, dropping zero entries."""
    return [label for label in map(int, text[1:-1].split(',')) if label != 0]


def safe_stat(values, stat):
    """Apply ``stat`` to ``values``; NaN for an empty list (no numpy warning or error)."""
    return stat(values) if len(values) else np.nan


def quant(x,q):
    """q-th quantile of ``x``; NaN when ``x`` is empty."""
    return safe_stat(x, lambda values: np.quantile(values, q))


df_album['list_label'] = df_album['list_label'].apply(parse_label_list)
df_album['std_album']  = df_album['list_label'].apply(lambda labels: safe_stat(labels, np.std))
df_album['mean_album'] = df_album['list_label'].apply(lambda labels: safe_stat(labels, np.mean))
df_album['quantile_50_album'] = df_album['list_label'].apply( lambda x: quant(x, .50) )
df_album['quantile_25_album'] = df_album['list_label'].apply( lambda x: quant(x, .25) )

df_album['release_time'] = pd.to_datetime(df_album['release_time'])
temp = df_album  # alias kept from the original cell

# Join on the calendar day: drop the time-of-day part of the song's release time.
df['release_time'] = df['release_time'].dt.normalize()

# Drop album columns left over from a previous run of this cell so that a
# re-run does not produce *_x / *_y duplicates.
album_feature_columns = ['list_label', 'std_album', 'mean_album',
                         'quantile_50_album', 'quantile_25_album']
df = pd.merge(df.drop(columns=album_feature_columns, errors='ignore'),
              temp, on=['release_time','album'], how = 'left')

  keepdims=keepdims)
  out=out, **kwargs)


# PipeLine 

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler,Normalizer,RobustScaler, Imputer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.pipeline import FeatureUnion, Pipeline 

class FeatureSelector( BaseEstimator, TransformerMixin ):
    """Transformer that keeps a fixed list of columns of a DataFrame.

    Args:
        feature_names: column labels to select in ``transform``.
    """

    def __init__( self, feature_names ):
        # scikit-learn's get_params()/clone()/repr look up an attribute named
        # exactly like each __init__ argument, so store it under that name.
        self.feature_names = feature_names

    @property
    def _feature_names( self ):
        """Backward-compatible alias for ``feature_names``."""
        return self.feature_names

    @_feature_names.setter
    def _feature_names( self, value ):
        self.feature_names = value

    def fit( self, X, y = None ):
        """Nothing to learn; returns ``self``."""
        return self

    def transform( self, X, y = None ):
        """Return ``X`` restricted to the configured columns."""
        return X[ self.feature_names ]

class TransfromInt(BaseEstimator, TransformerMixin ):
    """Rebuild a labelled DataFrame from an array and cast the categorical columns to int.

    Args:
        attributes: column labels for the incoming array, in order.
        categorical_attributes: subset of ``attributes`` to cast to ``int``.
    """

    def __init__( self, attributes,categorical_attributes ):
        self.categorical_attributes = categorical_attributes
        self.attributes = attributes

    def fit( self, X, y = None ):
        """Stateless transformer: fitting only returns ``self``."""
        return self

    def transform( self, X, y = None ):
        """Wrap ``X`` in a DataFrame and integer-cast the categorical columns."""
        frame = pd.DataFrame(X, columns = self.attributes)
        cat_columns = self.categorical_attributes
        frame[cat_columns] = frame[cat_columns].astype(int)
        return frame

In [None]:
def PipeLine(continuous_attributes,categorical_attributes,attributes):
    """Build the preprocessing pipeline shared by the CatBoost models.

    Categorical columns are passed through unchanged; continuous columns are
    standard-scaled and then mean-imputed. Both parts are concatenated
    (categorical first) and turned back into a DataFrame by ``TransfromInt``.

    Args:
        continuous_attributes: names of the continuous columns.
        categorical_attributes: names of the categorical columns.
        attributes: column names of the combined output, categorical first.

    Returns:
        An unfitted ``sklearn.pipeline.Pipeline``.
    """
    Continous_Pipeline = Pipeline([('continous selector', FeatureSelector(continuous_attributes)),
                                   ('std_scaler', StandardScaler()),
                                   # SimpleImputer replaces the deprecated and
                                   # later removed preprocessing.Imputer; its
                                   # default strategy is also the column mean.
                                   ('Imputer', SimpleImputer()),
                                   ])
    Cat_Pipeline = Pipeline([('cat selector', FeatureSelector(categorical_attributes)),
                             ])
    Preprocessing = Pipeline([('Feature Union', FeatureUnion([('categorical_pipeline', Cat_Pipeline),
                                                              ('numerical_pipeline', Continous_Pipeline),
                                                              ])),
                              ('Transfrom DF', TransfromInt(attributes,categorical_attributes)),
                              ])
    return Preprocessing

# Catboost

## Train 

### LightGBM

In [None]:
# y_train = train_df['label']
# X_train = train_df.drop(columns = ['label'])

# X_train = Preprocessing.fit_transform(X_train)

# lgb_params = {'application':'regression_l2',
#               'metric':'l2_root',
#               'learning_rate': 0.005976053757896968,
#               'lambda_l1': 1.0568823110348258, 
#               'lambda_l2': 1.55326839435241,
#               'num_leaves': 59,
#               'feature_fraction': 0.6107130496167985, 
#               'bagging_fraction': 0.21707765621212027,
#               'bagging_frequent': 10,
#               'min_split_gain': 3.3313398631164086e-07,
#               'min_child_weight': 32.47447995484354,
#               'seed': 251}



# train_set=lgb.Dataset(X_train.values, 
#                       label=y_train,
#                       feature_name=attributes ,
#                       categorical_feature=categorical_attributes,
#                       )

# model = lgb.train(lgb_params,
#                   train_set,
#                   valid_sets = [train_set],
#                   num_boost_round=300,
#                   verbose_eval= 100,
#                   categorical_feature=categorical_attributes)

### Catboost Model 1 With Album Information 

In [None]:
# Feature lists for model 1 (includes the album statistics) and its training frame.

# Mean-valued audio descriptors.
mp3_attributes_1 = [
    'zcr_mean', 'energy_mean', 'energy_entropy_mean',
    'spectral_centroid_mean', 'spectral_spread_mean', 'spectral_entropy_mean',
    'spectral_flux_mean', 'spectral_rolloff_mean',
]

# Columns handed to CatBoost as categorical features
# ('artist_id_encode' and 'frequence_artist' are currently disabled).
categorical_attributes_1 = [
    'Min',
    'number_artists', 'number_composers',
    'number_punctuations_encode', 'length_title_not_punc_encode',
    'Hour_encode', 'DayofWeek_encode', 'Month', 'Day',
    'genre_encode',
    'International_Musics', 'is_self_compose',
    'beat', 'lk', 'remix',
    'is_in_album', 'lyric', 'album_various_artist', 'is_old_song',
]

# Continuous columns ('composer_newest' / 'composer_oldest' are currently disabled).
continuous_attributes_1 = [
    'ID', 'duration', 'audio_offset', 'filesize', 'num_word',
    'artist_newest', 'artist_oldest',
    'summary_mean', 'quantile_25', 'quantile_50', 'adapt_mean', 'std_all',
    'std_album', 'mean_album', 'quantile_50_album', 'quantile_25_album',
] + mp3_attributes_1

# Categorical columns first: later cells address them by position 0..n-1.
attributes_1 = categorical_attributes_1 + continuous_attributes_1

# Keep only complete rows, then mark the categorical columns as such.
train_df = (
    df[attributes_1 + ['label']]
    .dropna()
    .reset_index(drop = True)
    .astype({name: 'category' for name in categorical_attributes_1})
)

In [None]:
# y = train_df['label']

# X_train, X_valid, y_train, y_valid = train_test_split(train_df, y, 
#                                                       random_state = 0,
#                                                       test_size=0.2,)

# X_train, X_valid, y_train, y_valid = train_test_split(df, y, 
#                                                       random_state = 0,
#                                                       test_size=0.2,)

# Fit the preprocessing pipeline and CatBoost model 1 on the whole training
# frame (no hold-out split is made in this cell).
preprocessing_model_1 = PipeLine(continuous_attributes_1,
                                 categorical_attributes_1,
                                 attributes_1)

y_train = train_df['label']
X_train = preprocessing_model_1.fit_transform(train_df.drop(columns = ['label']))

# attributes_1 lists the categorical columns first, so their positions are 0..n-1.
categorical_features_indices = list(range(len(categorical_attributes_1)))

model_1_params = {
    'loss_function': 'RMSE',
    'random_seed': 42,
    'iterations': 500,
    'od_wait': 1,
}
model_1 = CatBoostRegressor(**model_1_params)

model_1.fit(X_train, y_train,
            cat_features=categorical_features_indices,
            verbose=100);




0:	learn: 2.7777270	total: 85.5ms	remaining: 42.7s
100:	learn: 0.9833179	total: 2.5s	remaining: 9.89s
200:	learn: 0.9181715	total: 4.75s	remaining: 7.07s
300:	learn: 0.8858669	total: 7.2s	remaining: 4.76s
400:	learn: 0.8575346	total: 9.79s	remaining: 2.42s
499:	learn: 0.8392431	total: 12s	remaining: 0us


### Catboost Model 2 Without Album Information 

In [None]:
# Feature lists for model 2 (no album statistics) and its training frame.

# Mean-valued audio descriptors.
mp3_attributes_2 = [
    'zcr_mean', 'energy_mean', 'energy_entropy_mean',
    'spectral_centroid_mean', 'spectral_spread_mean', 'spectral_entropy_mean',
    'spectral_flux_mean', 'spectral_rolloff_mean',
]

# Columns handed to CatBoost as categorical features
# ('artist_id_encode' and 'frequence_artist' are currently disabled).
categorical_attributes_2 = [
    'Min',
    'number_artists', 'number_composers',
    'number_punctuations_encode', 'length_title_not_punc_encode',
    'Hour_encode', 'DayofWeek_encode', 'Month', 'Day',
    'genre_encode',
    'International_Musics', 'is_self_compose',
    'beat', 'lk', 'remix',
    'is_in_album', 'lyric', 'album_various_artist', 'is_old_song',
]

continuous_attributes_2 = [
    'ID', 'duration', 'audio_offset', 'filesize', 'num_word',
    'artist_newest', 'artist_oldest',
    'summary_mean', 'quantile_25', 'quantile_50', 'adapt_mean', 'std_all',
] + mp3_attributes_2

# Categorical columns first: later cells address them by position 0..n-1.
attributes_2 = categorical_attributes_2 + continuous_attributes_2

# Keep only complete rows, then mark the categorical columns as such.
train_df = (
    df[attributes_2 + ['label']]
    .dropna()
    .reset_index(drop = True)
    .astype({name: 'category' for name in categorical_attributes_2})
)

In [None]:
# Fit the preprocessing pipeline and CatBoost model 2 on the whole training
# frame (no evaluation set is passed to fit).
preprocessing_model_2 = PipeLine(continuous_attributes_2,
                                 categorical_attributes_2,
                                 attributes_2)

y_train = train_df['label']
X_train = preprocessing_model_2.fit_transform(train_df.drop(columns = ['label']))

# attributes_2 lists the categorical columns first, so their positions are 0..n-1.
categorical_features_indices = list(range(len(categorical_attributes_2)))

model_2_params = {
    'loss_function': 'RMSE',
    'random_seed': 42,
    'iterations': 10000,
    'od_wait': 1,
}
model_2 = CatBoostRegressor(**model_2_params)

model_2.fit(X_train, y_train,
            cat_features=categorical_features_indices,
            verbose=100);


# Bayesian Optimization 

In [None]:
# !pip install bayesian_optimization

In [None]:
!pip install optuna

### Optuna 

In [None]:
from sklearn.model_selection import StratifiedKFold

# 5-fold cross-validation of a small CatBoost configuration on the model-2
# feature set. The per-fold pipeline and model use CV-only names so that the
# fitted `preprocessing_model_2` / `model_2` used for the test predictions
# are not overwritten by this cell.
y = train_df['label']
X = train_df.drop(columns = ['label'])

# attributes_2 lists the categorical columns first, so their positions are 0..n-1.
categorical_features_indices = list(range(len(categorical_attributes_2)))

# No random_state: without shuffle=True it has no effect on the splits, and
# newer scikit-learn versions reject that combination with a ValueError.
skf = StratifiedKFold(n_splits=5)

score = []
iteration = []

params = {
        'loss_function':'RMSE',
        'iterations' : 100,
        'early_stopping_rounds': 20,
        'learning_rate': 0.1,
        'depth' : 2,
        'leaf_estimation_iterations' : 2,
        'l2_leaf_reg': 10,
        'bagging_temperature' :10,
        'random_seed' : 0,
    }

for train_index, test_index in skf.split(X, y):
    # split() yields positional indices, hence iloc.
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_valid, y_valid = X.iloc[test_index], y.iloc[test_index]

    # Fresh pipeline per fold: scaling/imputation statistics come from the
    # fold's training part only.
    cv_preprocessing = PipeLine(continuous_attributes_2,
                                categorical_attributes_2,
                                attributes_2)
    X_train = cv_preprocessing.fit_transform(X_train)
    X_valid = cv_preprocessing.transform(X_valid)

    cv_model = CatBoostRegressor(**params)
    cv_model.fit(
        X_train, y_train,
        cat_features=categorical_features_indices,
        eval_set=(X_valid, y_valid),
        verbose=100,
    );

    score.append(cv_model.best_score_['validation']['RMSE'])
    iteration.append(cv_model.best_iteration_)

# Mean validation RMSE over the folds (displayed as the cell result).
np.mean(score)


Class Imputer is deprecated; Imputer was deprecated in version 0.20 and will be removed in 0.22. Import impute.SimpleImputer from sklearn instead.



0:	learn: 2.7352251	test: 2.7264187	best: 2.7264187 (0)	total: 5.5ms	remaining: 544ms
Stopped by overfitting detector  (20 iterations wait)

bestTest = 1.524819722
bestIteration = 44

Shrink model to first 45 iterations.
0:	learn: 2.7350328	test: 2.7229819	best: 2.7229819 (0)	total: 5.48ms	remaining: 542ms
99:	learn: 1.5089196	test: 1.5161075	best: 1.5161075 (99)	total: 390ms	remaining: 0us

bestTest = 1.516107466
bestIteration = 99

0:	learn: 2.7297099	test: 2.7261196	best: 2.7261196 (0)	total: 5.36ms	remaining: 531ms
Stopped by overfitting detector  (20 iterations wait)

bestTest = 1.562473124
bestIteration = 53

Shrink model to first 54 iterations.
0:	learn: 2.7303296	test: 2.7446578	best: 2.7446578 (0)	total: 5.34ms	remaining: 529ms
Stopped by overfitting detector  (20 iterations wait)

bestTest = 1.816469363
bestIteration = 26

Shrink model to first 27 iterations.
0:	learn: 2.7262717	test: 2.7335046	best: 2.7335046 (0)	total: 5.33ms	remaining: 527ms
Stopped by overfitting detector

In [None]:
# Optuna objective for tuning CatBoost on the model-2 feature set.
# This cell no longer rebinds `preprocessing_model_2`: the fitted pipeline used
# for the test predictions stays intact, and every fold builds its own pipeline
# (trials may run in parallel, so a shared one would be refit concurrently).
y = train_df['label']
X = train_df.drop(columns = ['label'])

# No random_state: without shuffle=True it has no effect on the splits, and
# newer scikit-learn versions reject that combination with a ValueError.
skf = StratifiedKFold(n_splits=5)


def objective(trial):
    """Return the mean 5-fold validation RMSE for one sampled parameter set.

    Args:
        trial: the Optuna trial used to sample the CatBoost hyper-parameters.

    Returns:
        Mean of the per-fold best validation RMSE values (lower is better).
    """
    params = {
        'loss_function':'RMSE',
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.1),
        'depth' : trial.suggest_int('depth',2,5),
        'leaf_estimation_iterations' : trial.suggest_int('leaf_estimation_iterations',2,10),
        'l2_leaf_reg': trial.suggest_uniform('l2_leaf_reg',0.01, 10),
        'bagging_temperature' : trial.suggest_uniform('bagging_temperature', 0, 30),
        'random_seed' : trial.suggest_int('seed',0,100),
        'iterations' : 100000,
        'early_stopping_rounds': 2000,
    }

    fold_scores = []

    for train_index, test_index in skf.split(X, y):
        # split() yields positional indices, hence iloc.
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_valid, y_valid = X.iloc[test_index], y.iloc[test_index]

        fold_preprocessing = PipeLine(continuous_attributes_2,
                                      categorical_attributes_2,
                                      attributes_2)
        X_train = fold_preprocessing.fit_transform(X_train)
        X_valid = fold_preprocessing.transform(X_valid)

        fold_model = CatBoostRegressor(**params)
        fold_model.fit(
            X_train, y_train,
            cat_features=categorical_features_indices,
            eval_set=(X_valid, y_valid),
            verbose=0,
        );

        fold_scores.append(fold_model.best_score_['validation']['RMSE'])

    return np.mean(fold_scores)


Class Imputer is deprecated; Imputer was deprecated in version 0.20 and will be removed in 0.22. Import impute.SimpleImputer from sklearn instead.



In [None]:
# Hyper-parameter search: minimise the value returned by `objective` over 100 trials.
import optuna
study = optuna.create_study(direction='minimize')
# n_jobs=-1 asks Optuna to run trials in parallel.
study.optimize(objective,n_jobs= -1,
               n_trials=100)

# Best objective value found and the parameters that produced it.
print(study.best_value)
print(study.best_params)

In [None]:
# with open(r'/content/drive/My Drive/optuna_baseline.plk','rb') as f:
#         study = pickle.load(f)

In [None]:
# for i in range(10):
#     study.optimize(objective,n_jobs= -1,
#                n_trials=50)
#     with open(r'/content/drive/My Drive/optuna_baseline.plk','wb') as f:
#         pickle.dump(study,f)

# Test Data transform 

In [None]:
PATH = r'/content/drive/My Drive/zaloai/finaldata/'

# --- Load the individual test tables (test_tempo.csv is not loaded) -------
df_test = pd.read_csv(PATH + 'test_origin.csv')
df_test_mp3 = pd.read_csv(PATH + 'test_pyaudio.csv')

meta_columns = ['ID','audio_offset', 'filesize', 'duration','genre_encode']
df_test_meta = pd.read_csv(PATH + 'test_metadata.csv')[meta_columns]

df_full_test_meta = (
    pd.read_csv(PATH + 'minh_test_metadata.csv', index_col=0)
      .drop(columns = ['artist','composer','title'])
)
# The value 2917 in `date` is replaced by 2017.
df_full_test_meta.loc[df_full_test_meta['date'] == 2917,'date'] = 2017.0

df_test_origin = pd.read_csv(r'/content/drive/My Drive/ZaloAI/Dataset/test_info.tsv', sep = '\t')
df_test_artist_id = pd.read_csv(PATH + 'test_artist_id.csv')

# --- Left-join everything onto the original info table, keyed by ID ------
df_test = pd.merge(df_test_origin, df_test, on='ID', how='left')
for right_table in (df_test_mp3, df_test_meta, df_test_artist_id, df_full_test_meta):
    df_test = pd.merge(df_test, right_table, on='ID', how='left')

In [None]:
# Same feature engineering as for the training frame.
# NOTE: `df_test` is rebound to the transformed frame, so re-running this cell
# feeds already-transformed data back into transformData.
df_test = transformData(df_test)

In [None]:
# Preview only the first rows instead of rendering the whole (very wide) frame.
df_test.head()

Unnamed: 0,ID,title,artist_name,artist_id,composers_name,composers_id,release_time,number_artists,number_composers,number_artists_encode,number_composers_encode,number_punctuations,number_punctuations_encode,length_title,length_title_encode,length_title_not_punc,length_title_not_punc_encode,artist_id_encode,composer_id_encode,Month,Hour,Day,Min,Hour_encode,DayofWeek_encode,frequence_artist,frequence_composer,zcr_0.05,zcr_0.25,zcr_0.5,zcr_0.75,zcr_0.95,zcr_mean,zcr_std,energy_0.05,energy_0.25,energy_0.5,energy_0.75,energy_0.95,energy_mean,...,composers_72141,composers_72757,composers_72991,composers_73021,composers_73069,composers_734,composers_7351,composers_73859,composers_74750,composers_74960,composers_75230,composers_76149,composers_7625,composers_76464,composers_76499,composers_7673,composers_77751,composers_7907,composers_8032,composers_8033,composers_8885,composers_8978,composers_9000,composers_9145,composers_9436,International_Musics,is_self_compose,lower_title,beat,lk,remix,artist_newest,artist_oldest,composer_newest,composer_oldest,is_in_album,lyric,album_various_artist,is_old_song,num_word
0,1073994292,Giấc Mơ Mình Em,Minh Tuyết,[455],Minh Vy,[100019],2017-11-01 18:27:00,1,1,0,0,0,0,4,4,4,4,22,0,11,18,1,27,18,2,4,3,0.013155,0.026763,0.037197,0.051486,0.085507,0.042913,0.030663,0.001338,0.009668,0.028805,0.059728,0.128421,0.041139,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,giấc mơ mình em,0,0,0,455,455,100019,100019,1,1,0,0,107
1,1074250503,Rồi 30 Năm Qua,Tâm Đoan,[518],Nhật Ngân,[100218],2017-10-01 22:07:00,1,1,0,0,0,0,4,4,4,4,0,121,10,22,1,7,22,6,1,4,0.019959,0.034248,0.047403,0.064414,0.097301,0.052697,0.029796,0.008071,0.040929,0.062233,0.083227,0.111469,0.061592,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,rồi 30 năm qua,0,0,0,518,518,100218,100218,1,1,1,0,118
2,1074254709,Hòa Bình Hoan Ca,"Hùng Phú, Duy Linh","[18241, 918]",Huy Liêu,[430572],2017-11-07 23:47:00,2,1,1,0,0,0,4,4,4,4,0,0,11,23,7,47,23,1,1,2,0.025856,0.046496,0.064641,0.093445,0.161488,0.075084,0.042369,0.010760,0.030465,0.046991,0.061946,0.082017,0.046472,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,hòa bình hoan ca,0,0,0,918,18241,430572,430572,0,0,0,0,0
3,1074254735,Chia Tay Ngày Hè,Xuân Trường,[11740],Huy Liêu,[430572],2017-11-07 23:38:00,1,1,0,0,0,0,4,4,4,4,0,0,11,23,7,38,23,1,1,2,0.014743,0.022908,0.030846,0.043547,0.082264,0.038319,0.031216,0.004407,0.018863,0.032738,0.045828,0.068755,0.033847,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,chia tay ngày hè,0,0,0,11740,11740,430572,430572,0,0,0,0,0
4,1074326411,Xin Gọi Nhau Là Cố Nhân,Quang Lê,[828],Song Ngọc,[100288],2017-10-01 21:33:00,1,1,0,0,0,0,6,6,6,6,46,237,10,21,1,33,21,6,4,7,0.023588,0.037197,0.048991,0.063053,0.104786,0.056383,0.040681,0.006854,0.028503,0.048914,0.073441,0.122355,0.054067,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,xin gọi nhau là cố nhân,0,0,0,828,828,100288,100288,1,1,1,0,103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1113,1078495269,Đâu Chỉ Mình Em (#DCME),EMOI,[650851],EMOI,[650851],2018-12-29 00:31:00,1,1,0,0,3,1,7,7,5,5,0,0,12,0,29,31,0,5,1,1,0.010836,0.026881,0.040633,0.056470,0.088143,0.045433,0.032652,0.002708,0.013327,0.032750,0.072262,0.103781,0.043120,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,đâu chỉ mình em (#dcme),0,0,0,650851,650851,650851,650851,1,1,0,0,224
1114,1078495586,Tim Vỡ,Song Hải,[650627],Song Hải,[650627],2018-12-29 01:29:00,1,1,0,0,0,0,2,2,2,2,0,0,12,1,29,29,1,5,1,1,0.019506,0.032207,0.044455,0.058063,0.085507,0.048085,0.026906,0.000724,0.011443,0.025020,0.043134,0.074988,0.029563,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,tim vỡ,0,0,0,650627,650627,650627,650627,1,0,0,0,0
1115,1078495706,Lô Tô Cô Ú Biển Số Xe Miền Tây,Cô Ú,[618837],Cao Minh Thu,[100384],2018-12-29 01:36:00,1,1,0,0,0,0,9,0,9,0,0,0,12,1,29,36,1,5,1,2,0.028964,0.047093,0.060429,0.076058,0.127777,0.066538,0.034458,0.015267,0.044777,0.066117,0.084958,0.113738,0.065372,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,lô tô cô ú biển số xe miền tây,0,0,0,618837,618837,100384,100384,0,1,0,0,288
1116,1078502537,Chỉ Cần,Dion,[653359],Dion,[653359],2018-12-30 01:36:00,1,1,0,0,0,0,2,2,2,2,0,0,12,1,30,36,1,6,1,1,0.017237,0.042640,0.059878,0.078249,0.157178,0.073794,0.071419,0.000112,0.002721,0.008382,0.031153,0.059929,0.018233,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,chỉ cần,0,0,0,653359,653359,653359,653359,1,1,0,0,163


In [None]:
# Per-(album, release_time) label statistics, joined onto the test frame.
df_album = pd.read_csv('/content/drive/My Drive/zaloai/by_album_release_time.csv')


def parse_label_list(text):
    """Parse a '[a, b, c]' string into a list of ints, dropping zero entries."""
    return [label for label in map(int, text[1:-1].split(',')) if label != 0]


def safe_stat(values, stat):
    """Apply ``stat`` to ``values``; NaN for an empty list (no numpy warning or error)."""
    return stat(values) if len(values) else np.nan


def quant(x,q):
    """q-th quantile of ``x``; NaN when ``x`` is empty."""
    return safe_stat(x, lambda values: np.quantile(values, q))


df_album['list_label'] = df_album['list_label'].apply(parse_label_list)
df_album['std_album']  = df_album['list_label'].apply(lambda labels: safe_stat(labels, np.std))
df_album['mean_album'] = df_album['list_label'].apply(lambda labels: safe_stat(labels, np.mean))
df_album['quantile_50_album'] = df_album['list_label'].apply( lambda x: quant(x, .50) )
df_album['quantile_25_album'] = df_album['list_label'].apply( lambda x: quant(x, .25) )

df_album['release_time'] = pd.to_datetime(df_album['release_time'])
temp = df_album  # alias kept from the original cell

# Join on the calendar day: drop the time-of-day part of the song's release time.
df_test['release_time'] = df_test['release_time'].dt.normalize()

# Drop album columns left over from a previous run of this cell so that a
# re-run does not produce *_x / *_y duplicates.
album_feature_columns = ['list_label', 'std_album', 'mean_album',
                         'quantile_50_album', 'quantile_25_album']
df_test = pd.merge(df_test.drop(columns=album_feature_columns, errors='ignore'),
                   temp, on=['release_time','album'], how = 'left')

  keepdims=keepdims)
  out=out, **kwargs)


### Heuristic Prediction

In [None]:
# Albums whose known labels barely vary (0 < std < 1): predict the album's mean label.
# .copy() makes an independent frame, so adding 'predict' is not a write into a
# slice of df_test (the source of the SettingWithCopyWarning).
heuristic_prediction = df_test[(df_test['std_album'] > 0) & (df_test['std_album'] < 1)].copy()
heuristic_prediction['predict'] = heuristic_prediction['list_label'].apply(np.mean)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# Inspect the heuristic predictions (song ID and predicted value).
heuristic_prediction[['ID','predict']]

Unnamed: 0,ID,predict
1,1074250503,7.571429
17,1075810920,3.000000
24,1075811202,7.142857
25,1075811205,7.142857
26,1075811206,7.142857
...,...,...
1070,1078344863,6.500000
1082,1078363566,9.000000
1084,1078372306,2.000000
1094,1078441699,4.833333


### Model_1 with Album Attribute

In [None]:
# Model 1 scores the songs whose album labels vary a lot (std_album > 1).
model_1_predict = df_test[df_test['std_album'] > 1]

# astype returns a new frame, so no value is written into a slice of df_test
# (the source of the SettingWithCopyWarning).
test_df_model_1_prediction = model_1_predict[attributes_1].astype(
    {att: 'category' for att in categorical_attributes_1})

test_temp_1 = preprocessing_model_1.transform(test_df_model_1_prediction)
pred = model_1.predict(test_temp_1)

df_model_1_prediction = pd.DataFrame({'ID' : test_df_model_1_prediction['ID'],
                                      'predict' : pred})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [None]:
# Sanity check: submission rows with an implausibly small ID.
# The mask is built from df_sub itself; a mask taken from df_test has a
# different index/length and does not line up with df_sub's rows.
# NOTE: df_sub is only created in the "Submission" section further down, so
# this cell has to be run after that one.
df_sub[df_sub['ID'] < 10000]

Unnamed: 0,ID,predict


### Model_2 without Album Attribute

In [None]:
# Model 2 scores every song that has no heuristic prediction.
# (IDs predicted by model 1 are not excluded: model 1 is left out of the submission.)
predicted_id = list(heuristic_prediction['ID'].values)
model_2_predict = df_test[~df_test['ID'].isin(predicted_id)]

# astype returns a new frame, so no value is written into a slice of df_test
# (the source of the SettingWithCopyWarning).
test_df_model_2_prediction = model_2_predict[attributes_2].astype(
    {att: 'category' for att in categorical_attributes_2})

test_temp_2 = preprocessing_model_2.transform(test_df_model_2_prediction)
pred = model_2.predict(test_temp_2)

df_model_2_prediction = pd.DataFrame({'ID' : test_df_model_2_prediction['ID'],
                                      'predict' : pred})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


# Submission 

In [None]:
# Final submission frame: the heuristic album-mean predictions plus the model_2
# predictions for the remaining songs, ordered by ID.
df_sub = pd.concat([heuristic_prediction[['ID','predict']],
                    # df_model_1_prediction,
                    df_model_2_prediction,], axis = 0).sort_values('ID')

In [None]:
# round predict < 1 to 1 
# Snap out-of-range predictions: anything below 1 becomes 1 ...
below_minimum = df_sub['predict'] < 1
df_sub.loc[below_minimum, 'predict'] = 1

# ... and anything above 9.7 becomes 10.
near_maximum = df_sub['predict'] > 9.7
df_sub.loc[near_maximum, 'predict'] = 10

In [None]:
# Write the submission as a CSV without header row and without the index column.
df_sub.to_csv('submission_31_catboost.csv',
                index= False, 
                header=False,
                index_label=False)

# Test 

In [None]:
# from catboost import Catboost
# NOTE(review): `model` is not defined in the cells visible in this notebook
# (only `model_1` / `model_2` are) — confirm which model is meant to be saved.
model.save_model(r'/content/drive/My Drive/2019-22-11_1PM_catboost_1.676')

In [None]:
# Reload the train / test artist-id tables for inspection.
# NOTE: the variable name `csv` is also the name of a standard-library module.
csv = pd.read_csv(r'/content/drive/My Drive/zaloai/finaldata/train_artist_id.csv')#   names = ['ID','sub'])
csv_test = pd.read_csv(r'/content/drive/My Drive/zaloai/finaldata/test_artist_id.csv')#   names = ['ID','sub'])

In [None]:
# np.mean(csv['sub'] - pred)
# Only the last expression (`csv_test`) is displayed; the bare `csv` line has no effect.
csv
csv_test

In [None]:
# Preview of df_test left-joined with the test artist-id table; the result is not stored.
pd.merge(df_test,csv_test,on = 'ID',how='left')

In [None]:
# NOTE(review): `model` is not defined in the cells visible in this notebook
# (only `model_1` / `model_2` are) — confirm which model is meant.
model.get_feature_importance()

In [None]:
# Pair each column of X with its CatBoost feature importance.
# NOTE(review): `model` is not defined in the visible cells, and X / y here are
# whatever an earlier cell left behind — confirm the intended inputs.
feature_score = pd.DataFrame(list(zip(X.dtypes.index, model.get_feature_importance(Pool(X, label=y, cat_features=categorical_features_indices)))),
                columns=['Feature','Score'])

In [None]:
# Rebuild the preprocessed training matrix and compute feature importances on it.
# NOTE(review): `categorical_attributes`, `Preprocessing`, `attributes` and `model`
# are not defined in the cells visible in this notebook (the visible names carry
# a `_1` / `_2` suffix) — confirm which feature set and model are meant.
X = train_df.drop(columns = ['label'])
y = train_df['label']

categorical_features_indices = list(range(len(categorical_attributes)))

X = Preprocessing.transform(X)
X = pd.DataFrame(X,columns = attributes)
X[categorical_attributes] = X[categorical_attributes].astype(int)

model.get_feature_importance(Pool(X, label=y, cat_features=categorical_features_indices))

In [None]:
# Pair each column of X with its CatBoost feature importance, highest score first.
# NOTE(review): `model` is not defined in the cells visible in this notebook — confirm.
feature_score = pd.DataFrame(list(zip(X.dtypes.index, model.get_feature_importance(Pool(X, label=y, cat_features=categorical_features_indices)))),
                columns=['Feature','Score'])

feature_score = feature_score.sort_values(by='Score', ascending=False, inplace=False, kind='quicksort', na_position='last')

In [None]:
# Bar chart of the feature importances, each bar annotated with its rounded score.
plt.rcParams["figure.figsize"] = (12,7)

ax = feature_score.plot('Feature', 'Score', kind='bar', color='c')
ax.set_title("Catboost Feature Importance Ranking", fontsize = 14)
ax.set_xlabel('')

rounded_scores = feature_score['Score'].round(2)
for bar, value in zip(ax.patches, rounded_scores):
    bar_center = bar.get_x() + bar.get_width()/2
    # Place the label slightly above the top of the bar.
    ax.text(bar_center, bar.get_height() + 0.35, value, ha='center', va='bottom')

plt.show()

In [None]:
# SHAP values from CatBoost on the validation data.
# NOTE(review): `model` is not defined in the cells visible in this notebook, and
# X_valid / y_valid are whatever the last cross-validation fold left behind — confirm.
shap_values = model.get_feature_importance(Pool(X_valid, label=y_valid,cat_features=categorical_features_indices), 
                                                                     type="ShapValues")
# The last column is split off as the expected value; the remaining columns are kept as per-feature values.
expected_value = shap_values[0,-1]
shap_values = shap_values[:,:-1]


In [None]:
# NOTE(review): `import shap` is commented out in the import cell, so `shap` is
# undefined on a fresh kernel. Also, SHAP row 3 is plotted against feature row 1 —
# confirm the intended row.
shap.initjs()
shap.force_plot(expected_value, shap_values[3,:], X_valid.iloc[1,:])

In [None]:
# NOTE(review): `import shap` is commented out in the import cell, so `shap` is
# undefined on a fresh kernel.
shap.summary_plot(shap_values, X_valid)

# New 

In [None]:
# Load the two album-level tables (by album + release time, and by album only).
df_album_1 = pd.read_csv('/content/drive/My Drive/zaloai/by_album_release_time.csv')
df_album_2 = pd.read_csv('/content/drive/My Drive/zaloai/by_album.csv')

In [None]:
# Left-join the two album tables on `album` and dump the result to temp.csv.
pd.merge(df_album_1,df_album_2,on = 'album', how = 'left').to_csv('temp.csv')

In [None]:
# Recompute the album statistics (std_album falls back to -1 here) and rebuild
# the heuristic predictions on the test frame.
df_album = pd.read_csv('/content/drive/My Drive/zaloai/by_album_release_time.csv')


def parse_label_list(text):
    """Parse a '[a, b, c]' string into a list of ints, dropping zero entries."""
    return [label for label in map(int, text[1:-1].split(',')) if label != 0]


def safe_stat(values, stat):
    """Apply ``stat`` to ``values``; NaN for an empty list (no numpy warning or error)."""
    return stat(values) if len(values) else np.nan


def quant(x,q):
    """q-th quantile of ``x``; NaN when ``x`` is empty."""
    return safe_stat(x, lambda values: np.quantile(values, q))


df_album['list_label'] = df_album['list_label'].apply(parse_label_list)
df_album['std_album'] = df_album['list_label'].apply(lambda labels: safe_stat(labels, np.std)).fillna(-1)
df_album['mean_album'] = df_album['list_label'].apply(lambda labels: safe_stat(labels, np.mean))
df_album['quantile_50_album'] = df_album['list_label'].apply( lambda x: quant(x, .50) )
df_album['quantile_25_album'] = df_album['list_label'].apply( lambda x: quant(x, .25) )

df_album['release_time'] = pd.to_datetime(df_album['release_time'])
temp = df_album  # alias kept from the original cell

# Join on the calendar day: drop the time-of-day part of the song's release time.
df_test['release_time'] = df_test['release_time'].dt.normalize()

# df_test may already carry album columns from the earlier album merge; drop
# them first, otherwise the merge yields std_album_x / std_album_y and the
# `df_test['std_album']` lookup below raises a KeyError.
album_feature_columns = ['list_label', 'std_album', 'mean_album',
                         'quantile_50_album', 'quantile_25_album']
df_test = pd.merge(df_test.drop(columns=album_feature_columns, errors='ignore'),
                   temp, on=['release_time','album'], how = 'left')

# .copy(): adding 'predict' must not write into a slice of df_test.
heuristic_prediction = df_test[(df_test['std_album'] > 0) & (df_test['std_album'] < 1)].copy()
heuristic_prediction['predict'] = heuristic_prediction['list_label'].apply(np.mean)

In [None]:
# Preview only the first rows instead of rendering the whole (very wide) frame.
df_test.head()

In [None]:
# Songs left for the model: everything without a heuristic prediction.
# (`heuristic` was never defined; the frame is named `heuristic_prediction`.)
model_predict = df_test[~df_test['ID'].isin(heuristic_prediction['ID'])]

In [None]:
# Load two earlier headerless submission files as (ID, pred) tables.
csv_1 = pd.read_csv(r'/content/sub_26.csv',names = ['ID','pred'])
csv_2 = pd.read_csv(r'/content/submission_29_catboost.csv',names = ['ID','pred'])

In [None]:
# Root-mean-square difference between the two submissions, compared row by row.
# NOTE(review): assumes both files list the same IDs in the same order — confirm.
np.sqrt(np.mean((csv_1['pred'].values - csv_2['pred'].values)**2 ))