<a href="https://colab.research.google.com/github/K7chyp/sber/blob/main/sber.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

In [48]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime so the CSV files below are readable.
drive.mount('/content/drive')

# Transaction-level records; later cells group them by client_id.
transactions_train = pd.read_csv(
    '/content/drive/MyDrive/contests/sber/transactions_train.csv'
)
# Target table; later merged with the engineered features on client_id.
train_target = pd.read_csv(
    '/content/drive/MyDrive/contests/sber/train_target.csv'
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [73]:
class CommonStat:
  """Quick descriptive helpers around a single DataFrame.

  The wrapped frame is stored as ``self.df`` and is never modified by the
  methods of this class.
  """

  def __init__(self, df):
    self.df = df

  def get_stat(self):
    """Return the ``shape`` attribute of the wrapped frame."""
    return self.df.shape

  def check_missing(self, output_path=None):
    """Summarise missing values per column.

    Args:
      output_path: optional string prefix. When given, the summary is also
        written to ``output_path + 'missing.csv'`` (plain string
        concatenation, no path separator is inserted).

    Returns:
      DataFrame indexed by the stringified column names with two columns:
      'total missing' (null count) and 'proportion' (null fraction).
    """
    null_mask = self.df.isnull()
    summary = pd.concat([null_mask.sum(), null_mask.mean()], axis=1)
    summary = summary.rename(
        index=str,
        columns={0: 'total missing', 1: 'proportion'},
    )
    if output_path is None:
      return summary
    summary.to_csv(output_path + 'missing.csv')
    print(output_path, 'missing.csv')
    return summary

  def report(self):
    """Build a profiling report titled "Report" for the wrapped frame.

    NOTE(review): ``ProfileReport`` is not imported in any visible cell; it
    has to be in the kernel namespace already or this raises NameError.
    """
    return ProfileReport(self.df, title="Report")
  

In [37]:
class Errors():
  """Collection of regression error metrics implemented with numpy.

  All metric methods are static. Plain Python lists and tuples are converted
  to numpy arrays before use; any other input (ndarray, pandas objects, ...)
  is passed through untouched, so its own arithmetic semantics still apply.
  """

  @staticmethod
  def _as_arrays(y_true, y_pred):
    """Turn list/tuple arguments into ndarrays; leave other types as they are."""
    if isinstance(y_true, (list, tuple)):
      y_true = np.asarray(y_true)
    if isinstance(y_pred, (list, tuple)):
      y_pred = np.asarray(y_pred)
    return y_true, y_pred

  @staticmethod
  def mae(y_true, y_pred):
    """Mean absolute error: absolute errors averaged along axis 0, then overall."""
    y_true, y_pred = Errors._as_arrays(y_true, y_pred)
    return np.average(np.average(np.abs(y_pred - y_true), axis=0))

  @staticmethod
  def mad(y_true, y_pred):
    """Median of the absolute errors along axis 0, averaged over what remains."""
    y_true, y_pred = Errors._as_arrays(y_true, y_pred)
    return np.average(np.median(np.abs(y_pred - y_true), axis=0))

  @staticmethod
  def mse(y_true, y_pred, squared=True):
    """Mean squared error.

    Args:
      y_true: observed values.
      y_pred: predicted values.
      squared: when True return the mean squared error itself; when False
        return its square root.
    """
    y_true, y_pred = Errors._as_arrays(y_true, y_pred)
    mse = np.average(np.average((y_true - y_pred) ** 2, axis=0))
    return mse if squared else np.sqrt(mse)

  @staticmethod
  def mape(y_true, y_pred):
    """Mean absolute percentage error, expressed in percent.

    The error is divided by ``y_true`` element-wise, so zeros in ``y_true``
    are not handled specially here.
    """
    y_true, y_pred = Errors._as_arrays(y_true, y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

  @staticmethod
  def avg(series):
    """Arithmetic mean of a sized iterable.

    Uses the built-in ``sum`` and ``len``; an empty input therefore fails on
    the division by zero.
    """
    return sum(series) / len(series)


In [83]:
class Preprocessing(Errors): 
  """Build per-client features from transactions and join them to the target.

  On construction two feature tables are computed from the transactions frame:
  ``agg_features`` (summary statistics of 'amount_rur' per 'client_id') and
  ``cat_counts_train`` (transaction counts per 'client_id' and 'small_group').
  ``merge_df`` then joins both onto the target frame.
  """

  # The defaults are the module-level frames as they exist when this class
  # statement runs; re-run the cell after reloading the data to pick up new ones.
  def __init__(self, 
               transactions_train=transactions_train, 
               train_target=train_target): 
    """Store the input frames and compute both feature tables.

    Args:
      transactions_train: frame with at least 'client_id', 'small_group' and
        'amount_rur' columns.
      train_target: frame with at least a 'client_id' column.
    """
    self.transactions_train = transactions_train
    self.train_target = train_target
    self.agg_features = self.calculate_aggregations()
    self.cat_counts_train = self.counter_df()
  
  def calculate_aggregations(self): 
    """Compute sum/mean/std/min/max of 'amount_rur' for every 'client_id'.

    Returns:
      DataFrame with 'client_id' as a regular column plus one column per
      statistic; also stored as ``self.agg_features``.
    """
    # Use the frame held by this instance, not the notebook-level global, so
    # that a frame passed to the constructor is actually the one aggregated.
    self.agg_features = (
        self.transactions_train
        .groupby('client_id')['amount_rur']
        .agg(['sum', 'mean', 'std', 'min', 'max'])
        .reset_index()
    )
    return self.agg_features
  
  def counter_df(self):
    """Count transactions per client and 'small_group', one column per group.

    Returns:
      DataFrame indexed by 'client_id' with columns named
      'small_group_<value>'; client/group pairs that never occur are 0.
      Also stored as ``self.cat_counts_train``.
    """
    self.counter_df_train = (
        self.transactions_train
        .groupby(['client_id', 'small_group'])['amount_rur']
        .count()
    )
    # Long -> wide: one row per client, one column per small_group value.
    self.cat_counts_train = self.counter_df_train.reset_index().pivot(
        index='client_id',
        columns='small_group',
        values='amount_rur',
    )
    # Pairs absent from the data come out of the pivot as NaN.
    self.cat_counts_train = self.cat_counts_train.fillna(0)
    self.cat_counts_train.columns = [
        'small_group_' + str(i) for i in self.cat_counts_train.columns
    ]
    return self.cat_counts_train
   
  
  def merge_df(self): 
    """Join target, aggregate features and category counts on 'client_id'.

    The merged frame is stored as ``self.train``.

    Returns:
      The first rows (``head()``) of the merged frame, for display.
    """
    self.train = pd.merge(self.train_target, self.agg_features, on='client_id')
    self.train = pd.merge(
        self.train,
        self.cat_counts_train.reset_index(),
        on='client_id',
    )
    return self.train.head()

  


Unnamed: 0,client_id,bins,sum,mean,std,min,max,small_group_0,small_group_1,small_group_2,small_group_3,small_group_4,small_group_5,small_group_6,small_group_7,small_group_8,small_group_9,small_group_10,small_group_11,small_group_12,small_group_13,small_group_14,small_group_15,small_group_16,small_group_17,small_group_18,small_group_19,small_group_20,small_group_21,small_group_22,small_group_23,small_group_24,small_group_25,small_group_26,small_group_27,small_group_28,small_group_29,small_group_30,small_group_31,small_group_32,...,small_group_162,small_group_163,small_group_164,small_group_165,small_group_166,small_group_167,small_group_168,small_group_169,small_group_170,small_group_171,small_group_172,small_group_173,small_group_174,small_group_175,small_group_176,small_group_177,small_group_178,small_group_179,small_group_180,small_group_181,small_group_182,small_group_183,small_group_184,small_group_185,small_group_186,small_group_187,small_group_188,small_group_189,small_group_190,small_group_191,small_group_192,small_group_193,small_group_195,small_group_196,small_group_197,small_group_198,small_group_199,small_group_200,small_group_202,small_group_203
0,24662,2,30254.011,34.774725,72.037354,0.074,1227.314,0.0,174.0,2.0,64.0,33.0,0.0,0.0,0.0,1.0,3.0,0.0,92.0,365.0,0.0,0.0,11.0,0.0,0.0,20.0,0.0,0.0,4.0,3.0,3.0,9.0,16.0,4.0,0.0,4.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1046,0,42548.57,52.015367,106.540962,0.55,1210.506,1.0,187.0,61.0,47.0,13.0,1.0,0.0,0.0,2.0,8.0,1.0,27.0,3.0,0.0,1.0,79.0,0.0,0.0,142.0,0.0,2.0,0.0,4.0,2.0,5.0,4.0,3.0,0.0,6.0,1.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,34089,2,26842.816,34.325852,59.92745,0.043,782.641,0.0,372.0,0.0,72.0,37.0,10.0,0.0,0.0,0.0,17.0,0.0,47.0,9.0,0.0,0.0,49.0,15.0,1.0,6.0,0.0,2.0,2.0,1.0,5.0,26.0,21.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,34848,1,15773.126,16.16099,14.224936,0.043,109.59,0.0,359.0,1.0,0.0,41.0,0.0,0.0,0.0,0.0,38.0,0.0,116.0,0.0,0.0,0.0,306.0,0.0,0.0,45.0,0.0,1.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,3.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,47076,3,12488.375,15.92905,35.473591,0.432,541.165,0.0,378.0,0.0,150.0,44.0,0.0,0.0,0.0,0.0,122.0,0.0,33.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,2.0,8.0,31.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
