<a href="https://colab.research.google.com/github/K7chyp/sber/blob/main/sber.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

In [85]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime.
drive.mount('/content/drive')

# Read the three input tables in a fixed order: transactions, targets, test.
# NOTE(review): these paths point at the filesystem root rather than at the
# Drive folder mounted above -- confirm where the CSV files actually live.
transactions_train, train_target, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/transactions_train.csv', '/train_target.csv', '/test.csv')
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [73]:
class CommonStat():
  """Small collection of descriptive checks for a single DataFrame.

  The frame handed to the constructor is stored on ``self.df`` by reference
  (no copy is made) and every method reads from it.
  """

  def __init__(self, df):
    self.df = df

  def get_stat(self):
    """Return the ``shape`` attribute of the wrapped frame."""
    return self.df.shape

  def check_missing(self, output_path=None):
    """Summarise missing values per column.

    Args:
      output_path: optional string prefix. When given, the summary is also
        written to ``output_path + 'missing.csv'`` (plain string
        concatenation, so a directory prefix must end with a separator) and
        the resulting file path is printed.

    Returns:
      A DataFrame with one row per column of ``self.df`` (row labels are
      converted to ``str``) and two columns: ``'total missing'`` (count of
      nulls) and ``'proportion'`` (mean of the null mask).
    """
    null_mask = self.df.isnull()
    result = pd.concat([null_mask.sum(), null_mask.mean()], axis=1)
    result = result.rename(index=str, columns={0: 'total missing', 1: 'proportion'})
    if output_path is not None:
      # Build the destination once so the path that is printed is exactly
      # the path that was written (previously the two parts were printed
      # as separate arguments, which inserted a space between them).
      target = output_path + 'missing.csv'
      result.to_csv(target)
      print(target)
    return result

  def report(self):
    """Build a ``ProfileReport`` titled "Report" for the wrapped frame.

    NOTE(review): ``ProfileReport`` is not imported anywhere in the visible
    part of this notebook; it must be brought into scope (presumably from a
    profiling package -- confirm which one) before this method is called,
    otherwise it raises ``NameError``.
    """
    profile = ProfileReport(self.df, title="Report")
    return profile
  

In [37]:
class Errors():
  """Regression error metrics implemented with NumPy.

  The four metric methods accept any array-like pair (lists, tuples, NumPy
  arrays, pandas objects). Inputs are converted to float arrays first, so
  plain Python lists work and unsigned-integer inputs cannot wrap around
  when subtracted. Values are therefore compared by position.
  """

  @staticmethod
  def mae(y_true, y_pred):
    """Mean absolute error: absolute errors averaged over axis 0, then overall."""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.average(np.average(np.abs(y_pred - y_true), axis=0))

  @staticmethod
  def mad(y_true, y_pred):
    """Median absolute error: median over axis 0, then averaged across outputs."""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.average(np.median(np.abs(y_pred - y_true), axis=0))

  @staticmethod
  def mse(y_true, y_pred, squared=True):
    """Mean squared error.

    Args:
      squared: when True (default) return the MSE; when False return its
        square root (RMSE).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mse = np.average(np.average((y_true - y_pred) ** 2, axis=0))
    return mse if squared else np.sqrt(mse)

  @staticmethod
  def mape(y_true, y_pred):
    """Mean absolute percentage error, expressed in percent.

    Entries where ``y_true`` is 0 are divided by zero; NumPy then produces
    inf/nan (with a runtime warning) and the result is not finite.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

  @staticmethod
  def avg(series):
    """Arithmetic mean of a sized iterable.

    Raises:
      ZeroDivisionError: if ``series`` is empty.
    """
    # Builtin sum starts from 0 and adds each element in order, exactly like
    # the explicit accumulation loop it replaces.
    return sum(series) / len(series)


In [87]:
class Preprocessing(Errors):
  """Builds the per-client training table from raw transactions.

  On construction the class:
    1. aggregates ``amount_rur`` per ``client_id`` (sum/mean/std/min/max),
    2. counts transactions per (``client_id``, ``small_group``) and pivots
       the counts into one ``small_group_<value>`` column per group,
    3. merges both feature tables onto ``train_target`` by ``client_id``.

  The results are available as ``agg_features``, ``cat_counts_train`` and
  ``train``.
  """

  def __init__(self,
               transactions_train=transactions_train,
               train_target=train_target,
               test=test):
    # The defaults are the module-level frames as they were bound when this
    # class statement was executed; re-run the class cell after reloading
    # the data, or pass the frames explicitly.
    self.transactions_train = transactions_train
    self.train_target = train_target
    self.test = test
    self.agg_features = self.calculate_aggregations()
    self.cat_counts_train = self.counter_df()
    self.train = self.merge_df()

  def calculate_aggregations(self):
    """Aggregate ``amount_rur`` per client.

    Returns:
      DataFrame with columns ``client_id``, ``sum``, ``mean``, ``std``,
      ``min`` and ``max``; also stored on ``self.agg_features``.
    """
    # Read from the instance attribute rather than the notebook global so
    # that a frame passed to the constructor is actually the one aggregated.
    self.agg_features = (
        self.transactions_train
        .groupby('client_id')['amount_rur']
        .agg(['sum', 'mean', 'std', 'min', 'max'])
        .reset_index()
    )
    return self.agg_features

  def counter_df(self):
    """Count transactions per client and ``small_group``.

    Returns:
      DataFrame indexed by ``client_id`` with one ``small_group_<value>``
      column per group; combinations that never occur are filled with 0.
      Also stored on ``self.cat_counts_train``; the intermediate long-format
      counts are kept on ``self.counter_df_train``.
    """
    self.counter_df_train = (
        self.transactions_train
        .groupby(['client_id', 'small_group'])['amount_rur']
        .count()
    )
    self.cat_counts_train = self.counter_df_train.reset_index().pivot(
        index='client_id',
        columns='small_group',
        values='amount_rur',
    )
    self.cat_counts_train = self.cat_counts_train.fillna(0)
    self.cat_counts_train.columns = [
        'small_group_' + str(group) for group in self.cat_counts_train.columns
    ]
    return self.cat_counts_train

  def merge_df(self):
    """Join targets with both feature tables on ``client_id``.

    ``pd.merge`` is called with its default join type (inner), so clients
    missing from either feature table are dropped from the result.

    Returns:
      The merged training frame; also stored on ``self.train``.
    """
    self.train = pd.merge(self.train_target, self.agg_features, on='client_id')
    self.train = pd.merge(self.train, self.cat_counts_train.reset_index(), on='client_id')
    return self.train

  def split(self):
    """Select the columns shared by train and test and the training target.

    The matching slice of the test frame is kept on ``self.X_test`` (it was
    previously computed and then discarded).

    Returns:
      ``(X_train, y_train)`` where ``X_train`` holds the columns present in
      both ``self.train`` and ``self.test`` and ``y_train`` is the ``bins``
      column of ``self.train``.
    """
    # Keep the training frame's column order instead of iterating a set:
    # set order for strings can change between interpreter runs, which made
    # the feature order non-reproducible.
    test_columns = set(self.test.columns)
    common_features = [col for col in self.train.columns if col in test_columns]

    y_train = self.train['bins']
    X_train = self.train[common_features]
    self.X_test = self.test[common_features]

    return X_train, y_train

In [None]:
class Model(Preprocessing): 
  
  def param_search(self)