In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import warnings
from datetime import datetime
import random
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from typing import List
from tqdm import tqdm
warnings.filterwarnings(action="ignore")
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
def set_random_seed(seed:int) -> None:
    """Seed the global Python and NumPy RNGs and export seed-related environment variables.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed``. Its string
            form is also stored in ``PL_GLOBAL_SEED`` and ``PYTHONHASHSEED``.
    """
    # Seed both global generators first so a bad seed fails before the
    # environment is touched.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

    seed_text = str(seed)
    os.environ.update(
        {
            "CUBLAS_WORKSPACE_CONFIG": ":4096:2",
            "PL_GLOBAL_SEED": seed_text,
            "PYTHONHASHSEED": seed_text,
        }
    )

In [None]:
def extract_payd_period(x: List) -> List:
    """Return the day gaps between consecutive entries of ``x``.

    Every result element is ``(x[i] - x[i - 1]).days``, so the output has one
    element fewer than ``x`` and is empty when ``x`` holds fewer than two items.
    """
    return [(later - earlier).days for earlier, later in zip(x, x[1:])]

def extract_delay_day(x: List) -> int:
    """Sum, over every value in ``x`` greater than 31, the amount by which it exceeds 30."""
    return sum(gap - 30 for gap in x if gap > 31)
def extract_amount_delays(x: List) -> int:
    """Count the values in ``x`` that are greater than 31."""
    return sum(1 for gap in x if gap > 31)

def extract_amount_early(x: List) -> int:
    """Count the values in ``x`` that are smaller than 28."""
    return sum(1 for gap in x if gap < 28)

def extract_amount_intime(x: List) -> int:
    """Count the values in ``x`` that lie strictly between 27 and 32."""
    return sum(1 for gap in x if 27 < gap < 32)

def interpl_targ(df: pd.DataFrame) -> None:
    """Add a 0.0/1.0 ``culc_fild`` column to ``df`` in place.

    Side effects on ``df``:
        * ``ExpectedTermDate`` is parsed to datetime and pushed back by
          ``ceil(Term * 2)`` days.
        * ``LastPaymentDate`` is parsed to datetime and made timezone-naive.
        * ``culc_fild`` is 1.0 for a row when the floor-divided percentage
          ``100 * total // TotalContractValue`` is at least 60 and
          ``LastPaymentDate`` is earlier than the extended ``ExpectedTermDate``;
          otherwise 0.0. ``total`` is the sum of the row's ``PaymentsHistory``
          list plus the values of the row's last six columns.

    Args:
        df: Frame with ``ExpectedTermDate``, ``Term``, ``LastPaymentDate``,
            ``PaymentsHistory`` (a list per row) and ``TotalContractValue``.
    """
    window = 2
    # ``pd.np`` no longer exists in current pandas; call NumPy directly.
    extension = pd.to_timedelta(np.ceil(df.Term * window), unit="D")
    df["ExpectedTermDate"] = pd.to_datetime(df["ExpectedTermDate"]) + extension
    df["LastPaymentDate"] = pd.to_datetime(df["LastPaymentDate"]).dt.tz_localize(None)

    culc_fild = []
    for _, row in df.iterrows():
        # NOTE(review): the last six columns are presumably the m1..m6 targets.
        # ``create_features`` calls this function after appending its own
        # feature columns, in which case the last six columns are those
        # features instead — confirm which is intended.
        trailing_values = row.iloc[-6:].values.tolist()
        paid_total = sum(row.PaymentsHistory + trailing_values)
        paid_enough = (100 * paid_total // row.TotalContractValue) >= 60.
        before_deadline = row.LastPaymentDate < row.ExpectedTermDate
        culc_fild.append(float(paid_enough and before_deadline))

    df["culc_fild"] = culc_fild
    

def pad_history(df:pd.DataFrame, max_len:int=41):
    """Truncate or zero-pad every ``PaymentsHistory`` list in ``df`` to ``max_len`` items.

    The ``PaymentsHistory`` column is replaced in place. Histories longer than
    ``max_len`` keep their first ``max_len`` entries; shorter ones get ``0.``
    appended until they reach ``max_len``.

    Args:
        df: Frame whose ``PaymentsHistory`` column holds one list per row
            (already parsed, not the raw string form).
        max_len: Target length of every history list.
    """
    padded_payments = []

    for history in df["PaymentsHistory"]:
        # The values are lists here, so measure them directly; re-parsing them
        # as strings would fail.
        missing = max_len - len(history)
        if missing < 0:
            padded_payments.append(history[:max_len])
        else:
            padded_payments.append(history + missing * [0.])

    df["PaymentsHistory"] = padded_payments

In [None]:
def create_features(df: pd.DataFrame) -> None:
    """Add calendar, payment-gap and payment-amount features to ``df`` in place.

    Requirements visible from the code:
        * ``RegistrationDate``, ``LastPaymentDate`` and ``FirstPaymentDate``
          must already be datetime columns (the ``.dt`` accessor is used).
        * ``TransactionDates`` and ``PaymentsHistory`` must still be the raw
          string form of a Python list; ``TransactionDates`` items use the
          ``"%m-%Y"`` format.

    Columns written: ``RegisteredInLeapYear``, ``RegisteredAtMonthStart``,
    ``RegisteredAtMonthEnd``, ``LastPaymentMonth``, ``FirstPaymentMonth``,
    ``pay_period``, ``sum_delay_days``, ``amount_delay_days``,
    ``amount_early_pays``, ``amount_intime_pays`` and the mean/std/max/median/min
    of ``PaymentsHistory``. ``TransactionDates`` and ``PaymentsHistory`` are
    replaced by their parsed lists. Finally ``interpl_targ`` and
    ``pad_history`` are applied to ``df``.
    """
    # Function-scope import: ``ast`` is not imported at the top of the notebook.
    import ast

    registration = df.RegistrationDate.dt
    df['RegisteredInLeapYear'] = registration.is_leap_year.astype('float')
    df['RegisteredAtMonthStart'] = registration.is_month_start.astype('float')
    df['RegisteredAtMonthEnd'] = registration.is_month_end.astype('float')
    df['LastPaymentMonth'] = df.LastPaymentDate.dt.month
    df['FirstPaymentMonth'] = df.FirstPaymentDate.dt.month

    # ``ast.literal_eval`` only accepts Python literals, so text read from the
    # CSV cannot execute arbitrary code the way ``eval`` would allow.
    df['TransactionDates'] = df['TransactionDates'].apply(
        lambda raw: [datetime.strptime(item, "%m-%Y").date() for item in ast.literal_eval(raw)]
    )
    df['PaymentsHistory'] = df['PaymentsHistory'].apply(ast.literal_eval)

    # Day gaps between consecutive transaction dates, then counts derived from them.
    df['pay_period'] = df['TransactionDates'].apply(extract_payd_period)
    df['sum_delay_days'] = df['pay_period'].apply(extract_delay_day)
    df['amount_delay_days'] = df['pay_period'].apply(extract_amount_delays)
    df['amount_early_pays'] = df['pay_period'].apply(extract_amount_early)
    df['amount_intime_pays'] = df['pay_period'].apply(extract_amount_intime)

    # Per-row statistics of the payment amounts; the lambdas keep the NumPy
    # reductions element-wise (one list at a time).
    df['mean_payment'] = df['PaymentsHistory'].apply(lambda payments: np.mean(payments))
    df['std_payment'] = df['PaymentsHistory'].apply(lambda payments: np.std(payments))
    df['max_payment'] = df['PaymentsHistory'].apply(lambda payments: np.max(payments))
    df['median_payment'] = df['PaymentsHistory'].apply(lambda payments: np.median(payments))
    df['min_payment'] = df['PaymentsHistory'].apply(lambda payments: np.min(payments))

    interpl_targ(df)
    pad_history(df)

In [None]:
def preprocess_data(df: pd.DataFrame, skip_cols: List=None) -> None:
    """Cast object columns to ``category`` and ``*Date*`` columns to naive datetimes, in place.

    The first column of ``df`` is never touched. Among the remaining columns
    that are not listed in ``skip_cols``:
        * columns with object dtype whose name does not contain ``'Date'``
          become ``category``;
        * columns whose name contains ``'Date'`` are parsed with
          ``pd.to_datetime`` and stripped of timezone information.

    Args:
        df: Frame to convert in place.
        skip_cols: Column names to leave untouched; ``None`` means skip nothing.
    """
    # Treat the ``None`` default as "skip nothing" so the membership test works.
    skipped = [] if skip_cols is None else skip_cols
    cols = [col for col in df.columns[1:] if col not in skipped]
    cat_cols = [col for col in cols if df[col].dtype == 'O' and 'Date' not in col]
    date_cols = [col for col in cols if 'Date' in col]

    for col in tqdm(cat_cols, desc='Processing categorical columns \t'):
        df[col] = df[col].astype('category')

    # Convert date columns to timezone-naive datetimes.
    for col in tqdm(date_cols, desc='Processing datetime columns \t'):
        df[col] = pd.to_datetime(df[col]).dt.tz_localize(None)

In [None]:
# Label-encode the categorical columns with ONE shared vocabulary per column,
# so a given category gets the same integer code in ``train`` and in ``test``.
# Encoding each frame separately would number the categories independently,
# and the codes would not be comparable between the two frames.
# NOTE(review): ``train`` and ``test`` are loaded in cells further down the
# notebook; this cell only works after those have run.
cat_cols = ['rateTypeEntity', 'MainApplicantGender', 'Region', 'Town', 'Occupation']
for col in cat_cols:
    shared_categories = (
        pd.concat([train[col], test[col]], ignore_index=True)
        .astype('category')
        .cat.categories
    )
    # Missing values are encoded as -1, as ``.cat.codes`` does.
    train[col] = pd.Categorical(train[col], categories=shared_categories).codes
    test[col] = pd.Categorical(test[col], categories=shared_categories).codes

In [None]:
def filling_nulls(df: pd.DataFrame, reference: pd.DataFrame = None) -> None:
    """Fill missing ``Region`` and ``Age`` values of ``df`` in place.

    ``Region`` gaps receive the most frequent region and ``Age`` gaps receive
    the rounded mean age. Both statistics are computed on ``reference``, so a
    test frame can be filled with training statistics, e.g.
    ``filling_nulls(test, reference=train)``.

    Args:
        df: Frame to fill; only this frame is modified.
        reference: Frame the fill values are computed from. Defaults to ``df``.
    """
    source = df if reference is None else reference

    # Assign the filled column back instead of calling ``fillna(inplace=True)``
    # on a column selection, which does not update the frame under pandas
    # Copy-on-Write.
    region_mode = source['Region'].mode()
    if not region_mode.empty:  # an all-null column has no mode to fill with
        df['Region'] = df['Region'].fillna(region_mode[0])

    age_mean = source['Age'].mean()
    if pd.notna(age_mean):  # an all-null column has no mean to fill with
        df['Age'] = df['Age'].fillna(round(age_mean))

In [None]:
# Load the metadata table and preview its first rows rather than dumping the
# whole frame into the cell output.
metadata = pd.read_csv('../input/scoring/metadata.csv')
metadata.head()

In [None]:
# Read Train.csv and attach the metadata columns; the inner join on "ID" keeps
# only IDs that appear in both tables.
train = metadata.merge(pd.read_csv('../input/scoring/Train.csv'), on="ID", how='inner')

In [None]:
# Read Test.csv and attach the metadata columns; the right join on "ID" keeps
# every row of Test.csv even when no metadata row matches.
test = metadata.merge(pd.read_csv('../input/scoring/Test.csv'), on="ID", how='right')

In [None]:
# Remove the supplier, payment-method and raw date columns from both frames.
for frame in (train, test):
    frame.drop(
        columns=[
            'SupplierName',
            'UpsellDate',
            'PaymentMethod',
            'RegistrationDate',
            'ExpectedTermDate',
            'FirstPaymentDate',
            'LastPaymentDate',
        ],
        inplace=True,
    )

# The list-valued columns are dropped from ``train`` only; ``test`` keeps them
# until prediction time.
# NOTE(review): 'pay_period' is created by ``create_features``, which no
# visible cell calls on ``train`` — confirm the column exists at this point.
train.drop(columns=['TransactionDates', 'PaymentsHistory', 'pay_period'], inplace=True)

In [None]:
# Fix the global RNG seeds before splitting the data and training.
set_random_seed(seed=42)

In [None]:
# Targets are the six monthly columns m1..m6; features are everything else
# except the row identifier.
y = train[['m1', 'm2', 'm3', 'm4', 'm5', 'm6']]
X = train.drop(columns=['ID', *y.columns])

In [None]:
# Hold out part of the training data for validation. ``test_x`` / ``y_test``
# are this hold-out split, not the competition ``test`` frame.
# An explicit ``random_state`` keeps the split identical when this cell is
# re-run, instead of depending on the current state of the global NumPy RNG.
train_x, test_x, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Train one LightGBM regressor per target column and report its hold-out MAE.
lst_model = []
for target in y_train.columns:
    # A fresh estimator per target: re-fitting one shared instance would leave
    # ``lst_model`` holding six references to the same object, i.e. the model
    # fitted on the last target only.
    model = lgb.LGBMRegressor()
    model.fit(train_x, y_train[target])
    y_pred = model.predict(test_x)
    print(f'target is {target}')
    print(f'MAE is :{mean_absolute_error(y_test[target], y_pred)}')
    lst_model.append(model)

In [None]:
# Predict m1..m6 for the test frame, one fitted model per target.
# ``lst_feat`` lists the columns hidden from the models; each prediction column
# is added to it as soon as it is written, so later models never see it.
lst_feat = ['ID', 'PaymentsHistory', 'TransactionDates', 'pay_period']
for month, fitted_model in enumerate(lst_model, start=1):
    target_name = f'm{month}'
    y_pred = fitted_model.predict(test.drop(columns=lst_feat))
    test[target_name] = y_pred
    lst_feat.append(target_name)

In [None]:
# Build the submission file from the six prediction columns.
subm = pd.read_csv('../input/scoring/SampleSubmission.csv')
result = test[['m1', 'm2', 'm3', 'm4', 'm5', 'm6']]
# Row-major flattening: m1..m6 of the first test row, then of the second, ...
# NOTE(review): assumes SampleSubmission.csv lists its rows in that same order — confirm.
df1 = result.to_numpy().flatten()
subm['Target'] = df1
subm.to_csv('simple.csv', index=False)

In [None]:
# List the distinct supplier names.
# Observation: PaymentMethod and SupplierName are not useful as features.
metadata["SupplierName"].unique()


In [None]:
# Show the column labels of ``full_train``.
# NOTE(review): ``full_train`` is not assigned in any visible cell — confirm where it is defined.
full_train.columns

In [None]:
def convert_features(df):
    """Parse the four date columns of ``df`` to datetimes, in place.

    The columns ``RegistrationDate``, ``LastPaymentDate``, ``FirstPaymentDate``
    and ``ExpectedTermDate`` are each replaced by ``pd.to_datetime`` of
    themselves.
    """
    date_columns = (
        "RegistrationDate",
        "LastPaymentDate",
        "FirstPaymentDate",
        "ExpectedTermDate",
    )
    for column in date_columns:
        df[column] = pd.to_datetime(df[column])

In [None]:
def feature_extract(df, reference_time=None):
    """Add registration and payment calendar features to ``df`` in place.

    Columns written: ``RegisteredInLeapYear``, ``RegisteredAtMonthStart``,
    ``RegisteredAtMonthEnd`` (0.0/1.0 floats), ``LastPaymentMonth``,
    ``FirstPaymentMonth`` (month numbers) and ``Reg_Duratation``
    (``reference_time - RegistrationDate`` as a timedelta).

    Args:
        df: Frame whose ``RegistrationDate``, ``LastPaymentDate`` and
            ``FirstPaymentDate`` columns are already datetimes (the ``.dt``
            accessor is used).
        reference_time: Moment the registration duration is measured against.
            Defaults to ``datetime.now()``; pass a fixed value to make the
            feature reproducible between runs.
    """
    if reference_time is None:
        reference_time = datetime.now()

    registration = df.RegistrationDate.dt
    df['RegisteredInLeapYear'] = registration.is_leap_year.astype('float')
    df['RegisteredAtMonthStart'] = registration.is_month_start.astype('float')
    df['RegisteredAtMonthEnd'] = registration.is_month_end.astype('float')
    df['LastPaymentMonth'] = df.LastPaymentDate.dt.month
    df['FirstPaymentMonth'] = df.FirstPaymentDate.dt.month
    # The column name (including its spelling) is kept because other code may select it.
    df['Reg_Duratation'] = reference_time - df.RegistrationDate

In [None]:
# Parse the date columns of ``full_train``, then add the calendar features to it (both in place).
# NOTE(review): ``full_train`` is not assigned in any visible cell — confirm where it is defined.
convert_features(full_train)
feature_extract(full_train)

In [None]:
# Display ``full_test`` as the cell output.
# NOTE(review): ``full_test`` is not assigned in any visible cell — confirm where it is defined.
full_test