# CLASS 2

In [2]:
import pandas as pd
import datetime
import numpy as np
FILE_PATH = "data.csv"

In [3]:
dataframe = pd.read_csv(FILE_PATH, sep='|', low_memory=False)

In [4]:
#Number of Rows
dataframe.shape[0]

238616

In [5]:
#Number of Columns
dataframe.shape[1]

77

In [6]:
#Number of Columns from Saving Account
dataframe.filter(regex='SavingAccount_*').shape[1]

29

In [7]:
#Number of Months
dataframe["Month"].nunique()

9

In [8]:
#Number of Clients
dataframe["client_id"].nunique()

26561

In [9]:
#Take only clients with 9 Months of Data. Why?
#We take clients with recent activity because those are relevant to our universe definition. 
#If we added clients with less month of data that could affect the model later on.
#We use this amount (9) because of this particular use case but it will always depend in the case at hand.
clients_with_nine_months = dataframe.groupby("client_id").filter(lambda x: len(x.Month.unique()) == 9).client_id.drop_duplicates()
dataframe_with_nine_month_clients = dataframe[dataframe.client_id.isin(clients_with_nine_months)].drop_duplicates()

dataframe_with_nine_month_clients

Unnamed: 0,client_id,Target,Month,First_product_dt,Last_product_dt,CreditCard_Premium,CreditCard_Active,CreditCard_CoBranding,Loan_Active,Mortgage_Active,...,CreditCard_Payment_External,CreditCard_Payment_Cash,CreditCard_Payment_Web,CreditCard_Payment_ATM,CreditCard_Payment_TAS,Investment_Numbers,Mobile,Email,Region,CreditCard_Product
0,5856970,1.0,2018-10-01,2013-10-23,2019-01-10,No,Yes,No,No,No,...,0.0,0.0,0.0,0.0,0.0,1.0,Yes,Yes,,
1,6371753,0.0,2018-09-01,2015-07-29,2018-06-02,No,No,No,No,No,...,0.0,0.0,0.0,0.0,0.0,0.0,Yes,No,,
2,5928737,0.0,2019-01-01,2016-08-31,2018-12-27,No,No,No,No,No,...,0.0,0.0,0.0,0.0,0.0,0.0,Yes,Yes,,
3,475064,0.0,2018-12-01,2014-07-13,2017-11-30,No,Yes,No,No,No,...,0.0,0.0,0.0,0.0,0.0,0.0,Yes,Yes,,
4,3615172,0.0,2018-09-01,2017-12-27,2017-12-28,No,No,No,No,No,...,0.0,0.0,0.0,0.0,0.0,0.0,Yes,No,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238610,6448833,0.0,2019-03-01,2016-05-09,2019-04-02,No,No,No,No,No,...,0.0,0.0,0.0,0.0,0.0,0.0,Yes,Yes,REGION NORTE GRANDE ARGENTINO,
238611,6377583,0.0,2019-04-01,2015-06-03,2019-01-04,No,Yes,No,Yes,No,...,0.0,0.0,0.0,0.0,0.0,0.0,Yes,Yes,BUENOS AIRES,J55660104XX012
238612,6412619,0.0,2019-01-01,2015-07-08,2018-06-02,No,No,No,Yes,No,...,0.0,0.0,0.0,0.0,0.0,0.0,Yes,No,,
238613,5542402,0.0,2019-04-01,2012-09-13,2012-09-13,No,Yes,No,No,No,...,0.0,0.0,0.0,1.0,0.0,0.0,No,Yes,REGION NORTE GRANDE ARGENTINO,J55660104XX012


In [10]:
#Take only clients without Active Package. Be careful with the month! 
last_date_clients_with_no_package = dataframe_with_nine_month_clients[(dataframe_with_nine_month_clients.Month == '2019-01-01') & (dataframe_with_nine_month_clients.Package_Active == 'No')].client_id
dataframe_universe = dataframe_with_nine_month_clients[dataframe_with_nine_month_clients.client_id.isin(last_date_clients_with_no_package)]
universe = dataframe_with_nine_month_clients.merge(last_date_clients_with_no_package, on=['client_id'])[['client_id']].drop_duplicates()
dataframe_universe.Month.value_counts()

2018-10-01    26006
2018-09-01    26006
2019-01-01    26006
2018-12-01    26006
2019-02-01    26006
2019-04-01    26006
2018-11-01    26006
2018-08-01    26006
2019-03-01    26006
Name: Month, dtype: int64

In [11]:
TOTAL_MONTH_RANGE = 9

PREDICTION_WINDOW_RANGE = 2
LEAD_WINDOW_RANGE = 1
HISTORICAL_WINDOW_RANGE = TOTAL_MONTH_RANGE - PREDICTION_WINDOW_RANGE - LEAD_WINDOW_RANGE

#First i convert dates to type date so i can order them
dates = pd.to_datetime(dataframe['Month'], format='%Y-%m-%d').unique()[:TOTAL_MONTH_RANGE]
dates =  np.datetime_as_string(np.sort(dates), unit='D')
dates

array(['2018-08-01', '2018-09-01', '2018-10-01', '2018-11-01',
       '2018-12-01', '2019-01-01', '2019-02-01', '2019-03-01',
       '2019-04-01'], dtype='<U28')

In [12]:
prediction_window_data = dataframe[dataframe.Month.isin(dates[-PREDICTION_WINDOW_RANGE:])]
prediction_window_data

Unnamed: 0,client_id,Target,Month,First_product_dt,Last_product_dt,CreditCard_Premium,CreditCard_Active,CreditCard_CoBranding,Loan_Active,Mortgage_Active,...,CreditCard_Payment_External,CreditCard_Payment_Cash,CreditCard_Payment_Web,CreditCard_Payment_ATM,CreditCard_Payment_TAS,Investment_Numbers,Mobile,Email,Region,CreditCard_Product
9,5775560,0.0,2019-04-01,2013-08-22,2014-08-01,No,Yes,No,No,No,...,0.0,0.0,0.0,0.0,0.0,0.0,Yes,Yes,REGION CENTRO,J55660104XX012
13,5800470,0.0,2019-04-01,2013-08-23,2018-03-26,No,Yes,Yes,No,No,...,0.0,0.0,0.0,0.0,0.0,0.0,Yes,Yes,REGION CENTRO,J55660123XX012
17,3540244,0.0,2019-04-01,2018-09-07,2018-09-07,No,No,No,No,No,...,0.0,0.0,0.0,0.0,0.0,0.0,No,No,REGION CENTRO,
35,2739521,0.0,2019-03-01,2007-02-27,2007-03-25,No,Yes,No,No,No,...,0.0,0.0,0.0,0.0,1.0,0.0,Yes,Yes,AMBA Resto,J55660104XX012
39,704635,0.0,2019-03-01,2007-12-31,2018-05-03,No,Yes,Yes,No,No,...,0.0,0.0,0.0,0.0,0.0,0.0,Yes,Yes,REGION CENTRO,J55660124XX012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238609,1899373,0.0,2019-03-01,2015-06-23,2018-05-03,No,No,No,No,No,...,0.0,0.0,0.0,0.0,0.0,0.0,Yes,Yes,REGION CENTRO,
238610,6448833,0.0,2019-03-01,2016-05-09,2019-04-02,No,No,No,No,No,...,0.0,0.0,0.0,0.0,0.0,0.0,Yes,Yes,REGION NORTE GRANDE ARGENTINO,
238611,6377583,0.0,2019-04-01,2015-06-03,2019-01-04,No,Yes,No,Yes,No,...,0.0,0.0,0.0,0.0,0.0,0.0,Yes,Yes,BUENOS AIRES,J55660104XX012
238613,5542402,0.0,2019-04-01,2012-09-13,2012-09-13,No,Yes,No,No,No,...,0.0,0.0,0.0,1.0,0.0,0.0,No,Yes,REGION NORTE GRANDE ARGENTINO,J55660104XX012


In [13]:
lead_window_data = dataframe[dataframe.Month.isin([dates[-PREDICTION_WINDOW_RANGE-LEAD_WINDOW_RANGE]])]
lead_window_data

Unnamed: 0,client_id,Target,Month,First_product_dt,Last_product_dt,CreditCard_Premium,CreditCard_Active,CreditCard_CoBranding,Loan_Active,Mortgage_Active,...,CreditCard_Payment_External,CreditCard_Payment_Cash,CreditCard_Payment_Web,CreditCard_Payment_ATM,CreditCard_Payment_TAS,Investment_Numbers,Mobile,Email,Region,CreditCard_Product
8,4976363,0.0,2019-02-01,2013-12-05,2017-01-31,No,No,No,No,No,...,0.0,0.0,0.0,0.0,0.0,0.0,Yes,Yes,,
14,6224892,0.0,2019-02-01,2014-12-10,2014-12-10,Yes,Yes,No,No,No,...,1.0,0.0,0.0,0.0,0.0,0.0,Yes,Yes,,
15,6600949,1.0,2019-02-01,2016-01-25,2016-01-25,Yes,Yes,No,No,No,...,0.0,1.0,0.0,0.0,0.0,0.0,Yes,Yes,,
30,5668297,0.0,2019-02-01,2013-02-06,2013-02-06,No,Yes,No,No,No,...,0.0,0.0,0.0,0.0,1.0,0.0,Yes,Yes,,
44,3070198,0.0,2019-02-01,2007-04-04,2007-06-29,No,Yes,Yes,No,No,...,0.0,0.0,1.0,0.0,0.0,0.0,Yes,Yes,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238460,5054120,0.0,2019-02-01,2011-08-26,2019-01-04,No,No,No,No,No,...,0.0,0.0,0.0,0.0,0.0,0.0,Yes,Yes,,
238472,4028262,0.0,2019-02-01,2011-09-19,2018-01-30,Yes,Yes,No,No,No,...,0.0,0.0,0.0,0.0,0.0,1.0,Yes,Yes,,
238595,6468486,0.0,2019-02-01,2015-09-18,2018-03-16,No,Yes,Yes,Yes,No,...,0.0,0.0,0.0,0.0,1.0,0.0,Yes,No,,
238601,4980147,0.0,2019-02-01,2015-02-26,2015-02-26,No,Yes,Yes,No,No,...,1.0,0.0,0.0,0.0,0.0,0.0,Yes,No,,


In [14]:
historical_window_data =  dataframe[dataframe.Month.isin(dates[:HISTORICAL_WINDOW_RANGE])]
historical_window_data

Unnamed: 0,client_id,Target,Month,First_product_dt,Last_product_dt,CreditCard_Premium,CreditCard_Active,CreditCard_CoBranding,Loan_Active,Mortgage_Active,...,CreditCard_Payment_External,CreditCard_Payment_Cash,CreditCard_Payment_Web,CreditCard_Payment_ATM,CreditCard_Payment_TAS,Investment_Numbers,Mobile,Email,Region,CreditCard_Product
0,5856970,1.0,2018-10-01,2013-10-23,2019-01-10,No,Yes,No,No,No,...,0.0,0.0,0.0,0.0,0.0,1.0,Yes,Yes,,
1,6371753,0.0,2018-09-01,2015-07-29,2018-06-02,No,No,No,No,No,...,0.0,0.0,0.0,0.0,0.0,0.0,Yes,No,,
2,5928737,0.0,2019-01-01,2016-08-31,2018-12-27,No,No,No,No,No,...,0.0,0.0,0.0,0.0,0.0,0.0,Yes,Yes,,
3,475064,0.0,2018-12-01,2014-07-13,2017-11-30,No,Yes,No,No,No,...,0.0,0.0,0.0,0.0,0.0,0.0,Yes,Yes,,
4,3615172,0.0,2018-09-01,2017-12-27,2017-12-28,No,No,No,No,No,...,0.0,0.0,0.0,0.0,0.0,0.0,Yes,No,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238597,1673642,0.0,2018-11-01,2017-08-18,2017-09-26,No,Yes,No,No,No,...,0.0,0.0,1.0,0.0,0.0,0.0,No,Yes,,
238603,6145735,1.0,2018-11-01,2014-10-26,2014-10-26,No,Yes,No,No,No,...,0.0,0.0,1.0,0.0,0.0,0.0,Yes,Yes,,
238604,5638786,1.0,2018-11-01,2012-12-26,2017-03-08,No,Yes,No,No,No,...,0.0,0.0,0.0,0.0,1.0,0.0,Yes,No,,
238608,3824781,0.0,2018-08-01,2014-11-27,2019-01-04,No,No,No,No,No,...,0.0,0.0,0.0,0.0,0.0,0.0,Yes,Yes,,


In [15]:
#Is the target Balanced or Unbalanced?
target_users = prediction_window_data[prediction_window_data.Target == 1][['client_id']].drop_duplicates()
target_users['TGT'] = 1
target_data = universe.merge(target_users, how='left', on='client_id').fillna(0).drop_duplicates()

target_data.TGT.value_counts()

0.0    19177
1.0     6829
Name: TGT, dtype: int64

In [16]:
balance_percentage = target_data[target_data.TGT == 1][['TGT']].count() / target_data.TGT.count()
balance_percentage

TGT    0.262593
dtype: float64

In [17]:
# 74% non targets rowns vs 26% targets rowns we could say its not perfectly balanced (50/50)
# How can you balance the data frame 50/50?
# Some simple solutions are over or under sampling the dataset. That is, to duplicate or eliminate some values in other to force this balance.
# The problem with this strategies is that we could be deleting critical information or overloading our dataset with redundant data that will end up overfitting our model (bias towards a result)
# Another approach would be using Feature Selection where we use statistical metrics (like chi square) to determine which rows are more "unique" than others based on correlation

# CLASS 3

In [18]:
data_universe = dataframe_universe.sort_values(by=['client_id', 'Month']).bfill()
data_universe = data_universe[data_universe.Month.isin(dates[:HISTORICAL_WINDOW_RANGE])].drop_duplicates()
data_universe['CreditCard_Product'] = data_universe['CreditCard_Product'].fillna(data_universe['CreditCard_Product'].mode().iloc[0])
data_universe

Unnamed: 0,client_id,Target,Month,First_product_dt,Last_product_dt,CreditCard_Premium,CreditCard_Active,CreditCard_CoBranding,Loan_Active,Mortgage_Active,...,CreditCard_Payment_External,CreditCard_Payment_Cash,CreditCard_Payment_Web,CreditCard_Payment_ATM,CreditCard_Payment_TAS,Investment_Numbers,Mobile,Email,Region,CreditCard_Product
201502,1000199,0.0,2018-08-01,2017-02-21,2017-05-18,No,No,No,Yes,No,...,0.0,0.0,1.0,0.0,0.0,0.0,Yes,No,BUENOS AIRES,J55660104XX012
53012,1000199,0.0,2018-09-01,2017-02-21,2017-05-18,No,No,No,Yes,No,...,0.0,0.0,1.0,1.0,0.0,0.0,Yes,No,BUENOS AIRES,J55660104XX012
116546,1000199,0.0,2018-10-01,2017-02-21,2017-05-18,No,No,No,Yes,No,...,0.0,0.0,3.0,1.0,0.0,0.0,Yes,No,BUENOS AIRES,J55660104XX012
102248,1000199,0.0,2018-11-01,2017-02-21,2017-05-18,No,No,No,Yes,No,...,0.0,0.0,1.0,0.0,0.0,0.0,Yes,No,BUENOS AIRES,J55660104XX012
217608,1000199,0.0,2018-12-01,2017-02-21,2017-05-18,No,No,No,Yes,No,...,0.0,0.0,1.0,0.0,0.0,0.0,Yes,No,BUENOS AIRES,J55660104XX012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121475,999462,0.0,2018-09-01,2016-03-08,2017-05-18,No,No,No,No,No,...,0.0,0.0,0.0,0.0,0.0,0.0,Yes,Yes,REGION CENTRO,J55660104XX012
167692,999462,0.0,2018-10-01,2016-03-08,2017-05-18,No,No,No,No,No,...,0.0,0.0,0.0,0.0,0.0,0.0,Yes,Yes,REGION CENTRO,J55660104XX012
9312,999462,0.0,2018-11-01,2016-03-08,2017-05-18,No,No,No,No,No,...,0.0,0.0,0.0,0.0,0.0,0.0,Yes,Yes,REGION CENTRO,J55660104XX012
70985,999462,0.0,2018-12-01,2016-03-08,2017-05-18,No,No,No,No,No,...,0.0,0.0,0.0,0.0,0.0,0.0,Yes,Yes,REGION CENTRO,J55660104XX012


In [36]:
#Fill nulls
identity_dataframe = data_universe.drop(columns=['Client_Age_grp', 'SavingAccount_Balance_Average', 'SavingAccount_Transactions_Transactions', 'SavingAccount_Total_Amount'])
transform_dataframe = data_universe[['Client_Age_grp', 'SavingAccount_Balance_Average', 'SavingAccount_Transactions_Transactions', 'SavingAccount_Total_Amount']]

identity_dataframe

Unnamed: 0,client_id,Target,Month,First_product_dt,Last_product_dt,CreditCard_Premium,CreditCard_Active,CreditCard_CoBranding,Loan_Active,Mortgage_Active,SavingAccount_Active_ARG_Salary,SavingAccount_Active_ARG,SavingAccount_Active_DOLLAR,DebitCard_Active,Investment_Active,Package_Active,Insurance_Life,Insurance_Home,Insurance_Accidents,Insurance_Mobile,Insurance_ATM,Insurance_Unemployment,Sex,SavingAccount_Balance_FirstDate,SavingAccount_Balance_LastDate,SavingAccount_Days_with_use,SavingAccount_Days_with_Credits,SavingAccount_Days_with_Debits,SavingAccount_Salary_Payment_Transactions,SavingAccount_Transfer_In_Transactions,SavingAccount_ATM_Extraction_Transactions,SavingAccount_Service_Payment_Transactions,SavingAccount_CreditCard_Payment_Transactions,SavingAccount_Transfer_Out_Transactions,SavingAccount_DebitCard_Spend_Transactions,SavingAccount_Credits_Transactions,SavingAccount_Debits_Transactions,SavingAccount_Salary_Payment_Amount,SavingAccount_Transfer_In_Amount,SavingAccount_ATM_Extraction_Amount,SavingAccount_Service_Payment_Amount,SavingAccount_CreditCard_Payment_Amount,SavingAccount_Transfer_Out_Amount,SavingAccount_DebitCard_Spend_Amount,SavingAccount_Credits_Amounts,SavingAccount_Debits_Amounts,Operations_Bank,Operations_Terminal,Operations_HomeBanking,Operations_Mobile,Operations_Ivr,Operations_Telemarketer,Operations_ATM,CreditCard_Balance_ARG,CreditCard_Balance_DOLLAR,CreditCard_Total_Limit,CreditCard_Total_Spending,CreditCard_Spending_1_Installment,CreditCard_Spending_Installments,CreditCard_Spending_CrossBoarder,CreditCard_Spending_Aut_Debits,CreditCard_Revolving,CreditCard_Payment_Aut_Debit,CreditCard_Payment_External,CreditCard_Payment_Cash,CreditCard_Payment_Web,CreditCard_Payment_ATM,CreditCard_Payment_TAS,Investment_Numbers,Mobile,Email,Region,CreditCard_Product
201502,1000199,0.0,2018-08-01,2017-02-21,2017-05-18,No,No,No,Yes,No,No,Yes,Yes,Yes,No,No,No,No,No,No,No,No,M,0.00,0.00,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,14.95,0.0,0.0,3100.00,3100.00,0.0,0.0,2.0,5.0,0.0,0.0,0.0,33777.01,0.0,32000.0,14561.03,10434.21,872.78,0.0,3254.04,12817.74,1.0,0.0,0.0,1.0,0.0,0.0,0.0,Yes,No,BUENOS AIRES,J55660104XX012
53012,1000199,0.0,2018-09-01,2017-02-21,2017-05-18,No,No,No,Yes,No,No,Yes,Yes,Yes,No,No,No,No,No,No,No,No,M,0.00,0.00,2.0,2.0,2.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,2.0,3.0,0.0,3275.0,0.0,0.0,47.82,0.0,0.0,3275.00,3275.00,0.0,0.0,3.0,8.0,0.0,0.0,0.0,34034.73,0.0,32000.0,14530.75,9942.77,224.67,0.0,4363.31,13949.19,1.0,0.0,0.0,1.0,1.0,0.0,0.0,Yes,No,BUENOS AIRES,J55660104XX012
116546,1000199,0.0,2018-10-01,2017-02-21,2017-05-18,No,No,No,Yes,No,No,Yes,Yes,Yes,No,No,No,No,No,No,No,No,M,0.00,26.41,2.0,2.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,3099.0,0.0,0.0,0.00,0.0,0.0,3099.00,3072.59,0.0,0.0,6.0,6.0,0.0,0.0,0.0,33212.94,0.0,32000.0,12842.35,7737.75,224.67,0.0,4879.93,14569.12,0.0,0.0,0.0,3.0,1.0,0.0,0.0,Yes,No,BUENOS AIRES,J55660104XX012
102248,1000199,0.0,2018-11-01,2017-02-21,2017-05-18,No,No,No,Yes,No,No,Yes,Yes,Yes,No,No,No,No,No,No,No,No,M,26.41,0.00,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,3340.0,0.0,0.0,0.00,0.0,0.0,3340.00,3366.41,0.0,0.0,0.0,2.0,0.0,0.0,0.0,31598.25,0.0,32000.0,9653.21,7557.47,266.74,0.0,1829.00,15365.77,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Yes,No,BUENOS AIRES,J55660104XX012
217608,1000199,0.0,2018-12-01,2017-02-21,2017-05-18,No,No,No,Yes,No,No,Yes,Yes,Yes,No,No,Yes,Yes,Yes,Yes,No,Yes,M,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.00,0.0,0.0,1.0,0.0,0.0,0.0,0.0,42959.58,0.0,32000.0,8080.78,0.00,266.74,0.0,7814.04,27598.25,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Yes,No,BUENOS AIRES,J55660104XX012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121475,999462,0.0,2018-09-01,2016-03-08,2017-05-18,No,No,No,No,No,No,Yes,Yes,Yes,No,No,No,No,No,No,No,No,M,10947.35,10857.39,4.0,2.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,5.0,0.0,9000.0,0.0,0.0,0.00,0.0,0.0,9000.84,9090.80,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Yes,Yes,REGION CENTRO,J55660104XX012
167692,999462,0.0,2018-10-01,2016-03-08,2017-05-18,No,No,No,No,No,No,Yes,Yes,Yes,No,No,No,No,No,No,No,No,M,10857.39,10701.36,5.0,2.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,5.0,0.0,9000.0,0.0,0.0,0.00,0.0,0.0,9000.78,9156.81,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Yes,Yes,REGION CENTRO,J55660104XX012
9312,999462,0.0,2018-11-01,2016-03-08,2017-05-18,No,No,No,No,No,No,Yes,Yes,Yes,No,No,No,No,No,No,No,No,M,10701.36,10368.31,5.0,2.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,5.0,0.0,9000.0,0.0,0.0,0.00,0.0,0.0,9000.93,9333.98,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Yes,Yes,REGION CENTRO,J55660104XX012
70985,999462,0.0,2018-12-01,2016-03-08,2017-05-18,No,No,No,No,No,No,Yes,Yes,Yes,No,No,Yes,No,Yes,No,Yes,No,M,10368.31,9999.99,4.0,2.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,5.0,0.0,9000.0,0.0,0.0,0.00,0.0,0.0,9001.19,9369.51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.00,0.00,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Yes,Yes,REGION CENTRO,J55660104XX012


In [33]:
#10 features
transform_dataframe['Days_Between_Products'] = identity_dataframe.apply(lambda row: pd.to_datetime(row.Last_product_dt, format='%Y-%m-%d') - pd.to_datetime(row.First_product_dt, format='%Y-%m-%d'), axis=1).copy()
transform_dataframe['Over_Credit_Limit'] = identity_dataframe.apply(lambda row: row.CreditCard_Total_Spending > row.CreditCard_Total_Limit, axis=1).copy()
transform_dataframe['SavingAccount_Balance_dt'] = identity_dataframe.apply(lambda row: pd.SavingAccount_Balance_LastDate - row.SavingAccount_Balance_FirstDate, axis=1).copy()
transform_dataframe['SavingAccount_Credits_Amount_Percentage'] = identity_dataframe.apply(lambda row: row.SavingAccount_Credits_Amounts / (row.SavingAccount_Credits_Amounts + row.SavingAccount_Debits_Amounts), axis=1).copy()
transform_dataframe['SavingAccount_Debits_Amount_Percentage'] = identity_dataframe.apply(lambda row: row.SavingAccount_Debits_Amounts / (row.SavingAccount_Credits_Amounts + row.SavingAccount_Debits_Amounts), axis=1).copy()
transform_dataframe['SavingAccount_Active_Both'] = identity_dataframe.apply(lambda row: row.SavingAccount_Active_ARG == 'Yes' &  row.SavingAccount_Active_DOLLAR == 'Yes' , axis=1).copy()
transform_dataframe['CreditCard_Balance_ARG_Percentage'] = identity_dataframe.apply(lambda row: row.CreditCard_Balance_ARG / (row.CreditCard_Balance_ARG + row.CreditCard_Balance_DOLLAR), axis=1).copy()
transform_dataframe['CreditCard_Balance_DOLLAR_Percentage'] = identity_dataframe.apply(lambda row: row.CreditCard_Balance_DOLLAR / (row.CreditCard_Balance_ARG + row.CreditCard_Balance_DOLLAR), axis=1).copy()
transform_dataframe['SavingAccount_Days_with_Total'] = identity_dataframe.apply(lambda row: row.SavingAccount_Days_with_Debits + row.SavingAccount_Days_with_Credits, axis=1).copy()
transform_dataframe['Email_And_Mobile'] = identity_dataframe.apply(lambda row: row.Mobile == 'Yes' &  row.Email == 'Yes' , axis=1).copy()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transform_dataframe['Days_Between_Products'] = identity_dataframe.apply(lambda row: pd.to_datetime(row.Last_product_dt, format='%Y-%m-%d') - pd.to_datetime(row.First_product_dt, format='%Y-%m-%d'), axis=1).copy()
