In [69]:
import pandas as pd
import numpy as np
import datetime

In [70]:
# Load the transactions; the identifier-like columns are passed through str
# so they are held as text.
text_columns = ('Merchnum', 'Cardnum', 'Merch_zip')
df = pd.read_excel(
    'no_missing.xlsx',
    converters={column: str for column in text_columns},
)

In [71]:
# Remove the 'Unnamed: 0', 'assign' and 'minus' columns from the loaded frame.
unused_columns = ['Unnamed: 0', 'assign', 'minus']
df = df.drop(labels=unused_columns, axis='columns')

In [72]:
# Preview the first rows of df after the column drop.
df.head()

Unnamed: 0,Merch_description,Merchnum,Recnum,Cardnum,Date,Merch_state,Merch_zip,Amount,Fraud,Merch_description_part
0,FEDEX SHP 12/23/09 AB#,5509006296254,1,5142190439,2010-01-01,TN,38118,3.62,0,FEDEXSHPAB
1,SERVICE MERCHANDISE #81,61003026333,2,5142183973,2010-01-01,MA,1803,31.42,0,SERVICEMERCHANDISE
2,OFFICE DEPOT #191,4503082993600,3,5142131721,2010-01-01,MD,20706,178.49,0,OFFICEDEPOT
3,FEDEX SHP 12/28/09 AB#,5509006296254,4,5142148452,2010-01-01,TN,38118,3.62,0,FEDEXSHPAB
4,FEDEX SHP 12/23/09 AB#,5509006296254,5,5142190439,2010-01-01,TN,38118,3.62,0,FEDEXSHPAB


In [73]:
# Print each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96397 entries, 0 to 96396
Data columns (total 10 columns):
Merch_description         96397 non-null object
Merchnum                  96397 non-null object
Recnum                    96397 non-null int64
Cardnum                   96397 non-null object
Date                      96397 non-null datetime64[ns]
Merch_state               96397 non-null object
Merch_zip                 96397 non-null object
Amount                    96397 non-null float64
Fraud                     96397 non-null int64
Merch_description_part    96397 non-null object
dtypes: datetime64[ns](1), float64(1), int64(2), object(6)
memory usage: 7.4+ MB


## Entities

In [74]:
# Composite entity keys: the listed columns are string-concatenated in order
# (no separator), and the new column is named after its parts joined by '-'.
entity_parts = [
    ('Cardnum', 'Merchnum'),
    ('Cardnum', 'Merch_description_part'),
    ('Cardnum', 'Merch_state'),
    ('Cardnum', 'Merch_zip'),
    ('Cardnum', 'Merch_state', 'Merch_zip'),
]
for parts in entity_parts:
    combined = df[parts[0]]
    for part in parts[1:]:
        combined = combined + df[part]
    df['-'.join(parts)] = combined

In [75]:
# Columns treated as entities when building the days-since and frequency
# features: three raw identifiers plus the five composite Cardnum keys.
#
# The list is spelled out instead of being derived from df.columns by removing
# the non-entity names. Later cells add helper columns (join_ts1, join_ts2_*)
# to df, so a derived list would silently pick those up as entities if this
# cell were re-run after them.
entities = [
    'Merchnum',
    'Cardnum',
    'Merch_description_part',
    'Cardnum-Merchnum',
    'Cardnum-Merch_description_part',
    'Cardnum-Merch_state',
    'Cardnum-Merch_zip',
    'Cardnum-Merch_state-Merch_zip',
]

In [76]:
# Display the list of entity column names.
entities

['Merchnum',
 'Cardnum',
 'Merch_description_part',
 'Cardnum-Merchnum',
 'Cardnum-Merch_description_part',
 'Cardnum-Merch_state',
 'Cardnum-Merch_zip',
 'Cardnum-Merch_state-Merch_zip']

In [77]:
# Preview df again, now including the composite entity key columns.
df.head()

Unnamed: 0,Merch_description,Merchnum,Recnum,Cardnum,Date,Merch_state,Merch_zip,Amount,Fraud,Merch_description_part,Cardnum-Merchnum,Cardnum-Merch_description_part,Cardnum-Merch_state,Cardnum-Merch_zip,Cardnum-Merch_state-Merch_zip
0,FEDEX SHP 12/23/09 AB#,5509006296254,1,5142190439,2010-01-01,TN,38118,3.62,0,FEDEXSHPAB,51421904395509006296254,5142190439FEDEXSHPAB,5142190439TN,514219043938118,5142190439TN38118
1,SERVICE MERCHANDISE #81,61003026333,2,5142183973,2010-01-01,MA,1803,31.42,0,SERVICEMERCHANDISE,514218397361003026333,5142183973SERVICEMERCHANDISE,5142183973MA,514218397301803,5142183973MA01803
2,OFFICE DEPOT #191,4503082993600,3,5142131721,2010-01-01,MD,20706,178.49,0,OFFICEDEPOT,51421317214503082993600,5142131721OFFICEDEPOT,5142131721MD,514213172120706,5142131721MD20706
3,FEDEX SHP 12/28/09 AB#,5509006296254,4,5142148452,2010-01-01,TN,38118,3.62,0,FEDEXSHPAB,51421484525509006296254,5142148452FEDEXSHPAB,5142148452TN,514214845238118,5142148452TN38118
4,FEDEX SHP 12/23/09 AB#,5509006296254,5,5142190439,2010-01-01,TN,38118,3.62,0,FEDEXSHPAB,51421904395509006296254,5142190439FEDEXSHPAB,5142190439TN,514219043938118,5142190439TN38118


## Days-since Variables

In [124]:
# Column-less frame indexed by Recnum; the per-entity days-since columns are
# attached to it in the next cell.
daysince = df.loc[:, ['Recnum']].set_index('Recnum').copy()

In [125]:
# Days-since features: for every record and every entity, the number of whole
# days between the record's Date and the latest Date among earlier records
# (lower Recnum) that share the same entity value. Records with no earlier
# match are left as NaN here.
#
# Walking the records in Recnum order, "latest earlier date" is the running
# per-entity maximum of Date shifted down by one row. This gives the same value
# as taking the minimum Date difference over all earlier same-entity pairs,
# without materialising every pair in a self-merge.
# NOTE(review): assumes Recnum is unique per row -- TODO confirm.
ordered = df.sort_values('Recnum', kind='mergesort')
for entity in entities:
    new_col = entity + '_' + 'daysince'
    latest_so_far = ordered.groupby(entity)['Date'].cummax()
    previous_latest = latest_so_far.groupby(ordered[entity]).shift()
    # .dt.days yields float days with NaN where there is no earlier record;
    # astype('timedelta64[D]') is rejected by pandas >= 2.0.
    gap_days = (ordered['Date'] - previous_latest).dt.days
    gap_days.index = ordered['Recnum'].astype(int).values
    # Column assignment aligns on the Recnum index and overwrites on re-run,
    # so executing this cell twice does not create suffixed duplicate columns.
    daysince[new_col] = gap_days

In [126]:
# Inspect non-null counts; records with no earlier match for an entity are NaN.
daysince.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96397 entries, 1 to 96753
Data columns (total 8 columns):
Merchnum_daysince                          82090 non-null float64
Cardnum_daysince                           94754 non-null float64
Merch_description_part_daysince            85493 non-null float64
Cardnum-Merchnum_daysince                  56639 non-null float64
Cardnum-Merch_description_part_daysince    60021 non-null float64
Cardnum-Merch_state_daysince               80146 non-null float64
Cardnum-Merch_zip_daysince                 62740 non-null float64
Cardnum-Merch_state-Merch_zip_daysince     62644 non-null float64
dtypes: float64(8)
memory usage: 9.1 MB


In [127]:
# Records with no earlier occurrence of an entity have no gap to measure;
# they receive a fixed placeholder value instead of NaN.
NO_HISTORY_DAYS = 365
daysince = daysince.fillna(NO_HISTORY_DAYS)

In [128]:
# Re-check non-null counts after the fill.
daysince.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96397 entries, 1 to 96753
Data columns (total 8 columns):
Merchnum_daysince                          96397 non-null float64
Cardnum_daysince                           96397 non-null float64
Merch_description_part_daysince            96397 non-null float64
Cardnum-Merchnum_daysince                  96397 non-null float64
Cardnum-Merch_description_part_daysince    96397 non-null float64
Cardnum-Merch_state_daysince               96397 non-null float64
Cardnum-Merch_zip_daysince                 96397 non-null float64
Cardnum-Merch_state-Merch_zip_daysince     96397 non-null float64
dtypes: float64(8)
memory usage: 9.1 MB


In [138]:
# Write the days-since features to CSV; the Recnum index is written as the
# first column.
daysince.to_csv('Days-since Variables.csv', index=True)

## Frequency Variables

In [130]:
# Column-less frame indexed by Recnum; the per-entity, per-window count
# columns are attached to it further down.
frequency = df.loc[:, ['Recnum']].set_index('Recnum').copy()

In [131]:
# Window start for the frequency join: a copy of each record's own Date.
df['join_ts1'] = df['Date']

In [132]:
# One window-end column per window length: Date moved forward by that many
# days, stored as join_ts2_<days>.
for window_days in (0, 1, 3, 7, 14, 30):
    df['join_ts2_' + str(window_days)] = df['Date'] + pd.Timedelta(days=window_days)

In [134]:
# Frequency features: for every record x, entity and window length t (days),
# count the records y with the same entity value such that
#     Date_y <= Date_x <= Date_y + t   and   Recnum_y <= Recnum_x.
# Each record pairs with itself, so every count is at least 1.
#
# Only Recnum and Date are carried through the self-merge. The window test is
# done on the date gap, which is equivalent to comparing against the
# join_ts1 / join_ts2_<t> columns and keeps the pair table much narrower.
for entity in entities:
    keyed = df[['Recnum', 'Date', entity]]
    pairs = pd.merge(keyed, keyed, on=entity, suffixes=('_x', '_y'))
    gap = pairs['Date_x'] - pairs['Date_y']
    # Conditions that do not depend on the window length are applied once.
    eligible = (gap >= pd.Timedelta(0)) & (pairs['Recnum_x'] >= pairs['Recnum_y'])
    later_recnum = pairs.loc[eligible, 'Recnum_x'].astype(int)
    gap = gap[eligible]
    for t in [0, 1, 3, 7, 14, 30]:
        within_window = gap <= pd.Timedelta(days=t)
        counts = later_recnum[within_window].value_counts(sort=False)
        # Column assignment aligns on the Recnum index and overwrites on
        # re-run, so executing this cell twice does not create suffixed
        # duplicate columns.
        frequency[str(entity) + '_' + 'count' + str(t) + '_' + 'Date'] = counts

In [135]:
# Inspect the dtypes and non-null counts of the frequency columns.
frequency.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96397 entries, 1 to 96753
Data columns (total 48 columns):
Merchnum_count0_Date                           96397 non-null int64
Merchnum_count1_Date                           96397 non-null int64
Merchnum_count3_Date                           96397 non-null int64
Merchnum_count7_Date                           96397 non-null int64
Merchnum_count14_Date                          96397 non-null int64
Merchnum_count30_Date                          96397 non-null int64
Cardnum_count0_Date                            96397 non-null int64
Cardnum_count1_Date                            96397 non-null int64
Cardnum_count3_Date                            96397 non-null int64
Cardnum_count7_Date                            96397 non-null int64
Cardnum_count14_Date                           96397 non-null int64
Cardnum_count30_Date                           96397 non-null int64
Merch_description_part_count0_Date             96397 non-null int64
Merch_d

In [136]:
# Preview the first rows of the frequency features.
frequency.head()

Unnamed: 0_level_0,Merchnum_count0_Date,Merchnum_count1_Date,Merchnum_count3_Date,Merchnum_count7_Date,Merchnum_count14_Date,Merchnum_count30_Date,Cardnum_count0_Date,Cardnum_count1_Date,Cardnum_count3_Date,Cardnum_count7_Date,...,Cardnum-Merch_zip_count3_Date,Cardnum-Merch_zip_count7_Date,Cardnum-Merch_zip_count14_Date,Cardnum-Merch_zip_count30_Date,Cardnum-Merch_state-Merch_zip_count0_Date,Cardnum-Merch_state-Merch_zip_count1_Date,Cardnum-Merch_state-Merch_zip_count3_Date,Cardnum-Merch_state-Merch_zip_count7_Date,Cardnum-Merch_state-Merch_zip_count14_Date,Cardnum-Merch_state-Merch_zip_count30_Date
Recnum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,2,2,2,2,2,2,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
5,3,3,3,3,3,3,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2


In [137]:
# Write the frequency features to CSV; the Recnum index is written as the
# first column.
frequency.to_csv('Frequency Variables.csv', index=True)