In [13]:
import numpy as np 
import pandas as pd 
import matplotlib.pylab as plt
import seaborn as sns
import warnings
warnings.simplefilter("ignore")

In [15]:
# Transaction CSVs (train / test)
train_transaction, test_transaction = (
    pd.read_csv('train_transaction_new.csv'),
    pd.read_csv('test_transaction_new.csv'),
)
# Identity CSVs (train / test) - these will be merged onto the transactions
# to create additional features
train_identity, test_identity = (
    pd.read_csv('train_identity.csv'),
    pd.read_csv('test_identity.csv'),
)

In [16]:
# Build the four shape lines first, then emit them in a single print call.
shape_report = [
    'train_transaction shape is {}'.format(train_transaction.shape),
    'test_transaction shape is {}'.format(test_transaction.shape),
    'train_identity shape is {}'.format(train_identity.shape),
    'test_identity shape is {}'.format(test_identity.shape),
]
print('\n'.join(shape_report))

train_transaction shape is (140540, 394)
test_transaction shape is (56691, 393)
train_identity shape is (144233, 41)
test_identity shape is (141907, 41)


In [17]:
# Number of missing values per column; print the first ten columns as a sample.
missing_values_count = train_transaction.isnull().sum()
# .iloc makes the positional slice explicit (the Series is indexed by column name).
print (missing_values_count.iloc[0:10])
# np.prod replaces np.product, an alias that was deprecated in NumPy 1.25 and
# removed in NumPy 2.0 (calling it there raises AttributeError).
total_cells = np.prod(train_transaction.shape)
total_missing = missing_values_count.sum()
# Share of all cells (rows x columns) that are missing, as a percentage.
print ("% of missing data = ",(total_missing/total_cells) * 100)

TransactionID        0
isFraud              0
TransactionDT        0
TransactionAmt       0
ProductCD            0
card1                0
card2             2120
card3              742
card4              744
card5             1182
dtype: int64
% of missing data =  41.02831247710969


In [19]:
#train_identity.isna().sum()

In [20]:
## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are touched; every other column (object, category, int8, ...) is left as is.

    Integer columns are cast to the first of int8, int16, int32, int64 whose
    range contains both the column minimum and maximum (bounds inclusive).
    Float columns are cast to the first of float16, float32 whose range contains
    the column, falling back to float64 (this is also where all-NaN or infinite
    columns end up, because their comparisons fail).

    NOTE: the float check looks at the value *range* only, not at precision.
    float16 keeps roughly three significant decimal digits, so values such as
    monetary amounts can be rounded. Pass ``use_float16=False`` to make float32
    the narrowest float type considered.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified IN PLACE (columns are reassigned).
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.
    use_float16 : bool, default True
        Whether float16 is an allowed target dtype for float columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column whose max is exactly 127 fits int8.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No candidate matches only when min/max are NaN (empty column);
            # the column is then left untouched.
        else:
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes does not yield NaN.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [21]:
## Downcast both training frames (transactions first, then identity)
train_transaction, train_identity = map(
    reduce_mem_usage, (train_transaction, train_identity)
)

Mem. usage decreased to 128.80 Mb (69.5% reduction)
Mem. usage decreased to 25.86 Mb (42.7% reduction)


In [23]:
## Downcast both test frames (transactions first, then identity)
test_transaction, test_identity = map(
    reduce_mem_usage, (test_transaction, test_identity)
)

Mem. usage decreased to 52.23 Mb (69.3% reduction)
Mem. usage decreased to 25.44 Mb (42.7% reduction)


In [24]:
#Summary Function
def fun_summary(df):
    """Print the shape of ``df`` and return a per-column summary table.

    The summary is built directly from the column labels rather than through
    ``reset_index()``, so it also works when ``df.columns`` carries a name
    (e.g. after ``pivot``/``crosstab``), where looking up a column literally
    called ``'index'`` would raise ``KeyError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df``, in column order, with the columns:
        ``Name`` (column label), ``dtypes`` (column dtype), ``Missing``
        (count of null values) and ``Uniques`` (count of distinct non-null
        values).
    """
    print(f"Dataset Shape: {df.shape}")
    summary = pd.DataFrame({
        'Name': df.columns.to_numpy(),
        'dtypes': df.dtypes.to_numpy(),
        'Missing': df.isnull().sum().to_numpy(),
        'Uniques': df.nunique().to_numpy(),
    })

    return summary

In [25]:
# Per-column overview (dtype, missing count, unique count) of train_transaction;
# left as the cell's last expression so the notebook displays the returned table.
fun_summary(train_transaction)

Dataset Shape: (140540, 394)


Unnamed: 0,Name,dtypes,Missing,Uniques
0,TransactionID,int32,0,140540
1,isFraud,int8,0,2
2,TransactionDT,int32,0,137325
3,TransactionAmt,float16,0,4620
4,ProductCD,object,0,5
...,...,...,...,...
389,V335,float16,126355,329
390,V336,float16,126355,170
391,V337,float32,126355,168
392,V338,float32,126355,245


In [26]:
## Knowing the Card Features
card_columns = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6']
fun_summary(train_transaction[card_columns])

Dataset Shape: (140540, 6)


Unnamed: 0,Name,dtypes,Missing,Uniques
0,card1,int16,0,7583
1,card2,float16,2120,495
2,card3,float16,742,78
3,card4,object,744,4
4,card5,float16,1182,74
5,card6,object,743,2


In [None]:
# Categorical and object features
# (the variable keeps its original spelling because later cells refer to it)
non_numeric_dtypes = ['category', 'object']
catagorical_features = train_transaction.select_dtypes(include=non_numeric_dtypes).columns

In [None]:
# Display the column labels selected as category/object dtype.
catagorical_features