In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Let wide frames render up to 1000 columns while capping output at 100 rows.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 100)

In [2]:
# Read the two raw tables and join them on TransactionID. The join is an
# inner join (the pandas default), so only transactions that also have a row
# in the identity table are kept.
train_transactions = pd.read_csv('../data/train_transaction.csv')
train_identity = pd.read_csv('../data/train_identity.csv')
train_data = train_transactions.merge(
    train_identity,
    how='inner',
    on=['TransactionID'],
)

### Before Data Cleaning

In [None]:
# Preview the first five rows of the merged frame.
train_data.head(n=5)

### Data Cleaning

In [None]:
def get_all_categorical_features():
    """Return the names of the columns treated as categorical.

    The list is built fresh on every call and is ordered as: seven fixed
    column names, then ``card1``..``card6``, ``M1``..``M9`` and
    ``id_12``..``id_38``.

    Returns
    -------
    list of str
        The categorical column names, in the order described above.
    """
    fixed_columns = [
        'ProductCD',
        'addr1',
        'addr2',
        'P_emaildomain',
        'R_emaildomain',
        'DeviceType',
        'DeviceInfo',
    ]
    card_columns = [f'card{n}' for n in range(1, 7)]
    m_columns = [f'M{n}' for n in range(1, 10)]
    id_columns = [f'id_{n}' for n in range(12, 39)]
    return fixed_columns + card_columns + m_columns + id_columns

def correcting_data_types(data, features):
    """Cast every column of ``data`` to either string or numeric dtype.

    Columns whose name appears in ``features`` are converted to the pandas
    ``"string"`` dtype; every other column is passed through
    ``pd.to_numeric``. The columns are assigned back onto ``data`` itself.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are converted.
    features : collection of str
        Names of the columns to cast to ``"string"``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    for column_name in data.columns:
        series = data[column_name]
        if column_name not in features:
            # Anything not listed as categorical is expected to be numeric.
            data[column_name] = pd.to_numeric(series)
            continue
        data[column_name] = series.astype("string")
    return data

def fill_missing_values(data):
    """Impute missing values in every column of ``data``.

    Columns whose dtype name is ``'string'`` are filled with the placeholder
    ``'not available'``. Every other column is filled with its own mean, or
    with ``0`` when the mean is itself missing (for example when the column
    has no observed values at all).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. Non-string columns must support ``.mean()``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the filled columns assigned back.
    """
    for column in data:
        values = data[column]
        if values.dtype.name == 'string':
            fill_value = 'not available'
        else:
            mean = values.mean()
            # pd.isna recognises both float NaN and pd.NA, so this also works
            # for nullable dtypes where str(mean) would not be 'nan'.
            fill_value = 0 if pd.isna(mean) else mean
        # Assign the result back rather than calling
        # data[column].fillna(..., inplace=True): that chained in-place call
        # operates on a temporary Series and leaves ``data`` untouched when
        # pandas Copy-on-Write is active.
        data[column] = values.fillna(fill_value)
    return data

def cleaning_data(data, features):
    """Return a cleaned version of ``data`` without modifying ``data`` itself.

    Steps, in order:

    1. drop the columns named in ``features``;
    2. cast the remaining columns to string/numeric dtypes, using
       ``get_all_categorical_features()`` to decide which are strings;
    3. impute missing values.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw input frame. It is left untouched.
    features : list of str
        Names of the columns to DROP from the result. Despite the parameter
        name, this is not the categorical-feature list.

    Returns
    -------
    pandas.DataFrame
        A new frame with the listed columns removed, dtypes corrected and
        missing values filled.

    Raises
    ------
    KeyError
        If any name in ``features`` is not a column of ``data``.
    """
    # Drop first: ``drop`` returns a new frame, so the helpers below (which
    # assign columns onto the frame they receive) never write into the
    # caller's ``data``. It also avoids converting and imputing columns that
    # are about to be discarded.
    cleaned = data.drop(features, axis=1)
    cleaned = correcting_data_types(
        data=cleaned,
        features=get_all_categorical_features(),
    )
    cleaned = fill_missing_values(data=cleaned)
    return cleaned

def label_encode_categorical_features(data, features):
    """Replace each listed column of ``data`` with its label-encoded values.

    A single ``LabelEncoder`` is refit on every column in turn, so the fitted
    mapping of earlier columns is not retained. The encoded values are
    assigned back onto ``data`` itself.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns to encode.
    features : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    label_encoder = LabelEncoder()
    for column_name in features:
        encoded_values = label_encoder.fit_transform(data[column_name])
        data[column_name] = encoded_values
    return data

# Clean the merged frame (dropping the two transaction bookkeeping columns),
# label-encode the categorical columns, and persist the result.
categorical_features = get_all_categorical_features()
clean_data = (
    cleaning_data(train_data, ['TransactionID', 'TransactionDT'])
    .pipe(label_encode_categorical_features, categorical_features)
)
# NOTE(review): to_csv also writes the row index as an unnamed first column
# by default - confirm downstream readers expect that.
clean_data.to_csv('../data/clean/merged_train_n_identity.csv')
clean_data.head()


In [None]:
# Show the first three rows of the cleaned, encoded frame.
clean_data.head(n=3)