# Sourcing raw data and saving processed data

<ol>
    <li> Only columns <b>'Consumer complaint narrative'</b> and <b>'Product'</b> are needed. </li>
    <li> All observations with a missing value in the variable <b>'Consumer complaint narrative'</b> need to be removed. </li>
    <li> All duplicate observations in the dataframe need to be removed. </li>
    <li> Target variable 'Product' needs to be remapped based on the analysis done. </li>
    <li> The data needs to be split into training, testing and validation sets, and the resulting files saved. </li>
</ol>

# Importing Modules

In [1]:
%load_ext autotime

import os
import wget
import pandas as pd
import preprocessorRawdata as pp
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from feature_engine.imputation import DropMissingData

time: 953 ms (started: 2022-01-26 23:27:22 +05:30)


# Download and Load the Latest Data

In [2]:
# Fetch the latest archive from the complaints site. An archive left over in
# the working directory from an earlier run is deleted before downloading.
archive_name = 'complaints.csv.zip'
archive_url = 'https://files.consumerfinance.gov/ccdb/complaints.csv.zip'

if archive_name in os.listdir():
    os.unlink(archive_name)
wget.download(archive_url)

# Read the downloaded .zip directly, keeping only the two columns that are needed
con_com = pd.read_csv(archive_name,
                      compression='zip',
                      usecols=['Product', 'Consumer complaint narrative'])

100% [......................................................................] 398362492 / 398362492time: 37.3 s (started: 2022-01-26 23:27:23 +05:30)


# Configuration

In [6]:
# Target labels that several 'Product' values are merged into.
# Each label is spelled out once here and reused in the mapping below.
_CARD = 'Credit card or prepaid card'
_CREDIT_REPORTING = 'Credit reporting, credit repair services, or other personal consumer reports'
_MONEY_SERVICE = 'Money transfer, virtual currency, or money service'
_LOAN = 'Consumer loan, Vehicle loan or lease, Payday loan, title loan, or personal loan'
_BANK_ACCOUNT = 'Bank account or service, Savings account'

# Variable mappings: original 'Product' value -> merged target label
PRODUCT_MAPPING = {
    'Credit card': _CARD,
    'Prepaid card': _CARD,
    'Credit reporting': _CREDIT_REPORTING,
    'Money transfers': _MONEY_SERVICE,
    'Virtual currency': _MONEY_SERVICE,
    'Payday loan': _LOAN,
    'Other financial service': _MONEY_SERVICE,
    'Consumer Loan': _LOAN,
    'Vehicle loan or lease': _LOAN,
    'Payday loan, title loan, or personal loan': _LOAN,
    'Bank account or service': _BANK_ACCOUNT,
    'Checking or savings account': _BANK_ACCOUNT,
}

# Column(s) used as model input
INDEPENDENT_FEATURES = ['Consumer complaint narrative']

# Column(s) used as the prediction target
DEPENDENT_FEATURES = ['Product']

# Sizes handed to the train/test/validation split helper
TRAIN_SIZE = 10_000
TEST_SIZE = 3_000

time: 15 ms (started: 2022-01-26 23:28:51 +05:30)


# Pipeline

In [7]:
# Cleaning steps, listed in the order the pipeline runs them
cleaning_steps = [
    # 1. drop observations with missing data in the independent feature column(s)
    ('drop_missing_observation', DropMissingData(variables=INDEPENDENT_FEATURES)),
    # 2. drop duplicate observations
    ('drop_duplicate_observations', pp.DropDuplicateData()),
    # 3. remap the target variable according to PRODUCT_MAPPING
    ('target_variable_mapping', pp.Mapper(DEPENDENT_FEATURES, PRODUCT_MAPPING)),
]

# Assemble the pipeline from the steps above
price_pipe = Pipeline(cleaning_steps)

# Fit on the loaded frame and replace it with the transformed result
con_com = price_pipe.fit_transform(con_com)

time: 2.59 s (started: 2022-01-26 23:28:56 +05:30)


# Saving Train, Test and Valid split

In [8]:
# Split the feature and target columns into train / test / validation parts
split_parts = pp.trainTestValid_split(
    con_com[INDEPENDENT_FEATURES],
    con_com[DEPENDENT_FEATURES],
    trainsize=TRAIN_SIZE,
    testsize=TEST_SIZE,
)
trainX, testX, valX, trainY, testY, valY = split_parts

# Put the features and target of each part back side by side
train = pd.concat([trainX, trainY], axis=1)
test = pd.concat([testX, testY], axis=1)
valid = pd.concat([valX, valY], axis=1)

# Saving train, test and validation data (index column is not written)
for file_name, frame in (('train.csv', train),
                         ('test.csv', test),
                         ('valid.csv', valid)):
    frame.to_csv(file_name, index=False)

time: 24.1 s (started: 2022-01-26 23:28:58 +05:30)


# End of Notebook