## Import Data

In [1]:
#Import initial packages
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
sns.set_context('notebook') 
sns.set_style('ticks')
%matplotlib inline

In [2]:
#Let pandas show up to 100 columns when a frame is displayed
pd.set_option('display.max_columns', 100)

#Load the CSV and preview its last five rows
data = pd.read_csv('Clothing_Store.csv')
data.tail(n=5)

Unnamed: 0,HHKEY,ZIP_CODE,REC,FRE,MON,CC_CARD,AVRG,PC_CALC20,PSWEATERS,PKNIT_TOPS,PKNIT_DRES,PBLOUSES,PJACKETS,PCAR_PNTS,PCAS_PNTS,PSHIRTS,PDRESSES,PSUITS,POUTERWEAR,PJEWELRY,PFASHION,PLEGWEAR,PCOLLSPND,AMSPEND,PSSPEND,CCSPEND,AXSPEND,TMONSPEND,OMONSPEND,SMONSPEND,PREVPD,GMP,PROMOS,DAYS,FREDAYS,MARKDOWN,CLASSES,COUPONS,STYLES,STORES,STORELOY,VALPHON,WEB,MAILED,RESPONDED,RESPONSERATE,HI,LTFREDAY,CLUSTYPE,PERCRET,RESP
21735,9964400000000.0,63105,322,2,39.0,0,19.5,11,0.0,0.0,0.39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39.0,0.0,0.0,0.0,0.0,39.0,0.67,7,323,161.5,0.0,2,0,3,2,870,Y,0,1,0,0.0,15.83,107.66,1,1.51,0
21736,9964400000000.0,72207,53,6,280.59,0,46.76,11,0.02,0.0,0.0,0.06,0.22,0.03,0.0,0.1,0.0,0.0,0.0,0.0,0.13,0.0,0.0,0.0,0.0,213.22,67.37,131.36,0.0,280.59,0.0,0.58,12,203,33.83,0.13,7,0,12,3,4507,N,0,5,2,40.0,8.67,18.45,12,0.69,0
21737,9964400000000.0,77084,24,9,1274.96,0,141.66,16,0.29,0.04,0.01,0.14,0.23,0.12,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.02,0.11,0.0,785.31,489.65,0.0,356.0,356.0,567.99,0.0,0.61,17,341,37.88,0.14,13,1,46,4,1615,N,0,7,2,28.57,19.71,12.62,15,0.08,1
21738,9964400000000.0,78640,37,3,265.94,0,88.64,11,0.18,0.0,0.03,0.11,0.0,0.0,0.09,0.16,0.0,0.0,0.0,0.0,0.0,0.07,0.16,0.0,13.99,251.95,0.0,0.0,0.0,0.0,0.0,0.56,12,257,85.66,0.12,9,0,15,2,36,Y,0,7,0,0.0,11.63,32.12,38,0.37,0
21739,9964400000000.0,95605,21,5,555.42,1,111.08,11,0.21,0.09,0.0,0.21,0.24,0.0,0.05,0.1,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,47.99,426.47,80.96,361.97,176.0,361.97,0.0,0.56,19,270,54.0,0.24,9,0,22,6,89,Y,0,6,2,33.33,18.07,11.25,17,0.0,1


## Process Data and Remove Unknown or Unhelpful Variables

In [3]:
#Do a bit of data processing.
#The cell is written so that running it again on the already-processed frame
#leaves the frame unchanged instead of raising.

#Map valid phone to 0 and 1. Values other than 'N'/'Y' are passed through as-is,
#so flags that were already converted to 0/1 survive a re-run; unexpected text
#or missing values still raise in the integer cast below.
valid_phone = {'N':0, 'Y':1}
phone_flag = data['VALPHON'].map(lambda value: valid_phone.get(value, value))
#Pin the width explicitly: astype(int) yields int32 on Windows with NumPy < 2,
#which would make dtype-based column selection platform dependent.
data['VALPHON'] = phone_flag.astype('int64')

#Cluster is a category, not numerical
data['CLUSTYPE'] = data['CLUSTYPE'].astype(str)

dropped_columns = [
    #These three variables are not in documentation. Don't know what they mean so drop from analysis
    'PC_CALC20',
    'STYLES',
    'STORELOY',
    #Customer number, doesn't provide any insight to anything
    'HHKEY',
    #Too many zip codes. Could be insightful, but too difficult to implement
    'ZIP_CODE',
]
#errors='ignore' skips columns that are already gone, so a re-run does not raise KeyError
data = data.drop(columns=dropped_columns, errors='ignore')

In [4]:
#Preview the first five rows of the frame
data.head(n=5)

Unnamed: 0,REC,FRE,MON,CC_CARD,AVRG,PSWEATERS,PKNIT_TOPS,PKNIT_DRES,PBLOUSES,PJACKETS,PCAR_PNTS,PCAS_PNTS,PSHIRTS,PDRESSES,PSUITS,POUTERWEAR,PJEWELRY,PFASHION,PLEGWEAR,PCOLLSPND,AMSPEND,PSSPEND,CCSPEND,AXSPEND,TMONSPEND,OMONSPEND,SMONSPEND,PREVPD,GMP,PROMOS,DAYS,FREDAYS,MARKDOWN,CLASSES,COUPONS,STORES,VALPHON,WEB,MAILED,RESPONDED,RESPONSERATE,HI,LTFREDAY,CLUSTYPE,PERCRET,RESP
0,208,2,368.46,0,184.23,0.18,0.0,0.0,0.3,0.0,0.25,0.0,0.19,0.0,0.0,0.0,0.0,0.02,0.03,0.29,0.0,0.0,368.46,0.0,0.0,0.0,0.0,0.0,0.6,17,666,333.0,0.08,9,1,1,0,0,5,0,0.0,31.81,111.0,10,0.0,0
1,6,4,258.0,1,64.5,0.26,0.16,0.0,0.0,0.0,0.18,0.14,0.0,0.18,0.0,0.0,0.0,0.0,0.02,0.37,0.0,0.0,258.0,0.0,138.0,55.99,258.0,0.0,0.54,14,696,174.0,0.33,6,0,1,1,0,4,2,50.0,32.72,43.5,10,0.03,1
2,327,2,77.0,0,38.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,77.0,0.0,0.0,0.0,0.0,39.0,0.62,10,343,171.5,0.11,1,0,1,0,0,4,0,0.0,100.0,68.6,16,0.0,0
3,66,8,846.06,1,105.75,0.38,0.0,0.05,0.06,0.2,0.17,0.0,0.05,0.0,0.0,0.0,0.005307,0.03,0.01,0.0,0.0,0.0,846.06,0.0,104.94,0.0,373.87,166.25,0.43,24,701,87.62,0.29,15,3,1,1,0,9,6,66.67,23.27,26.96,10,0.0,0
4,49,1,87.44,0,87.44,0.2,0.2,0.0,0.0,0.0,0.0,0.41,0.0,0.0,0.0,0.0,0.17,0.0,0.0,0.0,0.0,0.0,87.44,0.0,87.44,0.0,87.44,0.0,0.22,0,49,49.0,0.42,4,0,1,1,0,0,0,0.0,28.52,24.5,20,0.0,0


In [5]:
#Print the frame summary: index range, column count, and per-column non-null count and dtype
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21740 entries, 0 to 21739
Data columns (total 46 columns):
REC             21740 non-null int64
FRE             21740 non-null int64
MON             21740 non-null float64
CC_CARD         21740 non-null int64
AVRG            21740 non-null float64
PSWEATERS       21740 non-null float64
PKNIT_TOPS      21740 non-null float64
PKNIT_DRES      21740 non-null float64
PBLOUSES        21740 non-null float64
PJACKETS        21740 non-null float64
PCAR_PNTS       21740 non-null float64
PCAS_PNTS       21740 non-null float64
PSHIRTS         21740 non-null float64
PDRESSES        21740 non-null float64
PSUITS          21740 non-null float64
POUTERWEAR      21740 non-null float64
PJEWELRY        21740 non-null float64
PFASHION        21740 non-null float64
PLEGWEAR        21740 non-null float64
PCOLLSPND       21740 non-null float64
AMSPEND         21740 non-null float64
PSSPEND         21740 non-null float64
CCSPEND         21740 non-null float64


In [6]:
#Split the columns of `data` into the response and four groups of predictors.

response = 'RESP'

#All variables: every column except the response
variables_all = [column for column in data.columns if column != response]

binary_all = ['CC_CARD', 'WEB', 'VALPHON'] #Binary

continuous_all = list(data.select_dtypes(['float64']).columns) #Continuous

categorical_all = ['CLUSTYPE'] #Categorical

#Discrete: integer columns that are neither binary flags nor the response.
#Any integer width is accepted, so a flag stored as int32 (astype(int) on
#Windows with NumPy < 2) is still recognised and filtered out here instead of
#making a list.remove() call fail.
already_grouped = set(binary_all) | {response}
discrete_all = [column for column in data.select_dtypes(include=np.integer).columns
                if column not in already_grouped]

#Sanity check: every column must belong to exactly one group
grouped_columns = binary_all + continuous_all + categorical_all + discrete_all + [response]
assert sorted(grouped_columns) == sorted(data.columns), \
    'variable groups do not match the columns of data'

#Displayed as the cell output; equals the number of columns in data
len(grouped_columns)

46