In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model, preprocessing
from sklearnex import patch_sklearn
patch_sklearn()
from patsy import dmatrices, dmatrix
import copy
import re
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


### Data Cleaning

In [4]:
data = pd.read_csv('C:/Users/Chris Li/Documents/UC Davis/STA 208/Data/bank-additional/bank-additional-full.csv', sep = ';') 

# replace dots with underscores in variable names, to make compatible with patsy
data.columns = list(map(lambda x: re.sub(r"\.", "_", x), data.columns))

# change pdays variable to binary, to deal with 999 issue
data['pdays'] = np.where(data['pdays']==999, "client NOT previously contacted", "client WAS previously contacted")

# drop duration and default. drop duration as recommended by UCI. drop default because answer is very rarely yes
data = data.drop(columns=['default', 'duration'])

In [5]:
# identify quantitative vars
datatypes = pd.DataFrame(data.dtypes)
datatypes = datatypes[datatypes.apply(lambda x: (x[0] == "float64" or x[0] == "int64"), axis=1)].reset_index()
quant_vars = datatypes.iloc[:,0].tolist()


In [6]:
# standardize quant vars
non_quant_vars = list(set(data.columns) - set(quant_vars))

stand = ColumnTransformer(
     [("num", StandardScaler(), quant_vars),
     ('pass', 'passthrough',non_quant_vars)])
stand.fit(data)

# need to add back data types
data_stand = pd.DataFrame(stand.transform(data), columns = quant_vars + non_quant_vars).convert_dtypes()

# change uppercase float64 to lowercase
float64_cols = list(data_stand.select_dtypes(include='Float64'))
data_stand[float64_cols] = data_stand[float64_cols].astype('float64')

string_cols = list(data_stand.select_dtypes(include='string'))
data_stand[string_cols] = data_stand[string_cols].astype('object')


In [7]:
# train test split
from sklearn.model_selection import train_test_split
train_cat,test_cat = train_test_split(data_stand, test_size = 0.2, train_size = 0.8, random_state = 22)

Output Clenaed Dataset with Categorical Vairables Left as String Columns

In [8]:
train_cat.to_csv('C:/Users/Chris Li/Documents/UC Davis/STA 208/Data/train_categ_vars_one_column.csv')
test_cat.to_csv('C:/Users/Chris Li/Documents/UC Davis/STA 208/Data/test_categ_vars_one_column.csv')

### Make Design matrix, dummy variable style, for logistic regression

In [10]:

# make equation of Xs for patsy
x_cols = data_stand.loc[:, set(data_stand.columns) - set(['y'])].columns.tolist()
x_eq = ' + '.join(x_cols)
x_eq

  x_cols = data_stand.loc[:, set(data_stand.columns) - set(['y'])].columns.tolist()


'housing + emp_var_rate + marital + euribor3m + age + poutcome + contact + day_of_week + cons_price_idx + loan + nr_employed + previous + education + cons_conf_idx + pdays + campaign + job + month'

In [11]:
# make numpy design matrix for patsy
x_temp = copy.copy(data_stand)
x = dmatrix(x_eq, x_temp, return_type = 'dataframe')

final_data = pd.concat([data_stand['y'],x], axis = 1)

In [12]:
# train test split
from sklearn.model_selection import train_test_split
train,test = train_test_split(final_data, test_size = 0.2, train_size = 0.8, random_state = 22)

Output Data Formatted as Design Matrix

In [14]:
train.to_csv('C:/Users/Chris Li/Documents/UC Davis/STA 208/Data/train.csv')

In [15]:
test.to_csv('C:/Users/Chris Li/Documents/UC Davis/STA 208/Data/test.csv')

### Make Cleaned Dataset where Categorical Variables are One Hot Encoded

In [16]:
# start with standardized data

# select categorical columns

cat_data = copy.copy(data_stand[list(set(non_quant_vars) - set(['y']))])
num_data = copy.copy(data_stand[quant_vars])

cat_columns = copy.copy(cat_data.columns + "_var")
og_cat_cols = cat_data.columns
for col in og_cat_cols:
    cat_data[col] = cat_data[col].astype('category')
    cat_data[col + "_var"] = cat_data[col].cat.codes

enc = OneHotEncoder()

# Passing encoded columns
enc_data = pd.DataFrame(enc.fit_transform(
      cat_data[cat_columns]).toarray())

enc_data.columns = enc.get_feature_names_out().tolist()

final_one_hot_data = pd.concat([data_stand['y'],enc_data], axis = 1)
final_one_hot_data = pd.concat([final_one_hot_data, num_data], axis = 1)


Crosswalk between categorical variable indices and level names

In [17]:
for col in og_cat_cols:
    print(cat_data[col].cat.categories)

Index(['no', 'unknown', 'yes'], dtype='object')
Index(['divorced', 'married', 'single', 'unknown'], dtype='object')
Index(['basic.4y', 'basic.6y', 'basic.9y', 'high.school', 'illiterate',
       'professional.course', 'university.degree', 'unknown'],
      dtype='object')
Index(['client NOT previously contacted', 'client WAS previously contacted'], dtype='object')
Index(['failure', 'nonexistent', 'success'], dtype='object')
Index(['cellular', 'telephone'], dtype='object')
Index(['fri', 'mon', 'thu', 'tue', 'wed'], dtype='object')
Index(['admin.', 'blue-collar', 'entrepreneur', 'housemaid', 'management',
       'retired', 'self-employed', 'services', 'student', 'technician',
       'unemployed', 'unknown'],
      dtype='object')
Index(['apr', 'aug', 'dec', 'jul', 'jun', 'mar', 'may', 'nov', 'oct', 'sep'], dtype='object')
Index(['no', 'unknown', 'yes'], dtype='object')


In [18]:
# train test split
from sklearn.model_selection import train_test_split
train_one_hot,test_one_hot = train_test_split(final_one_hot_data, test_size = 0.2, train_size = 0.8, random_state = 22)

output one hot encoded data

In [19]:
train_one_hot.to_csv('C:/Users/Chris Li/Documents/UC Davis/STA 208/Data/train_one_hot.csv')

In [20]:
test_one_hot.to_csv('C:/Users/Chris Li/Documents/UC Davis/STA 208/Data/test_one_hot.csv')