# Part 3 - Data Prep

Course reference: https://www.udemy.com/course/feature-engineering-for-machine-learning

* Types and characteristics of data
* Missing data imputation
* Categorical encoding
* Variable transformation
* Discretization
* Outliers
* Datetime
* Scaling
* Feature creation

## Load Data

In [None]:
import pandas as pd

# Load the raw dataset used throughout this notebook.
# NOTE(review): '/content/...' is an absolute path (looks like a Colab working
# directory — confirm); the notebook only runs where that file exists.
df = pd.read_csv('/content/created_raw_data.csv')
print(df.shape)
# DataFrame.info() prints its summary itself and returns None, so it is called
# bare; wrapping it in print() appends a stray "None" line to the output.
df.info()
df.head()

(1010, 41)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1010 entries, 0 to 1009
Data columns (total 41 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   binary                 808 non-null    object 
 1   given_name             707 non-null    object 
 2   surname                505 non-null    object 
 3   date_of_birth          807 non-null    object 
 4   phone_number           909 non-null    object 
 5   email                  958 non-null    object 
 6   address                633 non-null    object 
 7   city                   859 non-null    object 
 8   state                  865 non-null    object 
 9   zipcode                640 non-null    float64
 10  semi_constant_2        1010 non-null   object 
 11  multicollinearity 3    706 non-null    float64
 12  constant_1             1010 non-null   object 
 13  random choice 2        709 non-null    object 
 14  informative_2          1010 non-null   float6

Unnamed: 0,binary,given_name,surname,date_of_birth,phone_number,email,address,city,state,zipcode,...,uniform corr 2,multicollinearity 2,outliers 1,multicollinearity 4,random label num 3,standard scaling,corr_feature_class,informative_1,constant_2,class
0,,Shawn,,1933-08-31,(968)487-2956x27427,walkeramanda@example.com,045 Fowler Spring Apt. 450,West Bruce,MA,32951.0,...,0.336978,-1.030274,0.768145,0.371535,label num lo 3,53962.853545,3.184217,-0.999102,constant_value,1
1,binary_2,John,Bryant,2004-12-19,731.899.4368x677,,76988 Tony Plains Suite 161,Shannonhaven,ME,,...,0.832943,,0.274048,0.195892,label num lo 3,42805.421121,,1.246686,constant_value,1
2,binary_1,,,1944-02-04,001-877-232-0290x332,,,Kaylafort,WA,3222.0,...,0.685752,0.844525,0.188039,-0.239963,label num lo 2,42575.623356,,0.962777,constant_value,1
3,binary_2,,,,768-419-9512x282,hcollins@example.net,585 Rivas River,East Rachel,DC,,...,0.777135,0.174121,1.784847,-0.770139,label num lo 1,46758.872175,4.950943,-2.957441,constant_value,1
4,binary_1,Shannon,Williamson,1993-01-06,,davidanderson@example.org,322 Roberts Mountains Suite 220,Katiebury,KS,83537.0,...,0.992948,,0.846802,0.968097,label num lo 1,64374.589787,2.071837,1.141165,constant_value,1


In [None]:
import preppy.utils as utils
from preppy.version import __version__

# Record the PrepPy version next to the report it produced.
print(__version__)

# Run PrepPy's data-prep report on the loaded frame.
# NOTE(review): the meaning of `thresh` is defined inside preppy — confirm.
utils.report.write_report(df, thresh=0.5)

PrepPy Version: 0.1.0
REPORT FOR DATA PREP

#################################################
Columns with Constant Values
#################################################
['constant_1', 'constant_2']

#################################################
Columns with Quasi-Constant Values
#################################################
['class', 'constant_1', 'constant_2']

#################################################
Duplicate Rows
#################################################
20

#################################################
Duplicate Columns
#################################################
['constant_2', 'duplicate_2', 'informative_1']

#################################################
Variables with Noticeably Higher Scales
#################################################
Features with Noticeably Higher Scales (Based on Standard Deviation):
zipcode             28281.011740
standard scaling     9799.815458
Name: std, dtype: float64

Features with Noticeably Higher Sca

In [None]:
import preppy.utils as preppy

# Ask PrepPy which columns it flags as constant, quasi-constant and duplicated.
consts, quasi_consts, duplicates = (
    preppy.functions.identify_consts(df),
    preppy.functions.identify_quasi_consts(df),
    preppy.functions.check_col_duplicates(df),
)

# Show each list on its own line: duplicates, constants, quasi-constants.
print(duplicates, consts, quasi_consts, sep='\n')

['constant_2', 'duplicate_2', 'informative_1']
['constant_1', 'constant_2']
['constant_1', 'constant_2']


In [None]:
# Remove every column flagged as constant, quasi-constant or duplicated from
# the variable-type bookkeeping lists.
# NOTE(review): df_numerical, df_object, df_categorical_features and
# df_discreet are only assigned in a later cell (from var_types); on a fresh
# kernel this cell raises NameError unless that cell has been run first —
# consider moving this cell below it.
all_deletes = sorted(set(consts + quasi_consts + duplicates))  # sorted -> stable print order
for col in all_deletes:
  col_dtype = df[col].dtype
  print(col, col_dtype)
  # Pick the bookkeeping list(s) that track columns of this dtype.
  if col_dtype in ['float64', 'int64']:
    type_lists = [df_numerical]
  elif col_dtype in ['object']:
    type_lists = [df_object, df_categorical_features]
  else:
    type_lists = [df_discreet]
  for type_list in type_lists:
    # list.remove raises ValueError when the item is absent, which made this
    # cell fail on a second run; skip names that are already gone.
    if col in type_list:
      type_list.remove(col)


constant_1 object
informative_1 float64
constant_2 object
duplicate_2 float64


## PrepPy Pipeline

In [None]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
import preppy.utils as preppy

# Cleaning steps, applied in this order by the sklearn Pipeline below.
pipe = [
    ('constants', preppy.classes.RemoveConstants()),
    ('quasiconsts', preppy.classes.RemoveQuasiConstants(thresh=0.8)),
    ('duplicates', preppy.classes.DropDuplicates()),
    ('missing', preppy.classes.HandleMissingValues()),
    # ('encoding', HandleCatEncodeing())
]

pipe_model = Pipeline(pipe)
data = pipe_model.fit_transform(df)

# Rebuild column labels for the transformed data: keep the original column
# order and skip the columns flagged in consts / quasi_consts / duplicates.
# The flagged names are collected into a set once instead of re-concatenating
# the three lists for every column.
# NOTE(review): this assumes the pipeline drops exactly the flagged columns.
# quasi_consts came from identify_quasi_consts with its default settings while
# the pipeline step uses thresh=0.8 — confirm both agree.
dropped_cols = set(consts) | set(quasi_consts) | set(duplicates)
cols = [col for col in df.columns if col not in dropped_cols]
nu_df = pd.DataFrame(data, columns=cols)
nu_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 37 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   binary                 1000 non-null   object
 1   given_name             1000 non-null   object
 2   surname                1000 non-null   object
 3   date_of_birth          1000 non-null   object
 4   phone_number           1000 non-null   object
 5   email                  1000 non-null   object
 6   address                1000 non-null   object
 7   city                   1000 non-null   object
 8   state                  1000 non-null   object
 9   zipcode                1000 non-null   object
 10  semi_constant_2        1000 non-null   object
 11  multicollinearity 3    1000 non-null   object
 12  random choice 2        1000 non-null   object
 13  informative_2          1000 non-null   object
 14  correlated w target 2  1000 non-null   object
 15  uniform corr 1        

  dfx[feat] = df[feat].fillna(df[feat].mode()[0])


In [None]:
import pickle

# Restore the variable-type lookup stored in 'var_types.pkl'.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles that you created or otherwise trust.
with open('var_types.pkl', 'rb') as pkl_file:
    var_types = pickle.load(pkl_file)

print(var_types)

{'df_numerical': ['zipcode', 'multicollinearity 3', 'informative_2', 'correlated w target 2', 'uniform corr 1', 'target', 'min max scaling', 'correlated w target 1', 'outliers 2', 'multicollinearity 1', 'duplicate_2', 'duplicate_1', 'uniform corr 2', 'multicollinearity 2', 'outliers 1', 'multicollinearity 4', 'standard scaling', 'corr_feature_class', 'informative_1', 'class'], 'df_object': ['binary', 'given_name', 'surname', 'date_of_birth', 'phone_number', 'email', 'address', 'city', 'state', 'semi_constant_2', 'constant_1', 'random choice 2', 'pd qcut1', 'semi_constant_1', 'pd qcut3', 'random choice 7', 'random choice 4', 'pd qcut2', 'random label num 14', 'random label num 3', 'constant_2'], 'df_discreet': [], 'df_categorical_features': ['binary', 'given_name', 'surname', 'date_of_birth', 'phone_number', 'email', 'address', 'city', 'state', 'semi_constant_2', 'constant_1', 'random choice 2', 'pd qcut1', 'semi_constant_1', 'pd qcut3', 'random choice 7', 'random choice 4', 'pd qcut2',

In [None]:
# Unpack the variable-type lists from var_types into their own names.
# Each list is copied so that in-place edits of these names (the notebook
# calls .remove() on them) do not silently change var_types as well, and so
# that re-running this cell restores the lists as loaded.
df_numerical = list(var_types['df_numerical'])
df_object = list(var_types['df_object'])
df_discreet = list(var_types['df_discreet'])
df_categorical_features = list(var_types['df_categorical_features'])

In [None]:
# code along
# Keep the nu_df columns that var_types lists as numerical (in nu_df's column
# order), then cast those columns to float.
df_numerical = list(
    filter(lambda name: name in var_types['df_numerical'], nu_df.columns)
)
nu_df[df_numerical] = nu_df[df_numerical].astype(float)
nu_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 37 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   binary                 1000 non-null   object 
 1   given_name             1000 non-null   object 
 2   surname                1000 non-null   object 
 3   date_of_birth          1000 non-null   object 
 4   phone_number           1000 non-null   object 
 5   email                  1000 non-null   object 
 6   address                1000 non-null   object 
 7   city                   1000 non-null   object 
 8   state                  1000 non-null   object 
 9   zipcode                1000 non-null   float64
 10  semi_constant_2        1000 non-null   object 
 11  multicollinearity 3    1000 non-null   float64
 12  random choice 2        1000 non-null   object 
 13  informative_2          1000 non-null   float64
 14  correlated w target 2  1000 non-null   float64
 15  unifo

## Feature Engineering

### Feature Combination

In [None]:
# Create a new variable by adding the two scaling columns, then drop the
# originals. Written so the cell can be re-run: dropping the inputs in place
# made a second run fail with KeyError on the missing columns.
# NOTE(review): this works on the raw `df`, not on the cleaned `nu_df` built
# by the pipeline cell — confirm that is intended.
scaling_cols = ['standard scaling', 'min max scaling']
if 'scaling_combined' not in df.columns:
    df['scaling_combined'] = df['standard scaling'] + df['min max scaling']
# errors='ignore' turns the drop into a no-op once the columns are gone.
df = df.drop(columns=scaling_cols, errors='ignore')

### Categorical Encoding

In [None]:
# code along
import preppy.utils as utils

# Impute missing values before applying do_OHE: fill each object column with
# its most frequent value.
for col in df.select_dtypes(include=['object']).columns:
    col_modes = df[col].mode()
    # mode() is empty when every value in the column is missing; indexing it
    # with [0] would raise, so such a column is left untouched.
    if col_modes.empty:
        continue
    df[col] = df[col].fillna(col_modes.iloc[0])

# NOTE(review): `df` is overwritten with do_OHE's result, so re-running this
# cell feeds the already-processed frame back into do_OHE.
df = utils.functions.do_OHE(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1010 entries, 0 to 1009
Data columns (total 45 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   binary                             808 non-null    float64
 1   given_name                         1010 non-null   float64
 2   surname                            1010 non-null   float64
 3   date_of_birth                      1010 non-null   float64
 4   phone_number                       1010 non-null   float64
 5   email                              1010 non-null   float64
 6   address                            1010 non-null   float64
 7   city                               1010 non-null   float64
 8   state                              1010 non-null   float64
 9   zipcode                            640 non-null    float64
 10  multicollinearity 3                706 non-null    float64
 11  random choice 2                    1010 non-null   int64

In [None]:
# Write the prepared frame to CSV without the index column.
df.to_csv(path_or_buf='prepared_data.csv', index=False)