In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler

from wrangle import wrangle_telco
import split_scale

In [2]:
# Load the working DataFrame through the wrangle_telco helper imported from the
# project-local wrangle module.
df = wrangle_telco()

In [3]:
def split_my_data(df, train_pct=0.70, seed=123):
    """Randomly partition ``df`` into a train portion and a test portion.

    Parameters
    ----------
    df : the data to split; handed to ``train_test_split`` unchanged.
    train_pct : forwarded as ``train_size`` (default 0.70).
    seed : forwarded as ``random_state`` so repeated runs give the same split
        (default 123).

    Returns
    -------
    tuple
        ``(train, test)`` exactly as returned by ``train_test_split``.
    """
    split_options = {"train_size": train_pct, "random_state": seed}
    train_part, test_part = train_test_split(df, **split_options)
    return train_part, test_part

In [4]:
# Split with the function's defaults (train_pct=0.70, seed=123).
train, test = split_my_data(df)

In [5]:
train.head()

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges,contract_type
5903,3396-DKDEL,115.15,70,8250.0,Two year
6866,8966-KZXXA,25.1,36,930.95,Two year
6846,8857-CUPFQ,19.25,63,1237.65,Two year
6505,6901-GOGZG,84.95,60,4984.85,Two year
6806,8606-OEGQZ,25.3,18,454.65,Two year


In [6]:
# Feature frames: tenure only. Double brackets keep a one-column DataFrame,
# which is the 2-D shape the scaler helpers below expect.
X_train = train[['tenure']]
X_test = test[['tenure']]
# Target frames: total_charges, taken from the matching split.
# (Previously these were bound to the bare literal [['total_charges']], i.e. a
# nested list of strings rather than a column selection.)
y_train = train[['total_charges']]
y_test = test[['total_charges']]

In [7]:
def standard_scaler(X_train, X_test):
    """
    Fit a StandardScaler on X_train and apply it to both frames.

    Takes in X_train and X_test dfs with numeric values only.
    Returns (scaler, X_train_scaled, X_test_scaled); the scaled frames keep
    the column names and index of their inputs.
    """
    scaler = StandardScaler()
    scaler.fit(X_train)  # statistics come from the training frame only

    def _scaled_copy(frame):
        # transform() output is wrapped back into a DataFrame so the labels survive
        return pd.DataFrame(
            scaler.transform(frame),
            columns=frame.columns,
            index=frame.index,
        )

    return scaler, _scaled_copy(X_train), _scaled_copy(X_test)

In [8]:
# NOTE(review): this calls standard_scaler from the imported split_scale module,
# not the standard_scaler defined in the notebook cell above — confirm the two
# implementations agree.
scaler, X_train_scaled, X_test_scaled = split_scale.standard_scaler(X_train, X_test)

In [9]:
X_train_scaled.head()

Unnamed: 0,tenure
5903,0.727945
6866,-1.218583
6846,0.327189
6505,0.155437
6806,-2.249098


In [10]:
X_test_scaled.head()

Unnamed: 0,tenure
5653,0.842447
5800,0.556192
6265,0.327189
6769,0.38444
6905,-1.905593


In [11]:
# Attach the scaled feature to the training rows by matching index labels.
# Both frames carry a 'tenure' column, so merge disambiguates them with its
# default '_x' (left) and '_y' (right) suffixes.
train_all = train.merge(
    X_train_scaled,
    how='inner',
    left_index=True,
    right_index=True,
)
train_all.columns

Index(['customer_id', 'monthly_charges', 'tenure_x', 'total_charges',
       'contract_type', 'tenure_y'],
      dtype='object')

In [12]:
train_all.head()

Unnamed: 0,customer_id,monthly_charges,tenure_x,total_charges,contract_type,tenure_y
5903,3396-DKDEL,115.15,70,8250.0,Two year,0.727945
6866,8966-KZXXA,25.1,36,930.95,Two year,-1.218583
6846,8857-CUPFQ,19.25,63,1237.65,Two year,0.327189
6505,6901-GOGZG,84.95,60,4984.85,Two year,0.155437
6806,8606-OEGQZ,25.3,18,454.65,Two year,-2.249098


In [15]:
# Rename the merge-suffixed columns by label instead of overwriting the whole
# column list positionally: this does not depend on column order, leaves every
# other column untouched, and is a no-op if the cell is run a second time.
train_all = train_all.rename(
    columns={'tenure_x': 'tenure', 'tenure_y': 'tenure_scaled'}
)

In [16]:
train_all.head()

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges,contract_type,tenure_scaled
5903,3396-DKDEL,115.15,70,8250.0,Two year,0.727945
6866,8966-KZXXA,25.1,36,930.95,Two year,-1.218583
6846,8857-CUPFQ,19.25,63,1237.65,Two year,0.327189
6505,6901-GOGZG,84.95,60,4984.85,Two year,0.155437
6806,8606-OEGQZ,25.3,18,454.65,Two year,-2.249098


In [17]:
def scale_inverse(scaler, X_train_scaled, X_test_scaled):
    """Undo a scaling step on the train and test frames.

    Takes in a fitted scaler (anything exposing ``inverse_transform``) and the
    X_train_scaled and X_test_scaled dfs it produced.
    Returns (X_train_unscaled, X_test_unscaled), each a DataFrame carrying the
    same columns and index as the corresponding scaled input.
    """
    restored = []
    for scaled in (X_train_scaled, X_test_scaled):
        values = scaler.inverse_transform(scaled)
        restored.append(
            pd.DataFrame(values, columns=scaled.columns, index=scaled.index)
        )
    X_train_unscaled, X_test_unscaled = restored
    return X_train_unscaled, X_test_unscaled

In [18]:
# Round-trip check: inverse-transform the scaled frames with the scaler that is
# currently bound, to compare against the original tenure values.
X_train_unscaled, X_test_unscaled = scale_inverse(scaler, X_train_scaled, X_test_scaled)

In [19]:
X_train_unscaled.head()

Unnamed: 0,tenure
5903,70.0
6866,36.0
6846,63.0
6505,60.0
6806,18.0


In [20]:
X_test_unscaled.head()

Unnamed: 0,tenure
5653,72.0
5800,67.0
6265,63.0
6769,64.0
6905,24.0


In [21]:
def uniform_scaler(X_train, X_test):
    """Quantile transform (non-linear) onto a uniform output distribution.

       Fits a QuantileTransformer (100 quantiles, random_state=123) on X_train
       and applies it to both frames.
       Takes in X_train and X_test dfs.
       Returns the scaler, X_train_scaled, X_test_scaled; the scaled frames
       keep the columns and index of their inputs.
    """
    scaler = QuantileTransformer(
        n_quantiles=100,
        output_distribution='uniform',
        random_state=123,
        copy=True,
    )
    scaler.fit(X_train)  # quantiles are learned from the training frame only

    X_train_scaled, X_test_scaled = (
        pd.DataFrame(scaler.transform(part), columns=part.columns, index=part.index)
        for part in (X_train, X_test)
    )
    return scaler, X_train_scaled, X_test_scaled

In [22]:
# NOTE: rebinds scaler, X_train_scaled and X_test_scaled, replacing the objects
# produced by the earlier standard-scaling cell.
scaler, X_train_scaled, X_test_scaled = uniform_scaler(X_train, X_test)

In [23]:
X_train_scaled.head()

Unnamed: 0,tenure
5903,0.686869
6866,0.136364
6846,0.469697
6505,0.40404
6806,0.053872


In [24]:
X_test_scaled.head()

Unnamed: 0,tenure
5653,1.0
5800,0.565657
6265,0.469697
6769,0.489899
6905,0.080808


In [25]:
def gaussian_scaler(X_train, X_test, method='yeo-johnson', standardize=False):
    """Apply a power transform to make the data more Gaussian-like.

       Fits a PowerTransformer on X_train and applies it to both frames.

       Takes in X_train and X_test dfs, plus two optional settings:
         method      -- 'yeo-johnson' (default; allows negative data) or
                        'box-cox' (positive data only).
         standardize -- forwarded to PowerTransformer. The default False keeps
                        this function's existing behavior: the output is the
                        raw power-transformed values and is NOT zero-mean /
                        unit-variance. Pass True to get zero-mean,
                        unit-variance normalized output.

       Returns the scaler, X_train_scaled, X_test_scaled; the scaled frames
       keep the columns and index of their inputs.
    """
    scaler = PowerTransformer(method=method,
                              standardize=standardize,
                              copy=True)
    scaler.fit(X_train)  # transform parameters are learned from the training frame only

    def _scaled_copy(frame):
        # wrap the transformed array back into a DataFrame so labels are preserved
        return pd.DataFrame(scaler.transform(frame),
                            columns=frame.columns,
                            index=frame.index)

    return scaler, _scaled_copy(X_train), _scaled_copy(X_test)

In [26]:
# Rebinds scaler / X_train_scaled / X_test_scaled to the power-transform results.
scaler, X_train_scaled, X_test_scaled = gaussian_scaler(X_train, X_test)

In [27]:
X_train_scaled.head()

Unnamed: 0,tenure
5903,12703.743914
6866,2615.616336
6846,9877.112399
6505,8791.727108
6806,519.394751


In [28]:
X_test_scaled.head()

Unnamed: 0,tenure
5653,13588.904397
5800,11441.158771
6265,9877.112399
6769,10255.492587
6905,1010.755807


In [29]:
def min_max_scaler(X_train, X_test):
    """Rescale each feature into the range (0, 1) with a MinMaxScaler.

       The scaler is fitted on X_train and then applied to both frames.
       Takes in X_train and X_test,
       Returns the scaler and X_train_scaled and X_test_scaled, each keeping
       the columns and index of its input.
       Sensitive to outliers.
    """
    target_range = (0, 1)
    scaler = MinMaxScaler(copy=True, feature_range=target_range)
    scaler.fit(X_train)

    train_values = scaler.transform(X_train)
    X_train_scaled = pd.DataFrame(train_values)
    X_train_scaled.columns = X_train.columns
    X_train_scaled.index = X_train.index

    test_values = scaler.transform(X_test)
    X_test_scaled = pd.DataFrame(test_values)
    X_test_scaled.columns = X_test.columns
    X_test_scaled.index = X_test.index

    return scaler, X_train_scaled, X_test_scaled

In [30]:
# Rebinds scaler / X_train_scaled / X_test_scaled to the min-max results.
scaler, X_train_scaled, X_test_scaled = min_max_scaler(X_train, X_test)

In [31]:
X_train_scaled.head()

Unnamed: 0,tenure
5903,0.971831
6866,0.492958
6846,0.873239
6505,0.830986
6806,0.239437


In [32]:
X_test_scaled.head()

Unnamed: 0,tenure
5653,1.0
5800,0.929577
6265,0.873239
6769,0.887324
6905,0.323944


In [34]:
def iqr_robust_scaler(X_train, X_test):
    """Scale with a RobustScaler: centering and scaling are both enabled and
       the quantile range is (25.0, 75.0), i.e. the IQR.

       The scaler is fitted on X_train and then applied to both frames.
       Takes in a X_train and X_test,
       Returns the scaler and X_train_scaled and X_test_scaled, each keeping
       the columns and index of its input.
    """
    settings = {
        'quantile_range': (25.0, 75.0),
        'copy': True,
        'with_centering': True,
        'with_scaling': True,
    }
    scaler = RobustScaler(**settings)
    scaler.fit(X_train)

    frames = {}
    for label, source in (('train', X_train), ('test', X_test)):
        frames[label] = pd.DataFrame(
            scaler.transform(source),
            columns=source.columns,
            index=source.index,
        )
    return scaler, frames['train'], frames['test']

In [35]:
# Rebinds scaler / X_train_scaled / X_test_scaled to the robust-scaler results.
scaler, X_train_scaled, X_test_scaled = iqr_robust_scaler(X_train, X_test)