In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler

from wrangle import wrangle_telco
import split_scale

Pull in data using wrangle_telco from the wrangle.py file. This will return a dataframe that we name df. Check the shape

In [2]:
df = wrangle_telco()
df, df.shape

(     customer_id  monthly_charges  tenure  total_charges contract_type
 5348  0013-SMEOE           109.70      71        7904.25      Two year
 5349  0014-BMAQU            84.65      63        5377.80      Two year
 5350  0016-QLJIS            90.45      65        5957.90      Two year
 5351  0017-DINOC            45.20      54        2460.55      Two year
 5352  0017-IUDMW           116.80      72        8456.75      Two year
 ...          ...              ...     ...            ...           ...
 7038  9964-WBQDJ            24.40      71        1725.40      Two year
 7039  9972-EWRJS            19.25      67        1372.90      Two year
 7040  9975-GPKZU            19.75      46         856.50      Two year
 7041  9993-LHIEB            67.85      67        4627.65      Two year
 7042  9995-HOTOH            59.00      63        3707.60      Two year
 
 [1685 rows x 5 columns], (1685, 5))

After the dataframe is pulled in, create a function to split the data using the import from sklearn.model_selection. Use the train_test_split function.
The random_state parameter allows the data to be selected randomly from the whole dataframe instead of the data being selected sequentially. For the purpose of reporduceability, set the random_state to a seed

In [3]:
def split_data(df, train_pct=0.75, seed=123):
    train, test = train_test_split(df, train_size=train_pct, random_state=seed)
    return train, test

Call the function. Remember to set the variable names as a tuple as the function returns two things

In [4]:
train, test = split_data(df)

Look at the train data to ensure it looks like what you think it should. Check the shape. There should be less rows but the same number of columns

In [5]:
train.head()

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges,contract_type
5663,1971-DTCZB,90.95,72,6468.6,Two year
6552,7157-SMCFK,19.75,61,1124.2,Two year
5695,2167-FQSTQ,92.4,72,6786.1,Two year
6816,8659-IOOPU,100.45,71,7159.7,Two year
6719,8125-QPFJD,84.9,72,6065.3,Two year


In [6]:
train.shape

(1263, 5)

There are now 1263 rows. Original df had 1685. Df has been split correctly

Create a function that separates the target and dependent features into test and train variables

In [7]:
def create_train_test_variables(train, test):
    X_train = train[['tenure']]
    X_test = test[['tenure']]
    y_train = [['total_charges']]
    y_test = [['total_charges']]
    return X_train, X_test, y_train, y_test

In [8]:
X_train, X_test, y_train, y_test = create_train_test_variables(train, test)

Create a function that returns the scaler object, the X_train and X_test data scaled and in a dataframe format. Use scaler imports from sklearn library. 
Follow the following format:
> 1. Create the scaler object
> 2. Fit the object using the train variable. (This is what actually trians the model)
> 3. Transform the scaler and set the column names and indexes to be the same as the original df
> 4. Repeat steps 2 and 3 for the test variable
> 5. Return the scaler object, and the X_train and X_test scaled data. They should now be dataframes

In [9]:
def standard_scaler(X_train, X_test):
    """
    Takes in X_train and X_test dataframes with numeric values only
    Returns scaler, X_train_scaled, X_test_scaled dfs
    """
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = (pd.DataFrame(scaler.transform(X_train), 
                      columns=X_train.columns, 
                      index=X_train.index))
    X_test_scaled = (pd.DataFrame(scaler.transform(X_test), 
                     columns=X_test.columns,
                     index=X_test.index))
    return scaler, X_train_scaled, X_test_scaled

Call the funtion, remember that the output is a tuple with three outputs so use three names for the varibles

In [10]:
scaler, X_train_scaled, X_test_scaled = split_scale.standard_scaler(X_train, X_test)

Look at the train and test scaled data

In [11]:
X_train_scaled.head()

Unnamed: 0,tenure
5663,0.849158
6552,0.225025
5695,0.849158
6816,0.792418
6719,0.849158


In [12]:
X_test_scaled.head()

Unnamed: 0,tenure
5653,0.849158
5800,0.565461
6265,0.338504
6769,0.395243
6905,-1.874328


Create a function to add the scaled train data to the original dataframe. This will make it easier in exploration if there is a case where exploring the non-scaled data is useful

In [13]:
df_plus_train = train.merge(X_train_scaled, how='inner', on=None, left_index=True, right_index=True)
df_plus_train.columns

Index(['customer_id', 'monthly_charges', 'tenure_x', 'total_charges',
       'contract_type', 'tenure_y'],
      dtype='object')

In [14]:
df_plus_train.head()

Unnamed: 0,customer_id,monthly_charges,tenure_x,total_charges,contract_type,tenure_y
5663,1971-DTCZB,90.95,72,6468.6,Two year,0.849158
6552,7157-SMCFK,19.75,61,1124.2,Two year,0.225025
5695,2167-FQSTQ,92.4,72,6786.1,Two year,0.849158
6816,8659-IOOPU,100.45,71,7159.7,Two year,0.792418
6719,8125-QPFJD,84.9,72,6065.3,Two year,0.849158


Create a function to adjust column names 

In [15]:
df_plus_train.columns = ['customer_id', 'monthly_charges', 'tenure', 'total_charges', 'contract_type','tenure_scaled']
df_plus_train

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges,contract_type,tenure_scaled
5663,1971-DTCZB,90.95,72,6468.60,Two year,0.849158
6552,7157-SMCFK,19.75,61,1124.20,Two year,0.225025
5695,2167-FQSTQ,92.40,72,6786.10,Two year,0.849158
6816,8659-IOOPU,100.45,71,7159.70,Two year,0.792418
6719,8125-QPFJD,84.90,72,6065.30,Two year,0.849158
...,...,...,...,...,...,...
6479,6734-JDTTV,19.85,65,1267.05,Two year,0.451983
6704,8043-PNYSD,19.55,63,1245.60,Two year,0.338504
6764,8314-HTWVE,19.85,7,144.15,Two year,-2.838896
6747,8221-HVAYI,69.15,35,2490.15,Two year,-1.250196


In [16]:
df_plus_train.head()

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges,contract_type,tenure_scaled
5663,1971-DTCZB,90.95,72,6468.6,Two year,0.849158
6552,7157-SMCFK,19.75,61,1124.2,Two year,0.225025
5695,2167-FQSTQ,92.4,72,6786.1,Two year,0.849158
6816,8659-IOOPU,100.45,71,7159.7,Two year,0.792418
6719,8125-QPFJD,84.9,72,6065.3,Two year,0.849158


In [17]:
def scale_inverse(scaler, X_train_scaled, X_test_scaled):
    """Takes in the scaler and X_train_scaled and X_test_scaled dfs
       and returns the X_train and X_test dfs
       in their original forms before scaling
    """
    X_train_unscaled = (pd.DataFrame(scaler.inverse_transform(X_train_scaled), 
                      columns=X_train_scaled.columns, 
                      index=X_train_scaled.index))
    X_test_unscaled = (pd.DataFrame(scaler.inverse_transform(X_test_scaled), 
                     columns=X_test_scaled.columns,
                     index=X_test_scaled.index))
    return X_train_unscaled, X_test_unscaled

In [18]:
X_train_unscaled, X_test_unscaled = scale_inverse(scaler, X_train_scaled, X_test_scaled)

In [19]:
X_train_unscaled.head()

Unnamed: 0,tenure
5663,72.0
6552,61.0
5695,72.0
6816,71.0
6719,72.0


In [20]:
X_test_unscaled.head()

Unnamed: 0,tenure
5653,72.0
5800,67.0
6265,63.0
6769,64.0
6905,24.0


In [21]:
def uniform_scaler(X_train, X_test):
    """Quantile transformer, non_linear transformation - uniform.
       Reduces the impact of outliers, smooths out unusual distributions.
       Takes in a X_train and X_test dfs
       Returns the scaler, X_train_scaled, X_test_scaled
    """
    scaler = (QuantileTransformer(n_quantiles=100, 
                                  output_distribution='uniform', 
                                  random_state=123, copy=True)
                                  .fit(X_train))
    X_train_scaled = (pd.DataFrame(scaler.transform(X_train), 
                      columns=X_train.columns, 
                      index=X_train.index))
    X_test_scaled = (pd.DataFrame(scaler.transform(X_test), 
                     columns=X_test.columns,
                     index=X_test.index))
    return scaler, X_train_scaled, X_test_scaled

In [22]:
scaler, X_train_scaled, X_test_scaled = uniform_scaler(X_train, X_test)

In [23]:
X_train_scaled.head()

Unnamed: 0,tenure
5663,1.0
6552,0.429293
5695,1.0
6816,0.752525
6719,1.0


In [24]:
X_test_scaled.head()

Unnamed: 0,tenure
5653,1.0
5800,0.575758
6265,0.469697
6769,0.5
6905,0.081008


In [25]:
def gaussian_scaler(X_train, X_test):
    """Transforms and then normalizes data.
       Takes in X_train and X_test dfs, 
       yeo_johnson allows for negative data,
       box_cox allows positive data only.
       Returns Zero_mean, unit variance normalized X_train_scaled and X_test_scaled and scaler.
    """
    scaler = (PowerTransformer(method='yeo-johnson', 
                               standardize=False, 
                               copy=True)
                              .fit(X_train))
    X_train_scaled = (pd.DataFrame(scaler.transform(X_train), 
                      columns=X_train.columns, 
                      index=X_train.index))
    X_test_scaled = (pd.DataFrame(scaler.transform(X_test), 
                     columns=X_test.columns,
                     index=X_test.index))
    return scaler, X_train_scaled, X_test_scaled

In [26]:
scaler, X_train_scaled, X_test_scaled = gaussian_scaler(X_train, X_test)

In [27]:
X_train_scaled.head()

Unnamed: 0,tenure
5663,11321.790148
6552,7678.360832
5695,11321.790148
6816,10956.515751
6719,11321.790148


In [28]:
X_test_scaled.head()

Unnamed: 0,tenure
5653,11321.790148
5800,9564.272599
6265,8280.411252
6769,8591.348361
6905,885.680021


In [29]:
def min_max_scaler(X_train, X_test):
    """Transforms features by scaling each feature to a given range.
       Takes in X_train and X_test,
       Returns the scaler and X_train_scaled and X_test_scaled within range.
       Sensitive to outliers.
    """
    scaler = (MinMaxScaler(copy=True, 
                           feature_range=(0,1))
                          .fit(X_train))
    X_train_scaled = (pd.DataFrame(scaler.transform(X_train), 
                      columns=X_train.columns, 
                      index=X_train.index))
    X_test_scaled = (pd.DataFrame(scaler.transform(X_test), 
                     columns=X_test.columns,
                     index=X_test.index))
    return scaler, X_train_scaled, X_test_scaled

In [30]:
scaler, X_train_scaled, X_test_scaled = min_max_scaler(X_train, X_test)

In [31]:
X_train_scaled.head()

Unnamed: 0,tenure
5663,1.0
6552,0.84507
5695,1.0
6816,0.985915
6719,1.0


In [32]:
X_test_scaled.head()

Unnamed: 0,tenure
5653,1.0
5800,0.929577
6265,0.873239
6769,0.887324
6905,0.323944


In [33]:
def iqr_robust_scaler(X_train, X_test):
    """Scales features using stats that are robust to outliers
       by removing the median and scaling data to the IQR.
       Takes in a X_train and X_test,
       Returns the scaler and X_train_scaled and X_test_scaled.
    """
    scaler = (RobustScaler(quantile_range=(25.0,75.0), 
                           copy=True, 
                           with_centering=True, 
                           with_scaling=True)
                          .fit(X_train))
    X_train_scaled = (pd.DataFrame(scaler.transform(X_train), 
                      columns=X_train.columns, 
                      index=X_train.index))
    X_test_scaled = (pd.DataFrame(scaler.transform(X_test), 
                     columns=X_test.columns,
                     index=X_test.index))
    return scaler, X_train_scaled, X_test_scaled

In [34]:
scaler, X_train_scaled, X_test_scaled = iqr_robust_scaler(X_train, X_test)