In [12]:
import pandas as pd
import numpy as np
from scipy import stats
from scipy import special
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import joblib
from sklearn.decomposition import PCA

In [2]:
data = pd.read_csv('../data/train_test_data/train.csv')
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,5442-PPTJY,Male,0,Yes,Yes,12,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.7,258.35,No
1,6261-RCVNS,Female,0,No,No,42,Yes,No,DSL,Yes,...,Yes,Yes,No,Yes,One year,No,Credit card (automatic),73.9,3160.55,Yes
2,2176-OSJUV,Male,0,Yes,No,71,Yes,Yes,DSL,Yes,...,No,Yes,No,No,Two year,No,Bank transfer (automatic),65.15,4681.75,No
3,6161-ERDGD,Male,0,Yes,Yes,71,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,No,Electronic check,85.45,6300.85,No
4,2364-UFROM,Male,0,No,No,30,Yes,No,DSL,Yes,...,No,Yes,Yes,No,One year,No,Electronic check,70.4,2044.75,No


In [3]:
def data_overview(df):
    print("Rows :  " , df.shape[0])
    print("Columns:  " , df.shape[1] )
    print()
    print("Feature types:  ")
    print(df.dtypes)
    print()
    print('Missing values : ')
    print(df.isnull().sum()) 
    print()
    print('Unique values:')
    print(df.nunique())
data_overview(data)

# no missing values!

Rows :   5634
Columns:   21

Feature types:  
customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

Missing values : 
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Con

In [4]:
data.drop('customerID',axis=1,inplace=True)

In [5]:
# categorize tenure
bins = [0, 12, 24, 36, 48, 60, 72]
data['tenure'] = pd.cut(data['tenure'], bins)
data['tenure'].value_counts()
data['tenure']=data['tenure'].astype(object)

In [6]:
data['SeniorCitizen']=data['SeniorCitizen'].astype(object) # numeric->object
data['TotalCharges']=pd.to_numeric(data['TotalCharges'],errors='coerce').fillna(0)  # object->numeric

In [8]:
cat_cols=[]
num_cols=[]

# select_col_num_features
for col_name in data.columns:
    if data[col_name].dtypes == 'object':
        cat_cols.append(col_name)
    else:
        num_cols.append(col_name)

print("categorical features: ")
print(cat_cols)
print("numerical features: ")
print(num_cols)

categorical features: 
['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn']
numerical features: 
['MonthlyCharges', 'TotalCharges']


In [9]:
for col in num_cols:
    print("Skewness for "+str(col)+": " +str(data[col].skew()))
    if data[col].skew()>0.5 or data[col].skew()<-0.5:
        data[col] = special.boxcox1p(data[col], stats.boxcox_normmax(data[col] + 1))
        print("Skewness for "+str(col)+": " +str(data[col].skew()))

Skewness for MonthlyCharges: -0.22348699170214392
Skewness for TotalCharges: 0.9560825765922467
Skewness for TotalCharges: -0.11099295461761675


In [10]:
bin_cols = data.nunique()[data.nunique()==2].keys().tolist()
multi_cols = [col for col in cat_cols if col not in bin_cols]
print("2-class features: ")
print(bin_cols)
print("multi-class features: ")
print(multi_cols)

2-class features: 
['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
multi-class features: 
['tenure', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod']


In [13]:
# encode categorical features
le = LabelEncoder()
data_bin = data[bin_cols]
for col in bin_cols:
    data_bin[col] = le.fit_transform(data_bin[col])
    
data_multi = pd.get_dummies(data = data[multi_cols], columns=multi_cols)

# scaling numerical features
ss = StandardScaler()
data_num = ss.fit_transform(data[num_cols])
data_num = pd.DataFrame(data_num,columns=num_cols)
joblib.dump(ss,'../models/scaler.m')  # save StandardScaler

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


['../models/scaler.m']

In [14]:
new_data = pd.concat([data_bin,data_multi,data_num],axis=1)
new_data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,PaperlessBilling,Churn,"tenure_(0.0, 12.0]","tenure_(12.0, 24.0]","tenure_(24.0, 36.0]",...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,MonthlyCharges,TotalCharges
0,1,0,1,1,1,0,0,1,0,0,...,0,0,0,1,0,0,0,1,-1.49753,-0.999585
1,0,0,0,0,1,0,1,0,0,0,...,1,0,1,0,0,1,0,0,0.302996,0.696496
2,1,0,1,0,1,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0.01232,1.083687
3,1,0,1,1,1,0,0,0,0,0,...,1,0,1,0,0,0,1,0,0.686687,1.405025
4,1,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,0,1,0,0.186726,0.312977


In [17]:
new_data.to_csv('../data/featured_data/featured_train.csv',index=False)