# **Data Preprocessing**

### *1. Importing libraries and data*

In [None]:
import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive into the Colab runtime, then switch the working directory
# to the project's Data folder so that later relative paths resolve against it.
drive.mount('/content/drive')
%cd /content/drive/My\ Drive/2.\ Career\ Development/Data\ Science/4.\ Data\ Science\ Projects/Predicting\ Churn/Data

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/2. Career Development/Data Science/4. Data Science Projects/Predicting Churn/Data


In [None]:
# Load the raw data set; the relative path is resolved against the current working directory.
data = pd.read_csv(filepath_or_buffer='data.csv')

____________

### *2. Correcting data types*
* The data fields are in the correct format.

In [None]:
# NumPy array holding the DataFrame's contents (one row per record).
data_values = data.to_numpy()

____________

### *3. Missing data*

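* Values can be missing in both the numeric and the categorical columns; the check in 3.1 shows that this data set has none, so the strategy below did not need to be applied.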
* Missing categorical values will be replaced with the string constant '**missing_values**'
* Missing numeric values will be replaced by the **mean**.

**3.1 Missing Values per Column**

In [None]:
# Allow up to 81 rows to be displayed so the per-column summary is not truncated.
pd.set_option('display.max_rows', 81)

# Number of null entries per column, largest first.
missing_records = data.isna().sum().sort_values(ascending=False)

# Keep only the columns that actually contain nulls.
cols_with_missing = missing_records.loc[missing_records.gt(0)]
cols_with_missing

Series([], dtype: int64)

No missing values

__________

### *4. Encoding Categorical Variables*

**4.1 Identifying Categorical Variables**
* Text columns read from the CSV have the generic data type 'object'.
* To make the categorical columns identifiable, the 'object' columns are converted to the pandas 'string' data type in the step below.

In [None]:
# Column names grouped by their current dtype.
# `.items()` is used because `Series.iteritems()` was removed in pandas 2.0.
int_types = [col for col, dtype in data.dtypes.items() if str(dtype) == 'int64']
float_types = [col for col, dtype in data.dtypes.items() if str(dtype) == 'float64']
object_dtypes = [col for col, dtype in data.dtypes.items() if str(dtype) == 'object']

# The int64/float64 columns already carry the desired dtype, so only the
# 'object' columns need converting. A single astype call with a mapping avoids
# copying the whole frame once per column.
data = data.astype({col: 'string' for col in object_dtypes})

**4.2 Encoding Independent Categorical Variables**
* Encoding categorical variables to dummy variables using one hot encoding.

In [None]:
# Categorical predictors: every 'string' column except the last one, which is
# left out of one-hot encoding here (presumably the 'churn' target, handled in 4.3).
# `.items()` is used because `Series.iteritems()` was removed in pandas 2.0.
index_categorical = [col for col, dtype in data.dtypes.items() if str(dtype) == 'string'][:-1]
enc = OneHotEncoder(handle_unknown = 'ignore')

encoded_frames = []
for col in index_categorical:
  dummies = enc.fit_transform(data[[col]]).toarray()
  # Take the dummy column names from the encoder's own category order so the
  # labels always match the columns of the encoded matrix.
  col_names = [col + str(category) for category in enc.categories_[0]]
  # Reuse data's index so the column-wise concatenation aligns row by row.
  encoded_frames.append(pd.DataFrame(dummies, columns = col_names, index = data.index))

# Original columns minus the encoded ones, followed by the dummy columns in
# the order the categorical columns were processed.
data_v2 = pd.concat([data.drop(columns = index_categorical)] + encoded_frames, axis = 1)

In [None]:
# Preview the columns from position 16 onward, i.e. the dummy columns appended by the encoding step.
data_v2.iloc[:, 16:].head()

Unnamed: 0,stateAK,stateAL,stateAR,stateAZ,stateCA,stateCO,stateCT,stateDC,stateDE,stateFL,stateGA,stateHI,stateIA,stateID,stateIL,stateIN,stateKS,stateKY,stateLA,stateMA,stateMD,stateME,stateMI,stateMN,stateMO,stateMS,stateMT,stateNC,stateND,stateNE,stateNH,stateNJ,stateNM,stateNV,stateNY,stateOH,stateOK,stateOR,statePA,stateRI,stateSC,stateSD,stateTN,stateTX,stateUT,stateVA,stateVT,stateWA,stateWI,stateWV,stateWY,area_codearea_code_408,area_codearea_code_415,area_codearea_code_510,international_planno,international_planyes,voice_mail_planno,voice_mail_planyes
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0


**4.3 Encoding Dependent Categorical Variables**
* Encoding the categorical dependent variable (`churn`) to integer class labels using the label encoder.

In [None]:
le = LabelEncoder()

# Flatten the single-column selection to 1-D, as the label encoder expects.
churn_codes = le.fit_transform(data[['churn']].values.ravel())
enc_label = pd.DataFrame({'churn': churn_codes})

# Swap the original 'churn' column for its integer-coded version, which ends
# up as the last column of data_v3.
data_v2 = data_v2.drop(columns = 'churn')
data_v3 = data_v2.join(enc_label)

_______________

### *5. Feature Scaling*



**5.1 Feature Scaling**
*   Scaling numeric values (located in cols 0-14) using standardization.

In [None]:
sc = StandardScaler()

# The first 15 columns of data_v3 are the ones being standardized.
# NOTE(review): the scaler is fitted on every row of data_v3; if a train/test
# split is applied later, consider fitting on the training rows only.
numeric_cols = data_v3.columns[:15]
scaled = sc.fit_transform(data_v3.iloc[:, :15].to_numpy())
data_v4 = pd.DataFrame(scaled, columns = numeric_cols)

In [None]:
# Preview the first five rows of the standardized columns.
data_v4.head(n=5)

Unnamed: 0,account_length,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls
0,0.170399,1.366857,-0.34551,1.163449,-0.345788,-0.093025,0.141841,-0.092493,1.070003,0.157309,1.07082,1.247901,-0.579164,1.248591,-0.426346
1,0.926186,-0.567911,1.169136,0.710014,1.169295,-1.57182,0.49349,-1.572341,-0.75332,0.207083,-0.752053,0.704379,0.232927,0.698342,-1.18896
2,-0.409038,-0.567911,2.206058,-1.456398,2.206218,-2.75207,-0.611691,-2.752473,-0.072057,-0.539526,-0.072337,-1.32477,1.045017,-1.328187,0.336268
3,-0.635774,-0.567911,-0.251076,0.659633,-0.251027,-1.032448,1.096316,-1.031447,-0.270676,1.053239,-0.270956,-0.056552,-0.579164,-0.053219,1.098882
4,0.523099,1.218029,0.702522,-0.59991,0.702027,2.952139,0.393019,2.951497,0.239775,0.903917,0.241038,-0.998657,1.045017,-0.992669,1.098882


**5.2 Joining Scaled DF to Base DF**
* Joining the scaled numeric df with the remaining columns of data_v3 (the one hot encoded categorical columns and the label encoded `churn` column).

In [None]:
# Append every column of data_v3 from position 15 onward to the scaled frame (index-aligned join).
unscaled_part = data_v3.iloc[:, 15:]
data_v5 = data_v4.join(unscaled_part)

_____________

### *6. Exporting Data Sets*

**6.1 Scaled Data Set**

In [None]:
# Switch the working directory to the project root so the export below is written there.
%cd /content/drive/My\ Drive/2.\ Career\ Development/Data\ Science/4.\ Data\ Science\ Projects/Predicting\ Churn/

/content/drive/My Drive/2. Career Development/Data Science/4. Data Science Projects/Predicting Churn


In [None]:
# Write the preprocessed data set to the working directory. The row index is
# written as the first (unnamed) column, which is pandas' default behaviour.
data_v5.to_csv('predicting_churn.csv', index=True)