IMPORTING LABRARIES

In [54]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split

Importing the datasets

In [2]:
data_raw = pd.read_csv('Social_Network_Ads.csv')

In [3]:
data_raw.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [4]:
data_raw.describe()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0,400.0
mean,15691540.0,37.655,69742.5,0.3575
std,71658.32,10.482877,34096.960282,0.479864
min,15566690.0,18.0,15000.0,0.0
25%,15626760.0,29.75,43000.0,0.0
50%,15694340.0,37.0,70000.0,0.0
75%,15750360.0,46.0,88000.0,1.0
max,15815240.0,60.0,150000.0,1.0


DROPPING User ID

In [5]:
data_raw = data_raw.drop(['User ID'], axis= 1)

In [6]:
data_raw.head()

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0
3,Female,27,57000,0
4,Male,19,76000,0


CHECKING FOR EMPTY ROWS 

In [7]:
data_raw.isnull().sum() 

Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

DUMMY VARIABLES

In [8]:
le_Gender = LabelEncoder()

In [9]:
data_raw['Gender_n'] = le_Gender.fit_transform(data_raw['Gender'])

In [10]:
data_raw.head()

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased,Gender_n
0,Male,19,19000,0,1
1,Male,35,20000,0,1
2,Female,26,43000,0,0
3,Female,27,57000,0,0
4,Male,19,76000,0,1


In [12]:
data_raw = data_raw.drop(['Gender'], axis= 1)

In [13]:
data_raw.head()

Unnamed: 0,Age,EstimatedSalary,Purchased,Gender_n
0,19,19000,0,1
1,35,20000,0,1
2,26,43000,0,0
3,27,57000,0,0
4,19,76000,0,1


In [25]:
data_unscaled = data_raw.copy()

In [26]:
data_unscaled

Unnamed: 0,Age,EstimatedSalary,Purchased,Gender_n
0,19,19000,0,1
1,35,20000,0,1
2,26,43000,0,0
3,27,57000,0,0
4,19,76000,0,1
...,...,...,...,...
395,46,41000,1,0
396,51,23000,1,1
397,50,20000,1,0
398,36,33000,0,1


SELECTING INPUTS AND OUTPUT

In [27]:
target = data_unscaled.Purchased

In [28]:
input_unscaled = data_unscaled.drop(['Purchased'], axis= 1)

In [29]:
input_unscaled

Unnamed: 0,Age,EstimatedSalary,Gender_n
0,19,19000,1
1,35,20000,1
2,26,43000,0
3,27,57000,0
4,19,76000,1
...,...,...,...
395,46,41000,0
396,51,23000,1
397,50,20000,0
398,36,33000,1


## Standardize the data

In [30]:
# standardize the inputs

# standardization is one of the most common preprocessing tools
# since data of different magnitude (scale) can be biased towards high values,
# we want all inputs to be of similar magnitude
# this is a peculiarity of machine learning in general - most (but not all) algorithms do badly with unscaled data

# a very useful module we can use is StandardScaler 
# it has much more capabilities than the straightforward 'preprocessing' method


# we will create a variable that will contain the scaling information for this particular dataset
# here's the full documentation: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

# define scaler as an object
Social_Network_scaler = StandardScaler()

In [31]:
# import the libraries needed to create the Custom Scaler
# note that all of them are a part of the sklearn package
# moreover, one of them is actually the StandardScaler module, 
# so you can imagine that the Custom Scaler is build on it

from sklearn.base import BaseEstimator, TransformerMixin


# create the Custom Scaler class

class CustomScaler(BaseEstimator,TransformerMixin): 
    
    # init or what information we need to declare a CustomScaler object
    # and what is calculated/declared as we do
    
    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        
        # scaler is nothing but a Standard Scaler object
        self.scaler = StandardScaler(copy,with_mean,with_std)
        # with some columns 'twist'
        self.columns = columns
        self.mean_ = None
        self.var_ = None
        
    
    # the fit method, which, again based on StandardScale
    
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    # the transform method which does the actual scaling

    def transform(self, X, y=None, copy=None):
        
        # record the initial order of the columns
        init_col_order = X.columns
        
        # scale all features that you chose when creating the instance of the class
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        
        # declare a variable containing all information that was not scaled
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        
        # return a data frame which contains all scaled features and all 'not scaled' features
        # use the original order (that you recorded in the beginning)
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [33]:
# check what are all columns that we've got
input_unscaled.columns.values

array(['Age', 'EstimatedSalary', 'Gender_n'], dtype=object)

In [34]:
# choose the columns to scale
# we later augmented this code and put it in comments
# columns_to_scale = ['Month Value','Day of the Week', 'Transportation Expense', 'Distance to Work',
       #'Age', 'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pet']
    
# select the columns to omit
columns_to_omit = ['Gender_n']

In [35]:
# create the columns to scale, based on the columns to omit
# use list comprehension to iterate over the list
columns_to_scale = [x for x in input_unscaled.columns.values if x not in columns_to_omit]

In [36]:
# declare a scaler object, specifying the columns you want to scale
Social_Network_scaler = CustomScaler(columns_to_scale)

In [38]:
# fit the data (calculate mean and standard deviation); they are automatically stored inside the object 
Social_Network_scaler.fit(input_unscaled)



CustomScaler(columns=['Age', 'EstimatedSalary'], copy=None, with_mean=None,
             with_std=None)

In [39]:
# standardizes the data, using the transform method 
# in the last line, we fitted the data - in other words
# we found the internal parameters of a model that will be used to transform data. 
# transforming applies these parameters to our data
# note that when you get new data, you can just call 'scaler' again and transform it in the same way as now
scaled_inputs = Social_Network_scaler.transform(input_unscaled)

In [40]:
scaled_inputs

Unnamed: 0,Age,EstimatedSalary,Gender_n
0,-1.781797,-1.490046,1
1,-0.253587,-1.460681,1
2,-1.113206,-0.785290,0
3,-1.017692,-0.374182,0
4,-1.781797,0.183751,1
...,...,...,...
395,0.797057,-0.844019,0
396,1.274623,-1.372587,1
397,1.179110,-1.460681,0
398,-0.158074,-1.078938,1


### Balance the dataset

In [41]:
#creating a variable that contain the number of items in the array target
num_one_targets = int(np.sum(target))

BALANCE: MACHING THE QUANTITY OF ZEROS AND ONES, THIS IS REQUIRED TO HAVE A GOOD MODEL

In [42]:
unique_elements, counts_elements = np.unique(target, return_counts=True)
np.asarray((unique_elements, counts_elements))

array([[  0,   1],
       [257, 143]], dtype=int64)

THERE ARE 257 - 143 = 114, ZEROS THAN ONE's

In [43]:
# Count how many targets are 1 (meaning that the customer did convert)
num_one_targets = int(np.sum(target))

# Set a counter for targets that are 0 (meaning that the customer did not convert)
zero_targets_counter = 0

# We want to create a "balanced" dataset, so we will have to remove some input/target pairs.
# Declare a variable that will do that:
indices_to_remove = []

# Count the number of targets that are 0. 
# Once there are as many 0s as 1s, mark entries where the target is 0.
for i in range(target.shape[0]):
    if target[i] == 0:
        zero_targets_counter += 1
        if zero_targets_counter > num_one_targets:
            indices_to_remove.append(i)


.to_numpy() ==> conversion from dataframes to numpy arrays retaining column dtypes and names

In [46]:
scaled_inputs = scaled_inputs.to_numpy()

In [48]:
input_unscaled.shape

(400, 3)

In [49]:
# Create two new variables, one that will contain the inputs, and one that will contain the targets.
# We delete all indices that we marked "to remove" in the loop above.
x_scaled = np.delete(scaled_inputs, indices_to_remove, axis=0)


In [50]:
target = np.delete(target, indices_to_remove, axis=0)

In [51]:
target.shape

(286,)

In [52]:
x_scaled.shape

(286, 3)

In [53]:
x_scaled

array([[-1.78179743, -1.49004624,  1.        ],
       [-0.25358736, -1.46068138,  1.        ],
       [-1.11320552, -0.78528968,  0.        ],
       [-1.01769239, -0.37418169,  0.        ],
       [-1.78179743,  0.18375059,  1.        ],
       [-1.01769239, -0.34481683,  1.        ],
       [-1.01769239,  0.41866944,  0.        ],
       [-0.54012675,  2.35674998,  0.        ],
       [-1.20871865, -1.07893824,  1.        ],
       [-0.25358736, -0.13926283,  0.        ],
       [-1.11320552,  0.30121002,  0.        ],
       [-1.11320552, -0.52100597,  0.        ],
       [-1.6862843 ,  0.47739916,  1.        ],
       [-0.54012675, -1.51941109,  1.        ],
       [-1.87731056,  0.35993973,  1.        ],
       [-0.82666613,  0.30121002,  1.        ],
       [ 0.89257019, -1.3138571 ,  1.        ],
       [ 0.70154394, -1.28449224,  1.        ],
       [ 0.79705706, -1.22576253,  1.        ],
       [ 0.98808332, -1.19639767,  0.        ],
       [ 0.70154394, -1.40195167,  1.   

In [None]:
### Split the dataset into train and test

In [55]:
X_Train, X_Test, Y_Train, Y_Test = train_test_split(x_scaled, target, test_size = 0.2, random_state = 0)


In [56]:
sc_X = StandardScaler()
X_Train = sc_X.fit_transform(X_Train)
X_Test = sc_X.transform(X_Test)

### Save the three datasets in *.npz

In [58]:
# Save the three datasets in *.npz.
# In the next lesson, you will see that it is extremely valuable to name them in such a coherent way!

np.savez('Social_Network_Ads_data_train', inputs=X_Train, targets=Y_Train)
np.savez('Social_Network_Ads_test', inputs=X_Test, targets=Y_Test)