### Mall Customers - Wrangling Exercises

In [19]:
import numpy as np
import pandas as pd
import os
from env import get_db_url
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
def acquire_mall_customers():
    '''
    This function checks for a copy of the dataset in the local directory 
    and pulls a new copy and saves it if there is not one,
    it then cleans the data by removing significant outliers then
    removing the rows with null values for 'yearbuilt'
    '''
    #assign the file name
    filename = 'mall_clustering.csv'
    #check if the file exists in the current directory and read it if it is
    if os.path.exists(filename):
        print('Reading from csv file...')
        #read the local .csv into the notebook
        df = pd.read_csv(filename)
        return df
    #assign the sql query to a variable for use in pulling a new copy of the dataset from the database
    query = '''
    SELECT * FROM customers;
    '''
    #if needed pull a fresh copy of the dataset from the database
    print('Getting a fresh copy from SQL database...')
    df = pd.read_sql(query, get_db_url('mall_customers'))
    #save a copy of the dataset to the local directory as a .csv file
    df.to_csv(filename, index=False)
    return df

In [3]:
df = acquire_mall_customers()
df.head()

Reading from csv file...


Unnamed: 0,customer_id,gender,age,annual_income,spending_score
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [4]:
df.describe()

Unnamed: 0,customer_id,age,annual_income,spending_score
count,200.0,200.0,200.0,200.0
mean,100.5,38.85,60.56,50.2
std,57.879185,13.969007,26.264721,25.823522
min,1.0,18.0,15.0,1.0
25%,50.75,28.75,41.5,34.75
50%,100.5,36.0,61.5,50.0
75%,150.25,49.0,78.0,73.0
max,200.0,70.0,137.0,99.0


In [5]:
df.shape

(200, 5)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   customer_id     200 non-null    int64 
 1   gender          200 non-null    object
 2   age             200 non-null    int64 
 3   annual_income   200 non-null    int64 
 4   spending_score  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB


---

In [7]:
def remove_outliers(df, k, col_list):
    ''' this function will remove outliers from a list of columns in a dataframe 
        and return that dataframe. A list of columns with significant outliers is 
        assigned to a variable in the below wrangle function and can be modified if needed
    '''
    #loop throught the columns in the list
    for col in col_list:
        q1, q3 = df[col].quantile([.25, .75])  # get quartiles
        iqr = q3 - q1   # calculate interquartile range
        upper_bound = q3 + k * iqr   # get upper bound
        lower_bound = q1 - k * iqr   # get lower bound
        # return dataframe without outliers
        df = df[(df[col] > lower_bound) & (df[col] < upper_bound)] 
    return df

In [8]:
col_list = ['age', 'annual_income', 'spending_score']
df = remove_outliers(df, 1.5, col_list)
df.head()

Unnamed: 0,customer_id,gender,age,annual_income,spending_score
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [9]:
df.shape

(198, 5)

In [13]:
def split_data(df):
    '''
    this function takes the full dataset and splits it into three parts (train, validate, test) 
    and returns the resulting dataframes
    '''
    train_val, test = train_test_split(df, train_size = 0.8, random_state=123)
    train, validate = train_test_split(train_val, train_size = 0.7, random_state=123)
    return train, validate, test

In [14]:
train, validate, test = split_data(df)
train.shape, validate.shape, test.shape

((110, 5), (48, 5), (40, 5))

In [15]:
train.head()

Unnamed: 0,customer_id,gender,age,annual_income,spending_score
45,46,Female,24,39,65
39,40,Female,20,37,75
11,12,Female,35,19,99
171,172,Male,28,87,75
95,96,Male,24,60,52


In [16]:
#encode the categorical columns
def encode_cats(df):
    encode_cols = [col for col in df.columns if df[col].dtype == 'O']
    for col in encode_cols:
        dummie_df = pd.get_dummies(df[col], prefix = df[col].name, drop_first = True)
        df = pd.concat([df, dummie_df], axis=1)
    return df

In [18]:
train = encode_cats(train)
train = train.drop(columns='gender')
train.head()

Unnamed: 0,customer_id,age,annual_income,spending_score,gender_Male,gender_Male.1
45,46,24,39,65,0,0
39,40,20,37,75,0,0
11,12,35,19,99,0,0
171,172,28,87,75,1,1
95,96,24,60,52,1,1


In [20]:
train = train.drop(columns='customer_id')
def scale_data(df):
    scaler = MinMaxScaler()
    scaler.fit(df)
    scaled_df = scaler.transform(df)
    df = pd.DataFrame(scaled_df, columns=df.columns, index=df.index)
    return df

In [21]:
train = scale_data(train)
train.head()

Unnamed: 0,age,annual_income,spending_score,gender_Male,gender_Male.1
45,0.115385,0.216216,0.653061,0.0,0.0
39,0.038462,0.198198,0.755102,0.0,0.0
11,0.326923,0.036036,1.0,0.0,0.0
171,0.192308,0.648649,0.755102,1.0,1.0
95,0.115385,0.405405,0.520408,1.0,1.0


In [22]:
train.isna().sum()

age               0
annual_income     0
spending_score    0
gender_Male       0
gender_Male       0
dtype: int64