In [2]:
import pandas as pd
import seaborn as sns
import os
import wrangle_mall as w
import matplotlib.pyplot as plt
from scipy.stats import mannwhitneyu, pearsonr
from sklearn.model_selection import train_test_split

1. Acquire data from the customers table in the mall_customers database.

In [10]:
# Load the mall customers data through the project's wrangle_mall helper
# (imported as `w`); how and where it reads the data is defined in that module.
df = w.acquire_mall()
# Preview the first five rows as the cell's rich output.
df.head()

Unnamed: 0,customer_id,gender,age,annual_income,spending_score
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


2. Summarize the data (include distributions and descriptive statistics).

In [12]:
def summarize(df, k=1.5) -> None:
    '''
    This function prints a multi-section text summary of a DataFrame.

    Sections, in order: shape, df.info(), descriptive statistics from
    df.describe(), descriptive statistics for object-dtype columns, missing
    values by column and by row, and an outlier report.

    Parameters:
    df = DataFrame to summarize
    k = value forwarded to report_outliers (presumably the IQR fence
        multiplier -- confirm against report_outliers)

    Output:
    This function returns None; every section is written to stdout.

    Note: missing_by_col, missing_by_row and report_outliers are not defined
    in this cell and must already exist in the calling namespace.
    '''
    divider = '======================\n======================'

    # print info on the df
    print('Shape of Data: ')
    print(df.shape)
    print(divider)
    print('Info: ')
    # df.info() writes its report to stdout itself and returns None, so it is
    # called directly; wrapping it in print() adds a stray "None" line.
    df.info()
    print(divider)
    print('Descriptions:')
    # print the description of the df, transposed, as a markdown table
    print(df.describe().T.to_markdown())
    print(divider)
    # do the same for categorical info, using select_dtypes to keep only
    # object columns
    object_cols = df.select_dtypes('O')
    if object_cols.shape[1] > 0:
        print(object_cols.describe().T.to_markdown())
    else:
        # describe() raises ValueError on a frame with no columns, so report
        # the absence instead of aborting the rest of the summary.
        print('No object columns to describe.')
    print(divider)
    print('missing values:')
    print('by column:')
    print(missing_by_col(df).to_markdown())
    print('by row: ')
    print(missing_by_row(df).to_markdown())
    print(divider)
    print('Outliers: ')
    # NOTE(review): if report_outliers prints its own report and returns None,
    # this print() will also emit "None" -- confirm against its definition.
    print(report_outliers(df, k=k))
    print(divider)


In [16]:
# Note: this calls summarize from the wrangle_mall module (`w`), not the
# summarize function defined in the notebook cell above.
w.summarize(df)

Shape of Data: 
(200, 5)
Info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   customer_id     200 non-null    int64 
 1   gender          200 non-null    object
 2   age             200 non-null    int64 
 3   annual_income   200 non-null    int64 
 4   spending_score  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB
None
Descriptions:
|                |   count |   mean |     std |   min |   25% |   50% |    75% |   max |
|:---------------|--------:|-------:|--------:|------:|------:|------:|-------:|------:|
| customer_id    |     200 | 100.5  | 57.8792 |     1 | 50.75 | 100.5 | 150.25 |   200 |
| age            |     200 |  38.85 | 13.969  |    18 | 28.75 |  36   |  49    |    70 |
| annual_income  |     200 |  60.56 | 26.2647 |    15 | 41.5  |  61.5 |  78    |   137 |
| spending_score |     200 |  50.2  | 25.8

3. Detect outliers using IQR.

In [17]:
# Run wrangle_mall's outlier report on df; no k is passed here, so the
# helper's own default applies.
w.report_outliers(df)

Outliers for Col customer_id:
lower:  -98.5 upper:  299.5
Series([], Name: customer_id, dtype: int64)
----------
Outliers for Col age:
lower:  -1.625 upper:  79.375
Series([], Name: age, dtype: int64)
----------
Outliers for Col annual_income:
lower:  -13.25 upper:  132.75
198    137
199    137
Name: annual_income, dtype: int64
----------
Outliers for Col spending_score:
lower:  -22.625 upper:  130.375
Series([], Name: spending_score, dtype: int64)
----------


4. Split data into train, validate, and test.

In [20]:
# Split df into three subsets using wrangle_mall.split_data.
# NOTE(review): split proportions and any random seed live inside that helper
# and are not visible here -- confirm the split is reproducible.
train, val, test = w.split_data(df)
# Preview the first rows of the training subset.
train.head()

Unnamed: 0,customer_id,gender,age,annual_income,spending_score
26,27,Female,45,28,32
23,24,Male,31,25,73
39,40,Female,20,37,75
37,38,Female,30,34,73
56,57,Female,51,44,50


5. Encode categorical columns using a one hot encoder (pd.get_dummies).

In [21]:
def dummies(train, val, test):
    '''
    This function applies one hot encoding to all categorical features in your dataset.

    The column layout is taken from train: val and test are encoded and then
    aligned to train's encoded columns. A category that a split lacks becomes
    an all-zero indicator column, and an indicator for a category that train
    never saw is dropped, so all three outputs share identical columns in
    identical order. When every split already contains the same categories the
    result is the same as encoding each split independently.

    Parameters:
    train = train data
    val = val data
    test = test data

    Output:
    This function returns your train, val, and test subsets with dummies added.
    '''

    def _match_train_columns(encoded, reference):
        # Reorder/trim to the training columns, zero-filling any that are absent.
        aligned = encoded.reindex(columns=reference.columns, fill_value=0)
        added = reference.columns.difference(encoded.columns)
        if len(added) > 0:
            # Zero-filled columns come back as integers; give them the same
            # dtype as the matching training column (bool or uint8 depending
            # on the pandas version).
            aligned = aligned.astype({col: reference[col].dtype for col in added})
        return aligned

    train = pd.get_dummies(train)

    val = _match_train_columns(pd.get_dummies(val), train)

    test = _match_train_columns(pd.get_dummies(test), train)

    return train, val, test

In [22]:
# Replace each split with its one-hot-encoded version (rebinds train, val, test).
train, val, test = dummies(train, val, test)
# Preview the encoded training rows.
train.head()

Unnamed: 0,customer_id,age,annual_income,spending_score,gender_Female,gender_Male
26,27,45,28,32,1,0
23,24,31,25,73,0,1
39,40,20,37,75,1,0
37,38,30,34,73,1,0
56,57,51,44,50,1,0


6. Handle missing values.

7. Scaling