In [1]:
# import dependencies
import pandas as pd
from path import Path

In [2]:
# load the raw shopping data from the Resources folder into a DataFrame
shopping_df = pd.read_csv(
    filepath_or_buffer=Path('Resources/shopping_data.csv'),
)
# preview the first five rows
shopping_df.head()

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0
4,5,No,31.0,17000,40.0


In [3]:
# list the column labels to see which fields the dataset contains
shopping_df.columns

Index(['CustomerID', 'Card Member', 'Age', 'Annual Income',
       'Spending Score (1-100)'],
      dtype='object')

In [4]:
# show the dtype of every column (non-numeric columns appear as `object`)
shopping_df.dtypes


CustomerID                  int64
Card Member                object
Age                       float64
Annual Income               int64
Spending Score (1-100)    float64
dtype: object

In [5]:
# Report the number of missing values in each column.
# DataFrame.items() yields (column label, Series) pairs. It replaces
# DataFrame.iteritems(), which was deprecated in pandas 1.5 and removed in
# pandas 2.0, so the old spelling raises AttributeError on current versions.
for col_name, col_data in shopping_df.items():
    print('Column Name:',col_name)
    # isnull() marks missing entries; summing the boolean mask counts them
    print('Nulls:', col_data.isnull().sum())

Column Name: CustomerID
Nulls: 0
Column Name: Card Member
Nulls: 2
Column Name: Age
Nulls: 2
Column Name: Annual Income
Nulls: 0
Column Name: Spending Score (1-100)
Nulls: 1


In [6]:
# discard every row that has a missing value in at least one column
shopping_df = shopping_df.dropna(axis='index', how='any')

In [7]:
# count rows that are exact copies of an earlier row
# NOTE: CustomerID is still part of the frame at this point, so two rows only
# count as duplicates when their CustomerID matches as well.
duplicate_count = shopping_df.duplicated().sum()
print(f' Duplicate Values: {duplicate_count}')

 Duplicate Values: 0


In [8]:
# remove the CustomerID column
# Reassign the result instead of using inplace=True, and pass errors='ignore'
# so that re-running this cell after the column is already gone is a no-op
# rather than a KeyError.
shopping_df = shopping_df.drop(columns='CustomerID', errors='ignore')
shopping_df.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,Yes,19.0,15000,39.0
1,Yes,21.0,15000,81.0
2,No,20.0,16000,6.0
3,No,23.0,16000,77.0
4,No,31.0,17000,40.0


In [9]:
# helper that encodes a "Yes"/other value as a 1/0 integer flag
def change_string(member):
    """Return 1 when ``member`` equals the string "Yes", otherwise 0.

    The comparison is an exact, case-sensitive match; every other value
    (including "No", differently-cased text, or non-strings) yields 0.
    """
    return 1 if member == "Yes" else 0

In [10]:
# encode the 'Card Member' column as integers by running change_string on each value
# NOTE: running this cell a second time would turn every value into 0, because
# the already-encoded integers no longer equal "Yes".
shopping_df['Card Member'] = shopping_df['Card Member'].map(change_string)
shopping_df.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15000,39.0
1,1,21.0,15000,81.0
2,0,20.0,16000,6.0
3,0,23.0,16000,77.0
4,0,31.0,17000,40.0


In [11]:
# express Annual Income in thousands (e.g. 15000 -> 15.0)
# NOTE: each run divides by 1000 again, so execute this cell only once per load.
income_in_thousands = shopping_df['Annual Income'] / 1000
shopping_df['Annual Income'] = income_in_thousands
shopping_df.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [12]:
# give the columns identifier-friendly names: underscores instead of spaces,
# and no "(1-100)" suffix on the spending score
col_names = {
    'Card Member': 'Card_Member',
    'Annual Income': 'Annual_Income',
    'Spending Score (1-100)': 'Spending_Score',
}
shopping_df = shopping_df.rename(mapper=col_names, axis='columns')
shopping_df.columns

Index(['Card_Member', 'Age', 'Annual_Income', 'Spending_Score'], dtype='object')

In [13]:
# write the cleaned dataframe out to a csv file, leaving out the row index
shopping_df.to_csv(
    path_or_buf=Path('Resources/shopping_data_cleaned.csv'),
    index=False,
)