In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Read the iris CSV from the Resources folder and preview the first five rows
file_path = Path("Resources/iris.csv")
iris_df = pd.read_csv(filepath_or_buffer=file_path)
iris_df.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
#Determine the distinct classes and how many rows belong to each
iris_df['class'].value_counts()

Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: class, dtype: int64

In [4]:
# Remove the 'class' label column, keeping the remaining columns unchanged
new_iris_df = iris_df.drop(columns=['class'])

In [5]:
#Reorder the columns by selecting them with .loc in the desired order
new_iris_df = new_iris_df.loc[
    :, ['sepal_length', 'petal_length', 'sepal_width', 'petal_width']
]
new_iris_df.head(n=5)

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width
0,5.1,1.4,3.5,0.2
1,4.9,1.4,3.0,0.2
2,4.7,1.3,3.2,0.2
3,4.6,1.5,3.1,0.2
4,5.0,1.4,3.6,0.2


In [6]:
# Write the iris frame to CSV, leaving out the row index
output_file_path = Path("Resources/new_iris_data.csv")
new_iris_df.to_csv(path_or_buf=output_file_path, index=False)

## Shopping data

In [7]:
# Read the shopping CSV (decoded as ISO-8859-1) and preview the first five rows
file_path = Path("Resources/shopping_data.csv")
df_shopping = pd.read_csv(filepath_or_buffer=file_path, encoding="ISO-8859-1")
df_shopping.head(n=5)

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0
4,5,No,31.0,17000,40.0


In [8]:
#List the column names of the shopping data
df_shopping.columns

Index(['CustomerID', 'Card Member', 'Age', 'Annual Income',
       'Spending Score (1-100)'],
      dtype='object')

In [9]:
#Show the data type pandas inferred for each column
df_shopping.dtypes

CustomerID                  int64
Card Member                object
Age                       float64
Annual Income               int64
Spending Score (1-100)    float64
dtype: object

In [10]:
#Report how many null values each column contains
null_counts = df_shopping.isnull().sum()
for column, null_count in null_counts.items():
    print(f"Column {column} has {null_count} null values")

Column CustomerID has 0 null values
Column Card Member has 2 null values
Column Age has 2 null values
Column Annual Income has 0 null values
Column Spending Score (1-100) has 1 null values


In [11]:
#Drop every row that has a null value in any column
df_shopping = df_shopping.dropna(axis="index", how="any")

In [12]:
#Count the rows that are exact repeats of an earlier row
duplicate_count = df_shopping.duplicated().sum()
print(f"Duplicate entries : {duplicate_count}")

Duplicate entries : 0


In [13]:
# Drop duplicates if any (left disabled: the duplicate check reported 0 entries;
# uncomment the line below if the input data changes and duplicates appear)
# df_shopping = df_shopping.drop_duplicates()

In [14]:
#Drop the CustomerID column. Reassign the result instead of using
#inplace=True so the cell follows the same "new frame per step" pattern
#as the dropna step and never mutates a frame an earlier cell displayed.
df_shopping = df_shopping.drop(columns=['CustomerID'])
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,Yes,19.0,15000,39.0
1,Yes,21.0,15000,81.0
2,No,20.0,16000,6.0
3,No,23.0,16000,77.0
4,No,31.0,17000,40.0


In [15]:
#Use a function to change the Card Member string to an int
def change_string(member):
    """Encode a membership label as an integer flag.

    Returns 1 when ``member`` equals the string "Yes" exactly, and 0 for
    any other value.
    """
    return 1 if member == "Yes" else 0
    
# Replace each Card Member label with the integer returned by change_string
df_shopping['Card Member'] = df_shopping['Card Member'].map(change_string)
df_shopping.head(n=5)

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15000,39.0
1,1,21.0,15000,81.0
2,0,20.0,16000,6.0
3,0,23.0,16000,77.0
4,0,31.0,17000,40.0


In [16]:
#Transform annual income: divide by 1000 so values are expressed in thousands
df_shopping['Annual Income'] = df_shopping['Annual Income'].div(1000)
df_shopping.head(n=5)

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [17]:
#Rename columns to drop spaces and numbers. The result is reassigned
#instead of using inplace=True, so the step produces a new frame rather
#than mutating the one shown by earlier cells.
df_shopping = df_shopping.rename(
    columns={
        'Card Member': 'CardMember',
        'Annual Income': 'AnnualIncome',
        'Spending Score (1-100)': 'SpendingScore',
    }
)
df_shopping.head()

Unnamed: 0,CardMember,Age,AnnualIncome,SpendingScore
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [18]:
#Save cleaned data without the row index. After dropna the index labels
#have gaps, and writing them would add an extra unnamed column to the CSV;
#index=False also matches how the iris data is saved earlier.
out_file_path = Path("Resources/shopping_data_cleaned.csv")
df_shopping.to_csv(out_file_path, index=False)