# <u>Data Preparation & EDA</u>

### Importing the libraries

In [28]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import seaborn as sns

### Importing the dataset

In [29]:
# Load the raw customer data. `pd.read_csv` already returns a DataFrame, so
# wrapping the result in `pd.DataFrame(...)` is unnecessary (and, because that
# constructor does not copy DataFrame input by default, it would only alias
# the raw data rather than give an independent frame).
dataset = pd.read_csv('./Customers.csv')

# Work on an explicit copy so that the cleaning and encoding steps applied to
# `df` later in the notebook leave the raw `dataset` frame untouched.
df = dataset.copy()

### Clean Dataset

##### Check for missing values

In [30]:
# Count the missing (NaN) entries in each column
df.isna().sum(axis=0)

CustomerID                 0
Gender                     0
Age                        0
Annual Income ($)          0
Spending Score (1-100)     0
Profession                35
Work Experience            0
Family Size                0
dtype: int64

#### Outliers

In [31]:
# Show the rows whose 'Work Experience' equals the largest value in that column
df.loc[df['Work Experience'].eq(df['Work Experience'].max())]

Unnamed: 0,CustomerID,Gender,Age,Annual Income ($),Spending Score (1-100),Profession,Work Experience,Family Size
392,393,Male,21,119116,30,Artist,17,4
405,406,Female,65,119889,11,Artist,17,6
473,474,Male,20,130813,92,Artist,17,5
566,567,Female,19,180331,14,Artist,17,5
603,604,Female,91,69720,78,Lawyer,17,6


#### Clean the data

In [32]:
# Fill missing professions with the most frequent one (first entry of the mode)
df['Profession'] = df['Profession'].fillna(value=df['Profession'].mode()[0])

# Remove outliers: discard every row whose 'Work Experience' equals 17
df = df.drop(index=df.index[df['Work Experience'] == 17])

#### Encode categorical variables as integer codes

In [33]:
# Use one LabelEncoder per column. Re-fitting a single shared encoder on
# 'Profession' would overwrite the classes learned for 'Gender', so the
# Gender codes could no longer be mapped back to their labels with
# `inverse_transform`.
gender_encoder = LabelEncoder()
profession_encoder = LabelEncoder()

# Store the integer code of each gender in a new column called "Gender_Code"
df["Gender_Code"] = gender_encoder.fit_transform(df["Gender"])

# Store the integer code of each profession in a new column called "Profession_Code"
df["Profession_Code"] = profession_encoder.fit_transform(df["Profession"])

# Backward compatibility: the original shared `labelencoder` ended up fitted on
# 'Profession', so keep that name bound to the profession encoder.
labelencoder = profession_encoder

# Display dataframe
df

Unnamed: 0,CustomerID,Gender,Age,Annual Income ($),Spending Score (1-100),Profession,Work Experience,Family Size,Gender_Code,Profession_Code
0,1,Male,19,15000,39,Healthcare,1,4,1,5
1,2,Male,21,35000,81,Engineer,3,3,1,2
2,3,Female,20,86000,6,Engineer,1,1,0,2
3,4,Female,23,59000,77,Lawyer,0,2,0,7
4,5,Female,31,38000,40,Entertainment,2,6,0,3
...,...,...,...,...,...,...,...,...,...,...
1995,1996,Female,71,184387,40,Artist,8,7,0,0
1996,1997,Female,91,73158,32,Doctor,7,7,0,1
1997,1998,Male,87,90961,14,Healthcare,9,2,1,5
1998,1999,Male,77,182109,4,Executive,7,2,1,4


# <b>Exploring the dataset</b>

### Checking the dimensions and size of the dataset

In [35]:
# Report the shape (rows, columns) and the total number of cells of the dataset
for description, measurement in (
    ("Number of rows and columns in the dataset:", df.shape),
    ("Size of the dataset:", df.size),
):
    print(description, measurement)

Number of rows and columns in the dataset: (1995, 10)
Size of the dataset: 19950


### Examining the first few rows and columns of the dataset

In [34]:
# Preview the first five rows of the cleaned, encoded dataframe
df.head(n=5)

Unnamed: 0,CustomerID,Gender,Age,Annual Income ($),Spending Score (1-100),Profession,Work Experience,Family Size,Gender_Code,Profession_Code
0,1,Male,19,15000,39,Healthcare,1,4,1,5
1,2,Male,21,35000,81,Engineer,3,3,1,2
2,3,Female,20,86000,6,Engineer,1,1,0,2
3,4,Female,23,59000,77,Lawyer,0,2,0,7
4,5,Female,31,38000,40,Entertainment,2,6,0,3


# <b>Visualizing the data</b>

# <b>Analyzing the data</b>

### General statistical information

In [39]:
# Summary statistics for every column; include="all" also reports the
# non-numeric columns (unique / top / freq) alongside the numeric ones
df.describe(include="all")

Unnamed: 0,CustomerID,Gender,Age,Annual Income ($),Spending Score (1-100),Profession,Work Experience,Family Size,Gender_Code,Profession_Code
count,1995.0,1995,1995.0,1995.0,1995.0,1995,1995.0,1995.0,1995.0,1995.0
unique,,2,,,,9,,,,
top,,Female,,,,Artist,,,,
freq,,1183,,,,643,,,,
mean,1001.782456,,48.974436,110698.633584,50.977444,,4.070175,3.764912,0.407018,2.784461
std,577.633256,,28.42532,45758.034811,27.917452,,3.873513,1.971556,0.491401,2.548164
min,1.0,,0.0,0.0,0.0,,0.0,1.0,0.0,0.0
25%,502.5,,25.0,74537.0,28.0,,1.0,2.0,0.0,0.0
50%,1003.0,,48.0,109759.0,50.0,,3.0,4.0,0.0,3.0
75%,1501.5,,73.0,149093.5,75.0,,7.0,5.0,1.0,5.0
