In [1]:
# importing libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the customer dataset into a DataFrame.
# The path is relative, so 'customer1.csv' must sit in the notebook's working directory;
# otherwise read_csv raises FileNotFoundError and the confirmation below is never printed.
file=pd.read_csv('customer1.csv')
print("File read successfully")

FileNotFoundError: [Errno 2] No such file or directory: 'customer1.csv'

#### Info about file

In [None]:
# Summarise the freshly loaded frame: column names, non-null counts, dtypes and memory usage
file.info()

This dataset includes a total of <b>100 rows and 8 columns.</b> All the columns are fully populated, meaning they do not contain any null values. The dataset holds customer information, recording details such as age, gender, income, etc. For analysis, we have a total of 2 int columns and 1 float column; the rest are object columns. Total memory consumed is approx. 6.4 KB.

#### Sample Values

In [None]:
file.head() # by default, shows the first 5 rows

In [None]:
file.tail() # by default, shows the last 5 rows

In [None]:
print("Missing value count in each column-\n",file.isnull().sum())
# isnull() flags every cell as True (missing) or False (present)
# .sum() then adds up the True flags column by column, giving the number of missing values per column

#### Replacing Age null values (assumption)

In [None]:
# Impute missing ages with the column median.
# Series.median() skips NaN by default. np.median() on a column that contains NaN
# returns NaN, and int(NaN) raises ValueError -- i.e. the old code failed in exactly
# the situation this cell exists for. With no missing values both give the same result.
age_new=int(file['Age'].median())
print("Median of age=",age_new)

# Keep the raw 'Age' column untouched; the imputed values go into a separate column.
file['Age_new']=file['Age'].fillna(age_new)

In [None]:
# Re-inspect the frame now that the 'Age_new' column has been added
file.info()

In [None]:
# Preview the first rows to compare 'Age' with the imputed 'Age_new'
file.head()

### Fixing City Names (Standardizing capitalization & strip spaces)

In [None]:
# Normalise the city names in two steps:
#   1. str.title() -> upper-cases the first letter of every word, lower-cases the rest
#   2. str.strip() -> removes leading and trailing whitespace
city_names = file['City'].str.title()
file['City'] = city_names.str.strip()
file.head()

### Working with date-time

In [None]:
# converting SignupDate to date-time format
file['SignupDate']=pd.to_datetime(file['SignupDate'])
file.head()

In [None]:
# extracting Year and Month as new feature
file['SignupYear']=file['SignupDate'].dt.year # dt is an object , year is reference to it
file['SignupMonth']=file['SignupDate'].dt.month # dt is an object , month is reference to it
file.drop(columns=['SignupDate'],inplace=True) # drop() will delete columns written in attribute
file.head()

In [None]:
# Confirm that SignupYear/SignupMonth were added and SignupDate was dropped
file.info()

### Label Encoding using sk-learn

<p>Label encoding should only be applied to columns that hold categorical values.</p>

In [None]:
# One encoder per column, so each keeps its own category -> integer mapping.
# fit_transform() does both steps at once:
#   fit       -> learns the distinct categories present in the column
#   transform -> replaces every value with the integer code of its category

# Gender
le_gender = LabelEncoder()
file['Gender'] = le_gender.fit_transform(file['Gender'])

# IsActive
le_active = LabelEncoder()
file['IsActive'] = le_active.fit_transform(file['IsActive'])

file.head()

### Finding range of values within column

In [None]:
# Age: smallest and largest value of the raw 'Age' column
age_col = file['Age']
min_age, max_age = age_col.min(), age_col.max()
print(f"Range of age-> {min_age} to {max_age}.")

# Income: agg() returns both statistics together as one Series
income_stats = ['min', 'max']
range_inc = file['AnnualIncome'].agg(income_stats)
print(f"Range of Income->\n {range_inc}.")

### Finding count of unique values in a column

In [None]:
# value_counts() tallies how many rows belong to each distinct city (most frequent first)
unique_city=file['City'].value_counts()

# NOTE(review): this prints the per-city tally, not a single count of distinct cities
print("Number of unique values of cities-",unique_city)

<h3 style="color:skyblue">Visualization</h3>

<h4 style="color:orange">There are 2 types of charts:-</h4>
<ul>
<li><b>Univariate chart</b> -> These charts depend on a single variable, e.g. a pie chart.</li>
<li><b>Multivariate chart</b> -> These charts depend on multiple variables, e.g. a line chart.</li>
</ul>

For visualization we are going to use 2 libraries:- a) <b>Matplotlib</b> b) <b>Seaborn</b>

### Signup Trend over Time

In [None]:
# Count the signups per month number, then order the result by month
# rather than by frequency. Months are pooled across all signup years.
signups_per_month = file['SignupMonth'].value_counts()
monthly_signup = signups_per_month.sort_index()
print(type(monthly_signup))

In [None]:
# Step-1 set the size of figure
plt.figure(figsize=(12,5))
monthly_signup.plot(kind='bar')
plt.title('Customer Signup over Time')
plt.xlabel('Month')
plt.ylabel('Number of Signups')
plt.xticks(rotation=0)
plt.grid(True)
plt.show()

<p>From the chart above, we can clearly see that <b>October has the highest number of signups</b>, followed by January, while <b>May has the lowest number of signups</b>.</p>

### Countplot: Gender Distribution

In [None]:
# Number of customers per gender.
# 'Gender' was label-encoded earlier, so the x-axis shows the integer codes from le_gender.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=file, x='Gender', ax=ax)
ax.set_title('Gender Distribution')
ax.grid(True)
plt.show()

### Boxplot: Annual Income By Gender

In [None]:
# Distribution of annual income for each gender group.
# 'Gender' was label-encoded earlier, so the groups are labelled with the codes from le_gender.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=file, x='Gender', y='AnnualIncome', ax=ax)
ax.set_title('Annual Income By Gender')
plt.show()