# Filter the data of US - Baby Names dataset using Pandas

### Introduction:

    We are going to use a subset of the [US Baby Names] dataset.
    The dataset contains US baby names from 2004 until 2014.

In [82]:
# Step 1. Import the necessary libraries
import pandas as pd
import numpy as np

### Step 2. Import the dataset 

### Step 3. Assign it to a variable called baby_names.

In [83]:
# Read the baby-names CSV (looked up relative to the working directory) into a DataFrame.
baby_names = pd.read_csv("US_Baby_Names_right.csv")

In [84]:
baby_names # Show the loaded frame; the output is a truncated preview (first and last rows)

Unnamed: 0.1,Unnamed: 0,Id,Name,Year,Gender,State,Count
0,11349,11350,Emma,2004,F,AK,62
1,11350,11351,Madison,2004,F,AK,48
2,11351,11352,Hannah,2004,F,AK,46
3,11352,11353,Grace,2004,F,AK,44
4,11353,11354,Emily,2004,F,AK,41
...,...,...,...,...,...,...,...
1016390,5647421,5647422,Seth,2014,M,WY,5
1016391,5647422,5647423,Spencer,2014,M,WY,5
1016392,5647423,5647424,Tyce,2014,M,WY,5
1016393,5647424,5647425,Victor,2014,M,WY,5


### Step 4. See the first 10 entries

In [79]:
# Preview the first ten rows of the dataset.
baby_names.head(10)

Unnamed: 0.1,Unnamed: 0,Id,Name,Year,Gender,State,Count
0,11349,11350,Emma,2004,F,AK,62
1,11350,11351,Madison,2004,F,AK,48
2,11351,11352,Hannah,2004,F,AK,46
3,11352,11353,Grace,2004,F,AK,44
4,11353,11354,Emily,2004,F,AK,41
5,11354,11355,Abigail,2004,F,AK,37
6,11355,11356,Olivia,2004,F,AK,33
7,11356,11357,Isabella,2004,F,AK,30
8,11357,11358,Alyssa,2004,F,AK,29
9,11358,11359,Sophia,2004,F,AK,28


### Step 5. Delete the columns 'Unnamed: 0' and 'Id'

    1. The dataset contains the 'Unnamed: 0' and 'Id' columns, which are not required for the data analysis, i.e. they are irrelevant features.
    2. These irrelevant columns will be dropped (deleted) from the dataset.

In [80]:
# Solution 1 -- using DataFrame.drop.
# The result is rebound instead of using inplace=True, and errors="ignore"
# skips labels that are already gone, so re-running this cell does not raise
# a KeyError.
baby_names = baby_names.drop(columns=["Unnamed: 0", "Id"], errors="ignore")

In [85]:
# Solution 2 -- using the del statement.
# `del` raises KeyError for a column that does not exist, so only delete the
# columns that are still present. This lets the cell run after solution 1
# (which removes the same columns) and makes it safe to re-run.
for column in ("Unnamed: 0", "Id"):
    if column in baby_names.columns:
        del baby_names[column]

In [86]:
# Check the result: the 'Unnamed: 0' and 'Id' columns should no longer appear.
baby_names.head()

Unnamed: 0,Name,Year,Gender,State,Count
0,Emma,2004,F,AK,62
1,Madison,2004,F,AK,48
2,Hannah,2004,F,AK,46
3,Grace,2004,F,AK,44
4,Emily,2004,F,AK,41


### Step 6. Are there more male or female names in the dataset?

In [87]:
baby_names["Gender"].value_counts() # value_counts returns the number of rows for each distinct Gender value

F    558846
M    457549
Name: Gender, dtype: int64

### Step 7. Group the dataset by name and assign to names

In [88]:
# Year is not needed for the per-name totals. Drop it without failing when the
# cell is re-run (a plain `del` raises KeyError once the column is gone).
baby_names = baby_names.drop(columns=["Year"], errors="ignore")

# Select Count explicitly before summing: without the selection, recent pandas
# versions also aggregate the string columns (Gender, State) instead of
# silently dropping them. The result has one row per name, most frequent first.
names = baby_names.groupby("Name")[["Count"]].sum().sort_values("Count", ascending=False)
names.head(10)

Unnamed: 0_level_0,Count
Name,Unnamed: 1_level_1
Jacob,242874
Emma,214852
Michael,214405
Ethan,209277
Isabella,204798
William,197894
Joshua,191551
Sophia,191446
Daniel,191440
Emily,190318


### Step 8. How many different names exist in the dataset?

In [91]:
# Solution 1 -- using len(): `names` was grouped by Name, so it has one row per distinct name
len(names)

17632

In [92]:
# Solution 2 -- using the shape attribute: shape[0] is the number of rows in `names`
names.shape[0]

17632

In [93]:
# Solution 3 -- using nunique(): counts the distinct values in the Name column
baby_names["Name"].nunique() # to list the names themselves use: print(baby_names["Name"].unique())

17632

### Step 9. What is the name with most occurrences?

In [94]:
# Solution 1 -- using idxmax: returns the index label (here the name) of the row with the largest Count
names['Count'].idxmax()

'Jacob'

In [95]:
# Solution 2 -- boolean mask: keep every row whose Count equals the maximum Count.
highest_count = names["Count"].max()
names.loc[names["Count"] == highest_count]

Unnamed: 0_level_0,Count
Name,Unnamed: 1_level_1
Jacob,242874


### Step 10. How many different names have the least occurrences?

In [98]:
# First find the smallest total Count among all names, using min()
names['Count'].min()
# In this dataset the least number of occurrences for a name is 5

5

In [100]:
# Count the names whose total equals the minimum total.
# It shows that 2578 names share the least number of occurrences, i.e. 5.
lowest_count = names["Count"].min()
len(names.loc[names["Count"] == lowest_count])

2578

### Step 11. What is the median name occurrence?

In [108]:
names['Count'].median()  # median of the per-name total counts in `names`

49.0

In [111]:
# List the names whose total Count is exactly equal to the median total.
median_total = names["Count"].median()
names.loc[names["Count"] == median_total]

Unnamed: 0_level_0,Count
Name,Unnamed: 1_level_1
Zuleima,49
Vita,49
Aziah,49
Kaedence,49
Antonina,49
...,...
Mckynzie,49
Yoni,49
Hareem,49
Jkwon,49


### Step 12. What is the standard deviation of names?

In [112]:
# Standard deviation of the per-name total counts (pandas uses ddof=1, the sample formula, by default)
names['Count'].std()

11006.0694678915

### Step 13. Get a summary with the mean, min, max, std and quartiles.

In [114]:
baby_names.describe() # summary statistics of the main dataset loaded from "US_Baby_Names_right.csv"; only the numeric Count column remains

Unnamed: 0,Count
count,1016395.0
mean,34.85012
std,97.39735
min,5.0
25%,7.0
50%,11.0
75%,26.0
max,4167.0


In [115]:
names.describe() # summary statistics of the per-name totals held in `names`

Unnamed: 0,Count
count,17632.0
mean,2008.932169
std,11006.069468
min,5.0
25%,11.0
50%,49.0
75%,337.0
max,242874.0
