# Module1. Pandas and Numpy
Pandas and NumPy are the two most commonly used packages for doing data science in Python. Pandas provides comprehensive tools for manipulating structured data, and NumPy is a package designed to handle vector and matrix operations. Both are powerful and useful.
In this module, we will focus more on Pandas, and try to: 1. explore the data, 2. merge the data, and 3. clean and transform the data.

In [1]:
# import the package
import pandas as pd
import numpy as np

In [2]:
# Load the data from "titanic.csv" (a path relative to the current working
# directory) into a DataFrame named df; every later cell works on df.
df = pd.read_csv("titanic.csv")

## 1.Use Pandas for Exploratory Data Analysis
Pandas is much like Excel, except that you operate it with commands instead of a graphical interface. You can use pandas to quickly produce some statistics for the data.
This process is sometimes called exploratory data analysis (EDA). EDA is a basic but important step in data analysis.

### Take a look: head() and describe()

In [3]:
# Take a first look at the data.
# head(n=10) displays the first 10 rows of df.
df.head(n=10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [4]:
# describe() displays simple summary statistics (count, mean, std, min,
# quartiles, max) for the numeric columns of df.
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
# Extract data by column name.
# There are two main ways to select data from a DataFrame: loc and iloc.
# loc selects by label: loc[row index label, column name].
# A bare ":" selects everything along that axis (here: all rows).
df.loc[:, "Age"]

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

### Index and slice: loc and iloc

In [6]:
# Extract a single row by its index label; ":" keeps every column.
# The row is displayed as a Series whose name is the row label (0).
df.loc[0, :]

PassengerId                          1
Survived                             0
Pclass                               3
Name           Braund, Mr. Owen Harris
Sex                               male
Age                                 22
SibSp                                1
Parch                                0
Ticket                       A/5 21171
Fare                              7.25
Cabin                              NaN
Embarked                             S
Name: 0, dtype: object

In [7]:
# Pass lists to loc to select several row labels and several column names
# at once.
df.loc[[0,1,2], ["Name", "Sex", "Age"]]

Unnamed: 0,Name,Sex,Age
0,"Braund, Mr. Owen Harris",male,22.0
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0
2,"Heikkinen, Miss. Laina",female,26.0


In [8]:
# iloc selects by integer position instead of by label.
# Column position 3 (counting from 0) is "Name" in this dataset.
df.iloc[:, 3]

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

In [9]:
# iloc works on rows too: position 1 is the second row of df, and ":"
# keeps every column.
df.iloc[1, :]

PassengerId                                                    2
Survived                                                       1
Pclass                                                         1
Name           Cumings, Mrs. John Bradley (Florence Briggs Th...
Sex                                                       female
Age                                                           38
SibSp                                                          1
Parch                                                          0
Ticket                                                  PC 17599
Fare                                                     71.2833
Cabin                                                        C85
Embarked                                                       C
Name: 1, dtype: object

In [10]:
# As with loc, lists can be used to select several rows and columns at once.
# Here: the rows at positions 1, 2, 3 and the columns at positions 1, 2, 3.
df.iloc[[1,2,3], [1,2,3]]

Unnamed: 0,Survived,Pclass,Name
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
2,1,3,"Heikkinen, Miss. Laina"
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"


In [11]:
# A whole column can also be read as an attribute of the DataFrame; the
# output is the same Series that df.loc[:, "Age"] displays.
# NOTE: per the pandas indexing docs, attribute access only works when the
# column name is a valid Python identifier that does not clash with an
# existing DataFrame attribute or method.
df.Age

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

### value_counts(), groupby(), and pivot_table()

In [12]:
# Count how many rows there are for each distinct value of the "Sex" column.
df.Sex.value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [18]:
# Group the rows by "Sex" and compute the mean of "Pclass" within each group.
df.groupby(by="Sex").agg({"Pclass":"mean"})

Unnamed: 0_level_0,Pclass
Sex,Unnamed: 1_level_1
female,2.159236
male,2.389948


In [13]:
# Pivot table: one row per "Sex" value and one column per "Pclass" value.
# aggfunc="size" fills each cell with the number of rows in that
# (Sex, Pclass) combination.
df.pivot_table(index="Sex", columns="Pclass", aggfunc="size")

Pclass,1,2,3
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,94,76,144
male,122,108,347


## 2. Use Pandas to combine the data
In practice, it is rare to have a complete, clean, and already-merged dataset. You typically need to combine several datasets into one. Pandas has many tools to help you achieve this. Now let's try some of them.

In [32]:
# Setup for the merge examples: split df into three smaller tables.
# The details of this cell are not important for the lesson and can be skipped.
# sample(frac=1) returns all rows in shuffled order and reset_index(drop=True)
# renumbers them from 0, so the three tables no longer share a row order.
# No random_state is passed, so the row order may differ from run to run.
df_personal = df.loc[:, ["Name", "Sex", "Age"]].sample(frac=1).reset_index(drop=True)
df_ticket = df.loc[:, ['PassengerId', 'Pclass', 'Name', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']].sample(frac=1).reset_index(drop=True)
df_survival = df.loc[:, ['PassengerId', 'Survived']].sample(frac=1).reset_index(drop=True)

In [33]:
# Preview the personal-information table (columns Name, Sex, Age).
df_personal.head()

Unnamed: 0,Name,Sex,Age
0,"Davison, Mrs. Thomas Henry (Mary E Finck)",female,
1,"Taussig, Mrs. Emil (Tillie Mandelbaum)",female,39.0
2,"Waelens, Mr. Achille",male,22.0
3,"Sobey, Mr. Samuel James Hayden",male,25.0
4,"Jensen, Mr. Hans Peder",male,20.0


In [34]:
# Preview the ticket table; it carries both PassengerId and Name.
df_ticket.head()

Unnamed: 0,PassengerId,Pclass,Name,Parch,Ticket,Fare,Cabin,Embarked
0,187,3,"O'Brien, Mrs. Thomas (Johanna ""Hannah"" Godfrey)",0,370365,15.5,,Q
1,121,2,"Hickman, Mr. Stanley George",0,S.O.C. 14879,73.5,,S
2,613,3,"Murphy, Miss. Margaret Jane",0,367230,15.5,,Q
3,265,3,"Henry, Miss. Delia",0,382649,7.75,,Q
4,230,3,"Lefebre, Miss. Mathilde",1,4133,25.4667,,S


In [35]:
# Preview the survival table (columns PassengerId, Survived).
df_survival.head()

Unnamed: 0,PassengerId,Survived
0,776,0
1,280,1
2,778,1
3,395,1
4,765,0


### merge() and concat()
When you have multiple datasets and you want to bundle them, you can use merge(). merge() combines two datasets based on a "key".
The key is usually an ID or a name. With merge(), you can choose different ways to merge the data. For instance, you can decide whether to keep only the IDs that exist in both datasets or to keep all the IDs.

In [36]:
# Join the ticket and survival tables on their shared "PassengerId" key.
# how='outer' keeps the IDs found in either table, and indicator=True adds a
# "_merge" column that records where each row's key was found.
pd.merge(df_ticket, df_survival, on='PassengerId', how='outer', indicator=True)

Unnamed: 0,PassengerId,Pclass,Name,Parch,Ticket,Fare,Cabin,Embarked,Survived,_merge
0,187,3,"O'Brien, Mrs. Thomas (Johanna ""Hannah"" Godfrey)",0,370365,15.5000,,Q,1,both
1,121,2,"Hickman, Mr. Stanley George",0,S.O.C. 14879,73.5000,,S,0,both
2,613,3,"Murphy, Miss. Margaret Jane",0,367230,15.5000,,Q,1,both
3,265,3,"Henry, Miss. Delia",0,382649,7.7500,,Q,0,both
4,230,3,"Lefebre, Miss. Mathilde",1,4133,25.4667,,S,0,both
...,...,...,...,...,...,...,...,...,...,...
886,712,1,"Klaber, Mr. Herman",0,113028,26.5500,C124,S,0,both
887,489,3,"Somerton, Mr. Francis William",0,A.5. 18509,8.0500,,S,0,both
888,26,3,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia...",5,347077,31.3875,,S,1,both
889,291,1,"Barber, Miss. Ellen ""Nellie""",0,19877,78.8500,,S,1,both


Besides the case where several datasets share one ID, you will sometimes face a scenario in which there are many dataframes with the same structure that were collected at different times. To analyze the whole data, you need to concatenate them.

In [37]:
# Simulate two batches with the same structure collected at different times:
# df_old holds the first 400 rows of df and df_new holds the remaining rows.
df_old = df.iloc[:400, :]
df_new = df.iloc[400:, :]

In [38]:
# Preview the first batch; its row labels start at 0.
df_old.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [39]:
# Preview the second batch; its row labels continue from 400.
df_new.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
400,401,1,3,"Niskanen, Mr. Juha",male,39.0,0,0,STON/O 2. 3101289,7.925,,S
401,402,0,3,"Adams, Mr. John",male,26.0,0,0,341826,8.05,,S
402,403,0,3,"Jussila, Miss. Mari Aina",female,21.0,1,0,4137,9.825,,S
403,404,0,3,"Hakkarainen, Mr. Pekka Pietari",male,28.0,1,0,STON/O2. 3101279,15.85,,S
404,405,0,3,"Oreskovic, Miss. Marija",female,20.0,0,0,315096,8.6625,,S


In [41]:
# Stack the two batches vertically (df_old's rows followed by df_new's) to
# rebuild the full table; each row keeps its original index label.
pd.concat([df_old, df_new])

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## 3. Use Pandas and Numpy to clean and transform the data
Data is not always clean. In fact, most of your time as a data analyst will be spent cleaning the data.

### fillna() and dropna()

In [43]:
# Check every column for missing values: True means the column contains at
# least one missing entry.
df.isnull().any()

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
dtype: bool

In [44]:
# Option 1: replace every missing value with 0, then confirm none remain.
# Note that the 0 is also written into the text columns Cabin and Embarked,
# not only into the numeric Age column.
df_nona = df.fillna(0)
df_nona.isnull().any()

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age            False
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin          False
Embarked       False
dtype: bool

In [45]:
# Option 2: drop every row that contains a missing value, then confirm none
# remain. This rebinds df_nona, replacing the result of the fillna(0) cell.
df_nona = df.dropna()
df_nona.isnull().any()

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age            False
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin          False
Embarked       False
dtype: bool