In [1]:
# Widen the notebook container so wide outputs (e.g. DataFrames) are not clipped.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))

# Training Workshop Part I (skip this if you already know Python)
Markdown reference: https://jupyter-notebook.readthedocs.io/en/stable/examples/Notebook/Working%20With%20Markdown%20Cells.html

1. Quick review of Python fundamentals
2. NumPy
3. Basic introduction to pandas

## 1. Import Package

In [2]:
import pandas as pd
import numpy as np

##### everything in python is an object

In [3]:
a = 3  # the name `a` refers to an int object
b = 6
print(a)
print(type(a))  # every object has a type (its class)
print(id(a))  # id() gives the object's identity
print(id(b))  # a different object, so a different identity

3
<class 'int'>
140728228090752
140728228090848


In [4]:
print("this is a test")

this is a test


In [5]:
!python --version

Python 3.7.3


In [6]:
np.__version__

'1.16.4'

## 2. Python Fundamentals

In this part, we will recap some fundamental operations in Python. 

In [7]:
# print('comment')
type('123')
#type(1 == 2)

str

In [8]:
a = 2 * 5 # assign the result of an expression to a variable
a # an assignment displays nothing; a bare expression on the last line of a cell is displayed

10

In [9]:
a * 199 # this is an expression, result is not stored

1990

In [10]:
# list
ages = [15,17,10,23,26,27,29,30]

In [11]:
new_ages = [e * 10 for e in ages] # list comprehension: builds a new list from each element of ages
new_ages

[150, 170, 100, 230, 260, 270, 290, 300]

In [12]:
# list can actually hold different types of values
combined = [15, 'str', 3.0, 'blah..']
combined

[15, 'str', 3.0, 'blah..']

In [13]:
# dict
menu = {
    "Big Mac": 3.99,
    "McSpicy": 5.45,
    "McWings": 4.25
}
menu

{'Big Mac': 3.99, 'McSpicy': 5.45, 'McWings': 4.25}

In [14]:
# iterate list values
for e in ages:
    print(e)

15
17
10
23
26
27
29
30


In [15]:
# iterate list with index and value
for i, e in enumerate(ages):
    print(i, e)

0 15
1 17
2 10
3 23
4 26
5 27
6 29
7 30


In [16]:
# key-value iteration over a dict
for k, v in menu.items():
    print(k, v)

Big Mac 3.99
McSpicy 5.45
McWings 4.25


In [17]:
# iterating over a dict directly yields its keys
for k in menu:
    print(k)

Big Mac
McSpicy
McWings


### function and lambda function

In [18]:
# function
def cube(x):
    """Return ``x`` raised to the third power via repeated multiplication."""
    squared = x * x
    return squared * x
print(type(cube))
print(id(cube))
print(cube(3))

<class 'function'>
1300604166760
27


- `cube` is an instance of the `function` class.

- `a` was an instance of the `int` class.

- Both are objects; the only difference is that an integer object holds data whereas a function object holds code.

#### lambda is another way to define a function

The general syntax of a Python lambda:
```
lambda arguments: expression
```

In [19]:
# lambda: another way to define a function
g = lambda x: x * x * x
print(type(g))  # a lambda is an ordinary function object
print(id(g))
print(g(3))

<class 'function'>
1300604166216
27


- When to use lambda

1. Lambdas are most useful when you need a short, `one-off` function, typically one passed as an argument to another function.
2. See the examples of lambda in action below.

In [20]:
# filter: given any iterable (like a list), keep only the elements for which the function returns True
nums = [5, 7, 22, 97, 54, 62, 77, 23, 73, 61] 
even_nums = list(filter(lambda x: (x%2 == 0) , nums))
print(filter(lambda x: (x%2 == 0) , nums))  # filter() itself returns a lazy filter object, not a list
print(even_nums)

<filter object at 0x0000012ED20A92E8>
[22, 54, 62]


In [21]:
# Map is a Python built-in function that takes in a function and a sequence as arguments and then calls the input function on each item of the sequence.
nums_2x = list(map(lambda x: x * 2, nums))
print(nums_2x)
#nums_3x = list(map(g, nums))
nums_3x = list(map(cube, nums))
print(nums_3x)

[10, 14, 44, 194, 108, 124, 154, 46, 146, 122]
[125, 343, 10648, 912673, 157464, 238328, 456533, 12167, 389017, 226981]


In [22]:
# reduce: repeatedly applies a two-argument function to fold a sequence into a single value
from functools import reduce
nums_sum = reduce(lambda x, y: x + y, nums)  # ((5 + 7) + 22) + ... -> the sum of nums
print(nums_sum)

481


## 3. Numpy

In [23]:
import numpy as np

### Array Basics

In [24]:
x = np.array(nums)
print(x)
print(type(x))

[ 5  7 22 97 54 62 77 23 73 61]
<class 'numpy.ndarray'>


In [25]:
y = np.array([[2,5,6],[5,8,-1]])
print(y)
print(y.shape)

[[ 2  5  6]
 [ 5  8 -1]]
(2, 3)


In [26]:
# arange: half-open interval [start, stop) -- starts at the left boundary and advances by the given step
m = np.arange(1,100,5)
print(m)
print(type(m))
print(m.shape)

[ 1  6 11 16 21 26 31 36 41 46 51 56 61 66 71 76 81 86 91 96]
<class 'numpy.ndarray'>
(20,)


In [27]:
# linspace: closed interval [start, stop] -- evenly spaced points, both boundaries included
m2 = np.linspace(1,10,7)  # the third argument is the number of points, not the step
print(m2)
print(type(m2))
print(m2.dtype)

[ 1.   2.5  4.   5.5  7.   8.5 10. ]
<class 'numpy.ndarray'>
float64


In [28]:
# reshape: returns the same data in a new shape; the element count must stay the same
print(m)
print(m.shape)
print(m.reshape(10, 2))
print(m.reshape(4, 5))
#print(m.reshape(10,3))  # would fail: 20 elements cannot fill a 10x3 array
print(m)  # m itself still has its original shape

[ 1  6 11 16 21 26 31 36 41 46 51 56 61 66 71 76 81 86 91 96]
(20,)
[[ 1  6]
 [11 16]
 [21 26]
 [31 36]
 [41 46]
 [51 56]
 [61 66]
 [71 76]
 [81 86]
 [91 96]]
[[ 1  6 11 16 21]
 [26 31 36 41 46]
 [51 56 61 66 71]
 [76 81 86 91 96]]
[ 1  6 11 16 21 26 31 36 41 46 51 56 61 66 71 76 81 86 91 96]


In [29]:
# resize
# ndarray.resize changes the array in place, while reshape leaves the original array's shape untouched
print(x)
x.resize(2, 5)
print(x)

[ 5  7 22 97 54 62 77 23 73 61]
[[ 5  7 22 97 54]
 [62 77 23 73 61]]


In [30]:
print(np.ones((3,5)))
print(np.zeros(8))

[[1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]]
[0. 0. 0. 0. 0. 0. 0. 0.]


In [31]:
np.eye(10)

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

In [32]:
np.diag([2,5,6])

array([[2, 0, 0],
       [0, 5, 0],
       [0, 0, 6]])

In [33]:
# 2x3 integer array of ones; adding a scalar applies it to every element
t = np.ones(shape=(2, 3), dtype=int)
print(t, t + 10, sep="\n")

[[1 1 1]
 [1 1 1]]
[[11 11 11]
 [11 11 11]]


In [34]:
print(np.hstack([t, t+10]))
print((np.hstack([t, t+10])).shape)

[[ 1  1  1 11 11 11]
 [ 1  1  1 11 11 11]]
(2, 6)


In [35]:
print(np.vstack([t, t+ 10]))
# same as np.concatenate([t, t + 10], axis=0)

[[ 1  1  1]
 [ 1  1  1]
 [11 11 11]
 [11 11 11]]


### Array Indexing and Slicing

In [36]:
x = np.array([[ 5,  7, 22, 97, 54],
    [62, 77, 23, 73, 61]])
print(x)
print('-----------------')
print(x[0,0])  # row 0, column 0
print(x[1,2])  # row 1, column 2
print(x[0:])  # all rows from row 0 onward (the whole array)
print(x[:,-3])  # every row, third column from the end
print(x[-1,::2])  # last row, every second element

[[ 5  7 22 97 54]
 [62 77 23 73 61]]
-----------------
5
23
[[ 5  7 22 97 54]
 [62 77 23 73 61]]
[22 23]
[62 23 61]


In [37]:
index_temp = x>10
print(index_temp.dtype)
print(x[x>10])

bool
[22 97 54 62 77 23 73 61]


In [38]:
# distinguish between slicing and slice assignment
x = np.array([[ 5,  7, 22, 97, 54],
    [62, 77, 23, 73, 61]])
print(x)
r = x[:,-3] # create a slice of x, assign to r
print(r)
#r[:] = 1 # as slice points to the original array, changing values affect original array

r = 1 # this is assignment, only changes value for r
print(r)
print(x)

[[ 5  7 22 97 54]
 [62 77 23 73 61]]
[22 23]
1
[[ 5  7 22 97 54]
 [62 77 23 73 61]]


In [39]:
r = 100  # rebinding the name r again does not touch x
print(x)
x[:,-3] = 100  # slice assignment writes into x itself
print(x)

[[ 5  7 22 97 54]
 [62 77 23 73 61]]
[[  5   7 100  97  54]
 [ 62  77 100  73  61]]


### Array Operation

In [40]:
a = [1,3.4,1.5]
b = [3,4,2]
from operator import mul
print(list(map(mul, a, b)))

[3, 13.6, 3.0]


In [41]:
import numpy as np
a = [1.2,3.4,1.5]
b = [3,4,2]
print(a)
print(b)
print(np.array(a) * np.array(b))

[1.2, 3.4, 1.5]
[3, 4, 2]
[ 3.6 13.6  3. ]


In [42]:
np.array([2,5,8]).dot(np.array([0,1,2]))

21

In [43]:
x = np.array([[  5, 7, 1, 97, 54], [ 62, 77, 1, 100, 61]])
print(x)
x.T

[[  5   7   1  97  54]
 [ 62  77   1 100  61]]


array([[  5,  62],
       [  7,  77],
       [  1,   1],
       [ 97, 100],
       [ 54,  61]])

In [44]:
x.sum()

465

In [45]:
x.max()

100

In [46]:
x[1,3] = 100  # element assignment (this entry already holds 100, so x is unchanged)
print(x)
x.argmax() ## with no axis given, the index refers to the flattened array

[[  5   7   1  97  54]
 [ 62  77   1 100  61]]


8

In [47]:
np.sort(x)

array([[  1,   5,   7,  54,  97],
       [  1,  61,  62,  77, 100]])

In [48]:
# flatten() returns a 1-D copy of the array; call it as a method on the array itself
x.flatten()

array([  5,   7,   1,  97,  54,  62,  77,   1, 100,  61])

In [49]:
np.median(x)

57.5

In [50]:
print(np.mean(x))
print(np.sum(x)/np.size(x))

46.5
46.5


## 4. Introduction to DataFrame (skip this if you already know how to read CSV files)

In [51]:
import pandas as pd
titanic = pd.read_csv("titanic.csv")

In [52]:
pd.options.display.max_rows = 10 # show at most 10 rows when displaying; longer frames are truncated with "..."

#### Built-in Functions and Methods of pandas

In [53]:
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [54]:
# first n rows
titanic.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [55]:
titanic.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [56]:
titanic.index

RangeIndex(start=0, stop=891, step=1)

In [57]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [58]:
titanic.shape
# shape is a (number of rows, number of columns) tuple

(891, 12)

In [59]:
titanic.describe() # summary statistics (count, mean, standard deviation, quartiles, ...) for the numeric columns

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [60]:
titanic.min() # minimum value of each column; Cabin and Embarked (which contain missing values) are left out of the result
# NOTE(review): newer pandas releases may raise a TypeError on such columns instead of dropping them -- confirm before upgrading

PassengerId                      1
Survived                         0
Pclass                           1
Name           Abbing, Mr. Anthony
Sex                         female
Age                           0.42
SibSp                            0
Parch                            0
Ticket                      110152
Fare                             0
dtype: object

In [61]:
titanic.shape

(891, 12)

In [62]:
len(titanic)

891

In [63]:
titanic.size #row times column

10692

In [64]:
titanic.sort_values(by = 'Age', ascending=False) # sort rows by the Age column in descending order; rows with a missing Age end up last

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
630,631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80.0,0,0,27042,30.0000,A23,S
851,852,0,3,"Svensson, Mr. Johan",male,74.0,0,0,347060,7.7750,,S
493,494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C
96,97,0,1,"Goldschmidt, Mr. George B",male,71.0,0,0,PC 17754,34.6542,A5,C
116,117,0,3,"Connors, Mr. Patrick",male,70.5,0,0,370369,7.7500,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


#### Indexing DataFrame


##### select column

In [65]:
# select column
print(type(titanic['Age'])) # -> this returns a series
print(type(titanic[['Age']])) # -> this returns a dataframe

<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>


* A Series is the data structure for a single column of a DataFrame.
* Conceptually, a DataFrame is a collection of Series that share the same index.

In [66]:
titanic[['Age', 'Sex']].head() 

Unnamed: 0,Age,Sex
0,22.0,male
1,38.0,female
2,26.0,female
3,35.0,female
4,35.0,male


In [67]:
# select column with dot
print(titanic.Age)

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64


###### Get selected row information

In [68]:
# 1st method: iloc
# position based index
titanic.iloc[2]

PassengerId                         3
Survived                            1
Pclass                              3
Name           Heikkinen, Miss. Laina
Sex                            female
                        ...          
Parch                               0
Ticket               STON/O2. 3101282
Fare                            7.925
Cabin                             NaN
Embarked                            S
Name: 2, Length: 12, dtype: object

In [69]:
# last row
titanic.iloc[-1]

PassengerId                    891
Survived                         0
Pclass                           3
Name           Dooley, Mr. Patrick
Sex                           male
                      ...         
Parch                            0
Ticket                      370376
Fare                          7.75
Cabin                          NaN
Embarked                         Q
Name: 890, Length: 12, dtype: object

In [70]:
# select rows at positions 2 to 4 (the slice end, 5, is excluded)
titanic.iloc[2:5]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [71]:
# select last 5 row
titanic.iloc[-5:]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [72]:
# select a selection of rows
titanic.iloc[[2,5,141]] # -> params is a list

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
141,142,1,3,"Nysten, Miss. Anna Sofia",female,22.0,0,0,347081,7.75,,S


###### combining row and column selection
###### iloc = integer location (position-based indexing), loc = location based on labels

In [73]:
# iloc: first position is row, second position is column
titanic.iloc[0,3]

'Braund, Mr. Owen Harris'

In [74]:
# several rows (a list of positions) from a single column
titanic.iloc[[0,2,4], 3]

0     Braund, Mr. Owen Harris
2      Heikkinen, Miss. Laina
4    Allen, Mr. William Henry
Name: Name, dtype: object

In [75]:
titanic.iloc[[0,2,4], [3,5,8,9]]

Unnamed: 0,Name,Age,Ticket,Fare
0,"Braund, Mr. Owen Harris",22.0,A/5 21171,7.25
2,"Heikkinen, Miss. Laina",26.0,STON/O2. 3101282,7.925
4,"Allen, Mr. William Henry",35.0,373450,8.05


In [76]:
titanic.iloc[5,:]

PassengerId                   6
Survived                      0
Pclass                        3
Name           Moran, Mr. James
Sex                        male
                     ...       
Parch                         0
Ticket                   330877
Fare                     8.4583
Cabin                       NaN
Embarked                      Q
Name: 5, Length: 12, dtype: object

* loc gets rows (or columns) with particular labels from the index.
* iloc gets rows (or columns) at particular positions in the index (so it only takes integers).

In [77]:
newdf = titanic.set_index('Name')
newdf.loc['Rice, Master. Eugene']

PassengerId        17
Survived            0
Pclass              3
Sex              male
Age                 2
                ...  
Parch               1
Ticket         382652
Fare           29.125
Cabin             NaN
Embarked            Q
Name: Rice, Master. Eugene, Length: 11, dtype: object

In [78]:
newdf.loc['Rice, Master. Eugene', ['Age', 'Ticket']]

Age            2
Ticket    382652
Name: Rice, Master. Eugene, dtype: object

In [79]:
# slicing rows with loc
# label must be unique for slice
newdf.loc[:'Rice, Master. Eugene', ['Age', 'Ticket']]

Unnamed: 0_level_0,Age,Ticket
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
"Braund, Mr. Owen Harris",22.0,A/5 21171
"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",38.0,PC 17599
"Heikkinen, Miss. Laina",26.0,STON/O2. 3101282
"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,113803
"Allen, Mr. William Henry",35.0,373450
...,...,...
"Saundercock, Mr. William Henry",20.0,A/5. 2151
"Andersson, Mr. Anders Johan",39.0,347082
"Vestrom, Miss. Hulda Amanda Adolfina",14.0,350406
"Hewlett, Mrs. (Mary D Kingcome)",55.0,248706


In [80]:
newdf.loc[:'Rice, Master. Eugene', 'Age':'Ticket']

Unnamed: 0_level_0,Age,SibSp,Parch,Ticket
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171
"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",38.0,1,0,PC 17599
"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282
"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803
"Allen, Mr. William Henry",35.0,0,0,373450
...,...,...,...,...
"Saundercock, Mr. William Henry",20.0,0,0,A/5. 2151
"Andersson, Mr. Anders Johan",39.0,1,5,347082
"Vestrom, Miss. Hulda Amanda Adolfina",14.0,0,0,350406
"Hewlett, Mrs. (Mary D Kingcome)",55.0,0,0,248706


#### Pandas Series

In [81]:
age = titanic['Age']
age.dtype

dtype('float64')

##### each row/column is a Series!!

In [82]:
print(titanic.iloc[5])
print(type(titanic.iloc[5]))

PassengerId                   6
Survived                      0
Pclass                        3
Name           Moran, Mr. James
Sex                        male
                     ...       
Parch                         0
Ticket                   330877
Fare                     8.4583
Cabin                       NaN
Embarked                      Q
Name: 5, Length: 12, dtype: object
<class 'pandas.core.series.Series'>


In [83]:
print(age)
print(type(age))

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64
<class 'pandas.core.series.Series'>


In [84]:
age.describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

#### analyzing series

In [85]:
age.sum()

21205.17

In [86]:
# Python's built-in sum does not skip missing values (unlike Series.sum), so a single NaN makes the result NaN
sum(age)

nan

In [87]:
age.size

891

In [88]:
len(age)

891

In [89]:
age.mean()

29.69911764705882

In [90]:
age.mean(skipna=False)

nan

In [91]:
# unique() is a Series method; NaN is reported as one of the distinct values
age.unique()

array([22.  , 38.  , 26.  , 35.  ,   nan, 54.  ,  2.  , 27.  , 14.  ,
        4.  , 58.  , 20.  , 39.  , 55.  , 31.  , 34.  , 15.  , 28.  ,
        8.  , 19.  , 40.  , 66.  , 42.  , 21.  , 18.  ,  3.  ,  7.  ,
       49.  , 29.  , 65.  , 28.5 ,  5.  , 11.  , 45.  , 17.  , 32.  ,
       16.  , 25.  ,  0.83, 30.  , 33.  , 23.  , 24.  , 46.  , 59.  ,
       71.  , 37.  , 47.  , 14.5 , 70.5 , 32.5 , 12.  ,  9.  , 36.5 ,
       51.  , 55.5 , 40.5 , 44.  ,  1.  , 61.  , 56.  , 50.  , 36.  ,
       45.5 , 20.5 , 62.  , 41.  , 52.  , 63.  , 23.5 ,  0.92, 43.  ,
       60.  , 10.  , 64.  , 13.  , 48.  ,  0.75, 53.  , 57.  , 80.  ,
       70.  , 24.5 ,  6.  ,  0.67, 30.5 ,  0.42, 34.5 , 74.  ])

In [92]:
age.nunique()

88

In [93]:
# nunique by default does not count nan/null
age.nunique(dropna=False)

89

In [94]:
age.value_counts()

24.00    30
22.00    27
18.00    26
19.00    25
30.00    25
         ..
55.50     1
70.50     1
66.00     1
23.50     1
0.42      1
Name: Age, Length: 88, dtype: int64

In [95]:
age.value_counts(normalize=True)

24.00    0.042017
22.00    0.037815
18.00    0.036415
19.00    0.035014
30.00    0.035014
           ...   
55.50    0.001401
70.50    0.001401
66.00    0.001401
23.50    0.001401
0.42     0.001401
Name: Age, Length: 88, dtype: float64

In [96]:
age.value_counts(sort=True).head()

24.0    30
22.0    27
18.0    26
19.0    25
30.0    25
Name: Age, dtype: int64

In [97]:
age.value_counts(bins=10,sort=False)

(0.339, 8.378]       54
(8.378, 16.336]      46
(16.336, 24.294]    177
(24.294, 32.252]    169
(32.252, 40.21]     118
(40.21, 48.168]      70
(48.168, 56.126]     45
(56.126, 64.084]     24
(64.084, 72.042]      9
(72.042, 80.0]        2
Name: Age, dtype: int64

#### create pandas series

###### * from dataframe

In [98]:
# a single row or a single column of a DataFrame is a Series
print(type(titanic.Age))
print(type(titanic.iloc[0]))

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


##### * create from list

In [99]:
pd.Series([2,5,6])

0    2
1    5
2    6
dtype: int64

In [100]:
# pass index
pd.Series([2,5,6], index=['two','five','six'])

two     2
five    5
six     6
dtype: int64

##### * create from dictionary

In [101]:
# build the mapping with keyword arguments; its keys become the Series index
d = dict(two=2, five=5, six=6)
pd.Series(d)

two     2
five    5
six     6
dtype: int64

#### sort series

In [102]:
age = titanic.Age

# sort_values return a new series, won't affect original series
sorted_age = age.sort_values()
print(sorted_age.head())
print(age.head())

803    0.42
755    0.67
644    0.75
469    0.75
78     0.83
Name: Age, dtype: float64
0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: Age, dtype: float64


In [103]:
# use inplace to replace original series
tmp_num = pd.Series([2,3,4,1,100,15])
tmp_num.sort_values(inplace=True)
print(tmp_num)

3      1
0      2
1      3
2      4
5     15
4    100
dtype: int64


In [104]:
tmp_num.nlargest(3)

4    100
5     15
2      4
dtype: int64

In [105]:
tmp_num.idxmin()

3

#### Pandas Index

In [106]:
titanic.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [107]:
titanic.index

RangeIndex(start=0, stop=891, step=1)

### change index (row/column index of dataframe)

In [108]:
# Use PassengerId as the row index.  Rebinding the name (instead of inplace=True)
# plus the column check keeps this cell safe to re-run: once PassengerId is the
# index it is no longer a column, and a second set_index call would raise KeyError.
if 'PassengerId' in titanic.columns:
    titanic = titanic.set_index('PassengerId')

In [109]:
titanic

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [110]:
titanic.reset_index() # returns a new DataFrame with PassengerId back as a column and a default integer index; titanic itself is not modified

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [111]:
test_titanic = titanic.loc[:10,['Name', 'Sex']] # label-based selection: rows with index label up to and including 10, columns Name and Sex only

In [112]:
test_titanic

Unnamed: 0_level_0,Name,Sex
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"Braund, Mr. Owen Harris",male
2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female
3,"Heikkinen, Miss. Laina",female
4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female
5,"Allen, Mr. William Henry",male
6,"Moran, Mr. James",male
7,"McCarthy, Mr. Timothy J",male
8,"Palsson, Master. Gosta Leonard",male
9,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female
10,"Nasser, Mrs. Nicholas (Adele Achem)",female


In [113]:
test_titanic.columns = ['New_name', "New_sex"]

In [114]:
test_titanic

Unnamed: 0_level_0,New_name,New_sex
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"Braund, Mr. Owen Harris",male
2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female
3,"Heikkinen, Miss. Laina",female
4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female
5,"Allen, Mr. William Henry",male
6,"Moran, Mr. James",male
7,"McCarthy, Mr. Timothy J",male
8,"Palsson, Master. Gosta Leonard",male
9,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female
10,"Nasser, Mrs. Nicholas (Adele Achem)",female
