# 21 Pandas operations for absolute beginners

https://towardsdatascience.com/21-pandas-operations-for-absolute-beginners-5653e54f4cda

# Titanic Dataset

<img src="images/titanic_datadescription.gif" alt="Titanic data description">

In [1]:
# import python packages
import pandas as pd
import numpy as np
import matplotlib as plt
%matplotlib inline

How to read data from a CSV file or a text file?

In [2]:
# Load the Titanic training CSV into a DataFrame and preview the first rows.
csv_path = "data/titanic_train.csv"
df = pd.read_csv(csv_path)
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,350.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",hi,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",hello,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,0.35,0,0,373450,8.05,,S


In [3]:
# Print the column labels as a single-line Python list.
column_labels = df.columns.tolist()
print(column_labels)

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


How to create a data frame using a dictionary of pre-existing columns or NumPy 2D arrays?

In [4]:
# Convert the frame's underlying array into a list of row lists,
# then show how many rows were produced.
row_array = df.values
dflist = row_array.tolist()
len(dflist)

891

In [7]:
# Peek at the first two row lists to confirm their structure.
for row_values in dflist[:2]:
    print(row_values)

[1, 0, 3, 'Braund, Mr. Owen Harris', 'male', 22.0, 1, 0, 'A/5 21171', 7.25, nan, 'S']
[2, 1, 1, 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)', 'female', 350.0, 1, 0, 'PC 17599', 71.2833, 'C85', 'C']


In [5]:
# Rebuild a DataFrame from the list of rows. No column labels are supplied,
# so the columns come out numbered by position (0, 1, 2, ...).
df1 = pd.DataFrame(data=dflist)
df1.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,350.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",hi,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",hello,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,0.35,0,0,373450,8.05,,S


In [8]:
# Column labels to put back on df1, listed in positional order.
new_cols = [
    'PassengerId', 'Survived', 'Pclass', 'Name',
    'Sex', 'Age', 'SibSp', 'Parch',
    'Ticket', 'Fare', 'Cabin', 'Embarked',
]

In [9]:
# Relabel df1's positional columns (0, 1, 2, ...) with the names in new_cols.
# Pairing technique adapted from:
# https://stackoverflow.com/questions/38101009/changing-multiple-column-names-but-not-all-of-them-pandas-python
# zip() pairs each current label with its replacement by position. rename()
# returns a new frame, so df1 is rebound instead of being mutated with
# inplace=True, which keeps the cell safe to re-run and chainable.
df1 = df1.rename(columns=dict(zip(df1.columns, new_cols)))
df1.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,350.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",hi,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",hello,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,0.35,0,0,373450,8.05,,S


How to visualize the top and bottom x values in a data frame?

In [10]:
# Show the first five rows (five is also head()'s default).
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,350.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",hi,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",hello,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,0.35,0,0,373450,8.05,,S


In [11]:
# Show the last five rows (five is also tail()'s default).
df.tail(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


How to get column names in a list?

In [12]:
# Materialize the column labels as a plain Python list.
list(df.columns)

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

How to get the frequency of values in a series?

In [13]:
# Count how often each distinct Pclass value occurs, most frequent first.
pclass_column = df['Pclass']
pclass_column.value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

How to reset an index to an existing column or another list or array?

In [14]:
# Show only the first row of the rebuilt frame.
df1.head(n=1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


In [15]:
# List df1's column labels to confirm the rename took effect.
list(df1.columns)

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [18]:
# Build a new frame indexed by the Name column; set_index returns a copy,
# so df1 itself keeps its original integer index.
df2 = df1.set_index(keys='Name')
df2.head(n=5)

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"Braund, Mr. Owen Harris",1,0,3,male,22.0,1,0,A/5 21171,7.25,,S
"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",2,1,1,female,350.0,1,0,PC 17599,71.2833,C85,C
"Heikkinen, Miss. Laina",3,1,3,hi,26.0,0,0,STON/O2. 3101282,7.925,,S
"Futrelle, Mrs. Jacques Heath (Lily May Peel)",4,1,1,hello,35.0,1,0,113803,53.1,C123,S
"Allen, Mr. William Henry",5,0,3,male,0.35,0,0,373450,8.05,,S


In [19]:
# Summarize the frame: index range, each column's non-null count and dtype,
# and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


How to iterate over rows?

In [20]:
# Walk the frame one row at a time; iterrows() yields (index label, row Series)
# pairs. Print each passenger's Fare and Age side by side.
for row_label, passenger in df.iterrows():
    print(passenger['Fare'], passenger['Age'])

7.25 22.0
71.2833 350.0
7.925 26.0
53.1 35.0
8.05 0.35
8.4583 nan
51.8625 54.0
21.075 2.0
11.1333 27.0
30.0708 14.0
16.7 4.0
26.55 58.0
8.05 20.0
31.275 39.0
7.8542 14.0
16.0 55.0
29.125 2.0
13.0 nan
18.0 31.0
7.225 nan
26.0 35.0
13.0 34.0
8.0292 15.0
35.5 28.0
21.075 8.0
31.3875 38.0
7.225 nan
263.0 19.0
7.8792 nan
7.8958 nan
27.7208 40.0
146.5208 nan
7.75 nan
10.5 66.0
82.1708 28.0
52.0 42.0
7.2292 nan
8.05 21.0
18.0 18.0
11.2417 14.0
9.475 40.0
21.0 27.0
7.8958 nan
41.5792 3.0
7.8792 19.0
8.05 nan
15.5 nan
7.75 nan
21.6792 nan
17.8 18.0
39.6875 7.0
7.8 21.0
76.7292 49.0
26.0 29.0
61.9792 65.0
35.5 nan
10.5 21.0
7.2292 28.5
27.75 5.0
46.9 11.0
7.2292 22.0
80.0 38.0
83.475 45.0
27.9 4.0
27.7208 nan
15.2458 nan
10.5 29.0
8.1583 19.0
7.925 17.0
8.6625 26.0
10.5 32.0
46.9 16.0
73.5 21.0
14.4542 26.0
56.4958 32.0
7.65 25.0
7.8958 nan
8.05 nan
29.0 0.83
12.475 30.0
9.0 22.0
9.5 29.0
7.7875 nan
47.1 28.0
10.5 17.0
15.85 33.0
34.375 16.0
8.05 nan
263.0 23.0
8.05 24.0
8.05 29.0
7.8542 20.0
61

In [27]:
# Print each fare with its fractional part dropped (int() truncates).
for row_label, passenger in df.iterrows():
    whole_fare = int(passenger['Fare'])
    print(whole_fare)

7
71
7
53
8
8
51
21
11
30
16
26
8
31
7
16
29
13
18
7
26
13
8
35
21
31
7
263
7
7
27
146
7
10
82
52
7
8
18
11
9
21
7
41
7
8
15
7
21
17
39
7
76
26
61
35
10
7
27
46
7
80
83
27
27
15
10
8
7
8
10
46
73
14
56
7
7
8
29
12
9
9
7
47
10
15
34
8
263
8
8
7
61
20
7
8
34
63
23
26
7
7
77
8
7
7
7
7
7
24
52
14
8
9
14
7
7
21
247
31
73
8
30
13
77
11
7
7
22
6
7
7
14
26
13
15
26
53
9
79
15
7
15
6
11
36
7
34
26
13
12
66
8
14
7
61
7
8
8
69
16
15
7
8
39
20
55
27
25
56
33
29
11
7
30
7
25
28
13
0
69
15
31
39
22
50
15
26
15
7
13
13
7
26
27
146
7
8
7
13
9
69
6
7
8
10
15
18
7
31
7
21
7
13
7
113
7
27
76
10
8
13
8
7
90
9
10
7
13
25
83
7
13
31
10
7
26
26
10
12
14
15
10
7
7
90
7
14
52
26
7
10
26
16
20
15
79
86
512
26
7
31
79
0
7
10
39
7
153
135
31
0
19
29
7
77
7
0
29
20
7
7
9
8
26
8
9
7
13
7
78
91
12
8
7
27
7
151
30
247
7
23
0
12
8
151
110
108
24
56
83
262
26
7
26
7
26
14
164
134
7
7
12
29
69
135
6
13
20
57
23
28
153
18
133
7
66
134
8
35
26
263
13
13
13
13
13
16
15
8
9
35
7
17
7
9
55
13
7
7
27
27
14
7
15
7
75
7
7
69
55

In [22]:
# Print each fare truncated to a whole number and then multiplied by 100.
for row_label, passenger in df.iterrows():
    whole_fare = int(passenger['Fare'])
    print(whole_fare * 100)

700
7100
700
5300
800
800
5100
2100
1100
3000
1600
2600
800
3100
700
1600
2900
1300
1800
700
2600
1300
800
3500
2100
3100
700
26300
700
700
2700
14600
700
1000
8200
5200
700
800
1800
1100
900
2100
700
4100
700
800
1500
700
2100
1700
3900
700
7600
2600
6100
3500
1000
700
2700
4600
700
8000
8300
2700
2700
1500
1000
800
700
800
1000
4600
7300
1400
5600
700
700
800
2900
1200
900
900
700
4700
1000
1500
3400
800
26300
800
800
700
6100
2000
700
800
3400
6300
2300
2600
700
700
7700
800
700
700
700
700
700
2400
5200
1400
800
900
1400
700
700
2100
24700
3100
7300
800
3000
1300
7700
1100
700
700
2200
600
700
700
1400
2600
1300
1500
2600
5300
900
7900
1500
700
1500
600
1100
3600
700
3400
2600
1300
1200
6600
800
1400
700
6100
700
800
800
6900
1600
1500
700
800
3900
2000
5500
2700
2500
5600
3300
2900
1100
700
3000
700
2500
2800
1300
0
6900
1500
3100
3900
2200
5000
1500
2600
1500
700
1300
1300
700
2600
2700
14600
700
800
700
1300
900
6900
600
700
800
1000
1500
1800
700
3100
700
2100
700
1300
700
1130

In [23]:
# Add up the whole-number part of every fare by iterating over the rows.
# The accumulator is called fare_total rather than `sum`: assigning to `sum`
# would shadow Python's built-in sum() for every later cell in the kernel.
fare_total = 0
for row_label, passenger in df.iterrows():
    fare_total += int(passenger['Fare'])  # int() drops the fractional part
print(fare_total)

28321


How to apply a function to each element of a series?

In [24]:
def f(x, offset=100):
    """Return ``x`` shifted upward by ``offset``.

    Args:
        x: Value to shift; anything that supports ``+`` with ``offset``.
        offset: Amount added to ``x``. Defaults to 100, the constant that was
            previously hard-coded, so ``f(x)`` behaves exactly as before.

    Returns:
        ``x + offset``. A NaN input yields NaN.
    """
    return x + offset

In [25]:
# Run f on every Age value; the result is a new Series and df is unchanged.
age_column = df['Age']
age_column.apply(f)

0      122.00
1      450.00
2      126.00
3      135.00
4      100.35
5         NaN
6      154.00
7      102.00
8      127.00
9      114.00
10     104.00
11     158.00
12     120.00
13     139.00
14     114.00
15     155.00
16     102.00
17        NaN
18     131.00
19        NaN
20     135.00
21     134.00
22     115.00
23     128.00
24     108.00
25     138.00
26        NaN
27     119.00
28        NaN
29        NaN
        ...  
861    121.00
862    148.00
863       NaN
864    124.00
865    142.00
866    127.00
867    131.00
868       NaN
869    104.00
870    126.00
871    147.00
872    133.00
873    147.00
874    128.00
875    115.00
876    120.00
877    119.00
878       NaN
879    156.00
880    125.00
881    133.00
882    122.00
883    128.00
884    125.00
885    139.00
886    127.00
887    119.00
888       NaN
889    126.00
890    132.00
Name: Age, Length: 891, dtype: float64

In [26]:
# Store the result of applying f to Age in a new Current_Age column,
# then preview the frame to confirm the column was added.
shifted_ages = df['Age'].apply(f)
df['Current_Age'] = shifted_ages
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Current_Age
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,122.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,350.0,1,0,PC 17599,71.2833,C85,C,450.0
2,3,1,3,"Heikkinen, Miss. Laina",hi,26.0,0,0,STON/O2. 3101282,7.925,,S,126.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",hello,35.0,1,0,113803,53.1,C123,S,135.0
4,5,0,3,"Allen, Mr. William Henry",male,0.35,0,0,373450,8.05,,S,100.35
