![](https://xakep.ru/wp-content/uploads/2018/01/150493/Green-Python-h.jpg)

# Библиотека Pandas

![](http://tripkendall.com/wp-content/uploads/2018/01/pandas_logo-1080x675.jpg)

[Pandas](http://pandas.pydata.org/) - библиотека для обработки и анализа данных. Предназначена для данных разной природы - матричных, панельных данных, временных рядов. Претендует на звание самого мощного и гибкого средства для анализа данных с открытым исходным кодом.

In [1]:
import pandas as pd

## Загрузка и запись данных
Функции типа .read_формат и .to_формат считывают и записывают данные соответственно. Полный список можно найти в документации:
http://pandas.pydata.org/pandas-docs/stable/io.html

Научимся считывать данные в формате CSV (comma-separated values) функцией:

- [pd.read_csv()](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html#pandas.read_csv): 

Аргументов у нее очень много, критически важные:
 - **filepath_or_buffer** - текстовая строка с названием (адресом) файла
 - **sep** - разделитель между данными
 - **header** - номер строки файла, в которой указаны названия столбцов; None, если названий в файле нет
 - **names** - список с названиями колонок
 - **index_col** - номер столбца (или список номеров), значения которого используются как названия строк; None, если такого столбца нет

![](https://i.pinimg.com/originals/54/62/bd/5462bd4d6ec6150ce91fbc70ac46d5f7.jpg)

In [2]:
train = pd.read_csv('00_titanic_train.csv')

In [3]:
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


## Описание данных

| Variable | Definition | Key |
|----------|------------|-----|
|survival | Survival | 0 = No <br> 1 = Yes|
|pclass | Ticket class | 1 = 1st = Upper <br> 2 = 2nd = Middle <br> 3 = 3rd = Lower |
|sex | Sex | |
|Age | Age in years |  |
|sibsp | # of siblings / spouses aboard the Titanic | |
|parch | # of parents / children aboard the Titanic | |
|ticket | Ticket number | |
|fare | Passenger fare | |
|cabin | Cabin number | |
|embarked | Port of Embarkation | C = Cherbourg <br> Q = Queenstown <br> S = Southampton|

<b>age</b><br>
    Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5

<b>sibsp</b><br>
    The dataset defines family relations in this way...<br>
    Sibling = brother, sister, stepbrother, stepsister<br>
    Spouse = husband, wife (mistresses and fiancés were ignored)

<b>parch</b><br>
    The dataset defines family relations in this way...<br>
    Parent = mother, father<br>
    Child = daughter, son, stepdaughter, stepson<br>
    Some children travelled only with a nanny, therefore parch=0 for them.

## pandas: head, tail

https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.head.html
    
```python
DataFrame.head(n=5)
```
Return the first n rows.

In [4]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
train.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [6]:
train.tail(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


---
## pandas: shape

In [7]:
train.shape

(891, 12)

---
## pandas: columns, index


In [8]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [9]:
train.index

RangeIndex(start=0, stop=891, step=1)

## pandas: T/transpose()

In [10]:
train.head(4).T

Unnamed: 0,0,1,2,3
PassengerId,1,2,3,4
Survived,0,1,1,1
Pclass,3,1,3,1
Name,"Braund, Mr. Owen Harris","Cumings, Mrs. John Bradley (Florence Briggs Th...","Heikkinen, Miss. Laina","Futrelle, Mrs. Jacques Heath (Lily May Peel)"
Sex,male,female,female,female
Age,22,38,26,35
SibSp,1,1,0,1
Parch,0,0,0,0
Ticket,A/5 21171,PC 17599,STON/O2. 3101282,113803
Fare,7.25,71.2833,7.925,53.1


In [11]:
train.head(4).transpose()

Unnamed: 0,0,1,2,3
PassengerId,1,2,3,4
Survived,0,1,1,1
Pclass,3,1,3,1
Name,"Braund, Mr. Owen Harris","Cumings, Mrs. John Bradley (Florence Briggs Th...","Heikkinen, Miss. Laina","Futrelle, Mrs. Jacques Heath (Lily May Peel)"
Sex,male,female,female,female
Age,22,38,26,35
SibSp,1,1,0,1
Parch,0,0,0,0
Ticket,A/5 21171,PC 17599,STON/O2. 3101282,113803
Fare,7.25,71.2833,7.925,53.1


In [12]:
transpose_train = train.T

In [13]:
transpose_train.index

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

---
## pandas: info

In [14]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


---
## pandas: describe

In [15]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [16]:
train.describe(percentiles=[0.9, 0.99])

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
90%,802.0,1.0,3.0,50.0,1.0,2.0,77.9583
99%,882.1,1.0,3.0,65.87,5.0,4.0,249.00622
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


---
## pandas: Series

In [17]:
train['Name']

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
5                                       Moran, Mr. James
6                                McCarthy, Mr. Timothy J
7                         Palsson, Master. Gosta Leonard
8      Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)
9                    Nasser, Mrs. Nicholas (Adele Achem)
10                       Sandstrom, Miss. Marguerite Rut
11                              Bonnell, Miss. Elizabeth
12                        Saundercock, Mr. William Henry
13                           Andersson, Mr. Anders Johan
14                  Vestrom, Miss. Hulda Amanda Adolfina
15                      Hewlett, Mrs. (Mary D Kingcome) 
16                                  Rice, Master. Eugene
17                          Wil

In [18]:
type(train['Name'])

pandas.core.series.Series

In [19]:
# How do we walk over every name with a for loop? For example, print each 'John':
# the name is either followed by a space or stands at the very end of the string.
for passenger_name in train['Name']:
    if 'John ' in passenger_name or passenger_name.endswith('John'):
        print(passenger_name)

Cumings, Mrs. John Bradley (Florence Briggs Thayer)
Turpin, Mrs. William John Robert (Dorothy Ann Wonnacott)
Rogers, Mr. William John
Doling, Mrs. John T (Ada Julia Bone)
Barton, Mr. David John
Turpin, Mr. William John Robert
Cribb, Mr. John Hatfield
Bengtsson, Mr. John Viktor
Goldsmith, Master. Frank John William "Frankie"
Baumann, Mr. John D
Bourke, Mr. John
Perkin, Mr. John Henry
Mellors, Mr. William John
Lovell, Mr. John Hall ("Henry")
Sage, Mr. George John Jr
Goldsmith, Mrs. Frank John (Emily Alice Brown)
Adams, Mr. John
Matthews, Mr. William John
Smart, Mr. John Montgomery
Farthing, Mr. John
Goldsmith, Mr. Frank John
Davies, Master. John Morgan Jr
Thayer, Mr. John Borland Jr
Simmons, Mr. John
Flynn, Mr. John Irwin ("Irving")
Rush, Mr. Alfred George John
Thayer, Mrs. John Borland (Marian Longstreth Morris)
Ross, Mr. John Hugo
Jarvis, Mr. John Denzil
Chapman, Mr. John Henry
Horgan, Mr. John
Bowen, Mr. David John "Dai"
Bourke, Mrs. John (Catherine)
Weir, Col. John
Thayer, Mr. John B

---

## pandas: map

In [20]:
'MLStart'

'MLStart'

In [21]:
len('MLStart')

7

In [22]:
train['Name'].map(len)

0      23
1      51
2      22
3      44
4      24
5      16
6      23
7      30
8      49
9      35
10     31
11     24
12     30
13     27
14     36
15     32
16     20
17     28
18     55
19     23
20     20
21     21
22     27
23     28
24     29
25     57
26     23
27     30
28     29
29     19
       ..
861    27
862    51
863    33
864    22
865    24
866    28
867    36
868    27
869    31
870    17
871    48
872    24
873    27
874    37
875    32
876    29
877    20
878    18
879    45
880    44
881    18
882    28
883    29
884    22
885    36
886    21
887    28
888    40
889    21
890    19
Name: Name, Length: 891, dtype: int64

In [23]:
def is_Mr(s):
    """Return True when the string ``s`` contains the substring ``'Mr.'``.

    The trailing dot is part of the match, so ``'Mrs.'`` alone does not count.
    """
    title = 'Mr.'
    return title in s

In [24]:
is_Mr('Mr. machine learning')

True

In [25]:
train['Name'].map(lambda s: 'Mr. ' in s)

0       True
1      False
2      False
3      False
4       True
5       True
6       True
7      False
8      False
9      False
10     False
11     False
12      True
13      True
14     False
15     False
16     False
17      True
18     False
19     False
20      True
21      True
22     False
23      True
24     False
25     False
26      True
27      True
28     False
29      True
       ...  
861     True
862    False
863    False
864     True
865    False
866    False
867     True
868     True
869    False
870     True
871    False
872     True
873     True
874    False
875    False
876     True
877     True
878     True
879    False
880    False
881     True
882    False
883     True
884     True
885    False
886    False
887    False
888    False
889     True
890     True
Name: Name, Length: 891, dtype: bool

In [26]:
train['is_Mr'] = train['Name'].map(lambda s: 'Mr.' in s)
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_Mr
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,False
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,False
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,False
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,True


In [27]:
# Same result, but without the lambda and map. regex=False makes the pattern a
# literal substring: by default str.contains interprets it as a regular
# expression, where '.' would match any character rather than only a dot.
train['Name'].str.contains('Mr. ', regex=False)

0       True
1      False
2      False
3      False
4       True
5       True
6       True
7      False
8      False
9      False
10     False
11     False
12      True
13      True
14     False
15     False
16     False
17      True
18     False
19     False
20      True
21      True
22     False
23      True
24     False
25     False
26      True
27      True
28     False
29      True
       ...  
861     True
862    False
863    False
864     True
865    False
866    False
867     True
868     True
869    False
870     True
871    False
872     True
873     True
874    False
875    False
876     True
877     True
878     True
879    False
880    False
881     True
882    False
883     True
884     True
885    False
886    False
887    False
888    False
889     True
890     True
Name: Name, Length: 891, dtype: bool

In [28]:
train['Name'].is_unique

True

In [29]:
train['PassengerId'].is_unique

True

In [30]:
train['Age'].is_unique

False

---
## pandas: select columns

In [31]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_Mr
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,False
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,False
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,False
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,True


In [32]:
train[['PassengerId', 'Embarked', 'Pclass']]

Unnamed: 0,PassengerId,Embarked,Pclass
0,1,S,3
1,2,C,1
2,3,S,3
3,4,S,1
4,5,S,3
5,6,Q,3
6,7,S,1
7,8,S,3
8,9,S,3
9,10,C,2


---
## pandas: select rows

In [33]:
train[train['Sex'] == 'male'].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_Mr
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,True
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,True
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,True
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,True
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,False


In [34]:
train[train['PassengerId'] == 666]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_Mr
665,666,0,2,"Hickman, Mr. Lewis",male,32.0,2,0,S.O.C. 14879,73.5,,S,True


In [35]:
train[train['Age'].isnull()].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_Mr
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,True
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0,,S,True
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.225,,C,False
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.225,,C,True
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q,False


In [36]:
train[~train['Age'].isnull()].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_Mr
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,False
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,False
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,False
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,True


In [37]:
train[
    train['Embarked'].isin(['Q', 'C']) & (train['Age'] > 40)
].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_Mr
52,53,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49.0,1,0,PC 17572,76.7292,D33,C,False
54,55,0,1,"Ostby, Mr. Engelhart Cornelius",male,65.0,0,1,113509,61.9792,B30,C,True
96,97,0,1,"Goldschmidt, Mr. George B",male,71.0,0,0,PC 17754,34.6542,A5,C,True
116,117,0,3,"Connors, Mr. Patrick",male,70.5,0,0,370369,7.75,,Q,True
155,156,0,1,"Williams, Mr. Charles Duane",male,51.0,0,1,PC 17597,61.3792,,C,True


In [38]:
train[
    train['Embarked'].isin(['Q', 'C']) | (train['Age'] > 40)
].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,False
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,True
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,True
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,False
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S,False


---
## pandas: loc, iloc

In [39]:
# Keep the first five passengers who embarked at Q or C and are older than 40;
# this small frame is reused below to demonstrate loc and iloc.
tmp = train[
    train['Embarked'].isin(['Q', 'C']) & (train['Age'] > 40)
].head()
# .loc with a boolean mask selects the rows whose Age is missing. isnull()
# already yields booleans, so comparing the mask with True is unnecessary.
train.loc[train['Age'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_Mr
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,True
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S,True
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C,False
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C,True
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q,False
29,30,0,3,"Todoroff, Mr. Lalio",male,,0,0,349216,7.8958,,S,True
31,32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.5208,B78,C,False
32,33,1,3,"Glynn, Miss. Mary Agatha",female,,0,0,335677,7.7500,,Q,False
36,37,1,3,"Mamee, Mr. Hanna",male,,0,0,2677,7.2292,,C,True
42,43,0,3,"Kraeff, Mr. Theodor",male,,0,0,349253,7.8958,,C,True


In [40]:
tmp

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_Mr
52,53,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49.0,1,0,PC 17572,76.7292,D33,C,False
54,55,0,1,"Ostby, Mr. Engelhart Cornelius",male,65.0,0,1,113509,61.9792,B30,C,True
96,97,0,1,"Goldschmidt, Mr. George B",male,71.0,0,0,PC 17754,34.6542,A5,C,True
116,117,0,3,"Connors, Mr. Patrick",male,70.5,0,0,370369,7.75,,Q,True
155,156,0,1,"Williams, Mr. Charles Duane",male,51.0,0,1,PC 17597,61.3792,,C,True


In [41]:
tmp.loc[54:116, 'Pclass':'Parch']

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch
54,1,"Ostby, Mr. Engelhart Cornelius",male,65.0,0,1
96,1,"Goldschmidt, Mr. George B",male,71.0,0,0
116,3,"Connors, Mr. Patrick",male,70.5,0,0


In [42]:
tmp.iloc[1:4, 2:8]

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch
54,1,"Ostby, Mr. Engelhart Cornelius",male,65.0,0,1
96,1,"Goldschmidt, Mr. George B",male,71.0,0,0
116,3,"Connors, Mr. Patrick",male,70.5,0,0


---
## pandas: sort

In [43]:
train.sort_values('Name')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_Mr
845,846,0,3,"Abbing, Mr. Anthony",male,42.00,0,0,C.A. 5547,7.5500,,S,True
746,747,0,3,"Abbott, Mr. Rossmore Edward",male,16.00,1,1,C.A. 2673,20.2500,,S,True
279,280,1,3,"Abbott, Mrs. Stanton (Rosa Hunt)",female,35.00,1,1,C.A. 2673,20.2500,,S,False
308,309,0,2,"Abelson, Mr. Samuel",male,30.00,1,0,P/PP 3381,24.0000,,C,True
874,875,1,2,"Abelson, Mrs. Samuel (Hannah Wizosky)",female,28.00,1,0,P/PP 3381,24.0000,,C,False
365,366,0,3,"Adahl, Mr. Mauritz Nils Martin",male,30.00,0,0,C 7076,7.2500,,S,True
401,402,0,3,"Adams, Mr. John",male,26.00,0,0,341826,8.0500,,S,True
40,41,0,3,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",female,40.00,1,0,7546,9.4750,,S,False
855,856,1,3,"Aks, Mrs. Sam (Leah Rosen)",female,18.00,0,1,392091,9.3500,,S,False
207,208,1,3,"Albimona, Mr. Nassef Cassem",male,26.00,0,0,2699,18.7875,,C,True


In [44]:
train.sort_values('Fare', ascending=False)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_Mr
258,259,1,1,"Ward, Miss. Anna",female,35.00,0,0,PC 17755,512.3292,,C,False
737,738,1,1,"Lesurer, Mr. Gustave J",male,35.00,0,0,PC 17755,512.3292,B101,C,True
679,680,1,1,"Cardeza, Mr. Thomas Drake Martinez",male,36.00,0,1,PC 17755,512.3292,B51 B53 B55,C,True
88,89,1,1,"Fortune, Miss. Mabel Helen",female,23.00,3,2,19950,263.0000,C23 C25 C27,S,False
27,28,0,1,"Fortune, Mr. Charles Alexander",male,19.00,3,2,19950,263.0000,C23 C25 C27,S,True
341,342,1,1,"Fortune, Miss. Alice Elizabeth",female,24.00,3,2,19950,263.0000,C23 C25 C27,S,False
438,439,0,1,"Fortune, Mr. Mark",male,64.00,1,4,19950,263.0000,C23 C25 C27,S,True
311,312,1,1,"Ryerson, Miss. Emily Borie",female,18.00,2,2,PC 17608,262.3750,B57 B59 B63 B66,C,False
742,743,1,1,"Ryerson, Miss. Susan Parker ""Suzette""",female,21.00,2,2,PC 17608,262.3750,B57 B59 B63 B66,C,False
118,119,0,1,"Baxter, Mr. Quigg Edmond",male,24.00,0,1,PC 17558,247.5208,B58 B60,C,True


---
## pandas: create from list

In [45]:
pd.DataFrame(
    [[1,2,3], [6,None, 7]]
)

Unnamed: 0,0,1,2
0,1,2.0,3
1,6,,7


In [46]:
# Lookup table of embarkation ports, built from a list of (code, city) rows.
embarked_rows = [
    ('C', 'Cherbourg'),
    ('Q', 'Queenstown'),
    ('S', 'Southampton'),
]
df_embarked = pd.DataFrame(embarked_rows, columns=['id', 'City'])

In [47]:
df_embarked

Unnamed: 0,id,City
0,C,Cherbourg
1,Q,Queenstown
2,S,Southampton


---
## pandas: merge
https://pandas.pydata.org/pandas-docs/stable/merging.html

In [48]:
# Attach the city name to each passenger by matching train['Embarked']
# against df_embarked['id'].
# NOTE(review): no `how` is passed, so pandas' default inner join applies —
# passengers with a missing Embarked value would be absent from the result;
# confirm this is intended.
pd.merge(
    train,
    df_embarked,
    left_on='Embarked',
    right_on='id'
)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_Mr,id,City
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,True,S,Southampton
1,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,False,S,Southampton
2,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,False,S,Southampton
3,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,True,S,Southampton
4,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,True,S,Southampton
5,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S,False,S,Southampton
6,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,False,S,Southampton
7,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7000,G6,S,False,S,Southampton
8,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S,False,S,Southampton
9,13,0,3,"Saundercock, Mr. William Henry",male,20.0,0,0,A/5. 2151,8.0500,,S,True,S,Southampton


---
## pandas: groupby

In [49]:
train[['PassengerId', 'Sex']].groupby('Sex').count()

Unnamed: 0_level_0,PassengerId
Sex,Unnamed: 1_level_1
female,314
male,577


In [50]:
train[['Age', 'Sex']].groupby('Sex').median()

Unnamed: 0_level_0,Age
Sex,Unnamed: 1_level_1
female,27.0
male,29.0


---
## pandas: pivot_table

In [51]:
# Passenger counts: one row per Pclass, one column per Survived value.
d = train.pivot_table(
    values='PassengerId',
    index='Pclass',
    columns='Survived',
    aggfunc='count',
)

In [1]:
d.sum(1)

NameError: name 'd' is not defined

In [53]:
# share of non-survivors (0) / survivors (1) within each passenger class:
# every count is divided by its row total
d.div(d.sum(1), axis='rows')

Survived,0,1
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.37037,0.62963
2,0.527174,0.472826
3,0.757637,0.242363


# Visualization

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
import seaborn as sns

In [4]:
# Number of passengers of each sex. catplot is the current name of the former
# factorplot; the column is passed by keyword because newer seaborn releases
# do not accept it positionally.
sns.catplot(x='Sex', data=train, kind='count');

NameError: name 'train' is not defined

In [None]:
# Number of passengers in each ticket class. catplot is the current name of
# the former factorplot; the column is passed by keyword because newer seaborn
# releases do not accept it positionally.
sns.catplot(x='Pclass', data=train, kind='count');

In [None]:
_, ax = plt.subplots(figsize=(20,8))

train.pivot_table(values='PassengerId', index='Pclass', columns='Survived', aggfunc='count').plot(
    kind='bar', stacked=True, ax=ax
)
plt.show()

In [None]:
# Mean survival per ticket class, split by sex. catplot replaces the former
# factorplot; kind='point' is spelled out because factorplot drew point plots
# by default while catplot's default kind is different.
sns.catplot(x='Pclass', y='Survived', hue='Sex', data=train, kind='point');

In [None]:
train['Age'].hist(bins=25)
plt.title("Распределение возраста  пассажиров");

In [None]:
# Normalised age histogram with a kernel density estimate drawn on top.
# density=True replaces the old "normed" argument, which current matplotlib
# releases no longer accept.
train['Age'].hist(bins=25, density=True)
train['Age'].plot(kind='kde')

# Limit the x axis to the 0-80 range.
plt.xlim(0, 80)
plt.title("Распределение возраста  пассажиров");

In [None]:
# Overlay the age histograms of men and women; alpha keeps both visible.
male_age = train.loc[train['Sex'] == "male", 'Age']
female_age = train.loc[train['Sex'] == "female", 'Age']
male_age.hist(alpha=0.6, label="Male")
female_age.hist(alpha=0.6, label="Female")
plt.legend()
plt.title("Распределение возраста  пассажиров");

In [None]:
# Same overlay as the plain histograms, but each one is normalised so groups of
# different size can be compared. density=True replaces the old "normed"
# argument, which current matplotlib releases no longer accept.
train['Age'][train.Sex=="male"].hist(alpha=0.6, label="Male", density=True)
train['Age'][train.Sex=="female"].hist(alpha=0.6, label="Female", density=True)
plt.legend()
plt.title("Распределение возраста  пассажиров");

In [None]:
train[train.Sex=='male']

In [None]:
# 2x2 grid sharing the y axis: box plots on the top row, violin plots on the
# bottom row; men in the left column, women in the right column.
_, axes = plt.subplots(2, 2, sharey=True, figsize=(15, 12))

for row, draw in enumerate((sns.boxplot, sns.violinplot)):
    for col, sex in enumerate(('male', 'female')):
        draw(x='Survived', y='Age', data=train[train.Sex == sex], ax=axes[row, col])

In [None]:
df_numeric = train.drop(['Name', 'Ticket', 'Cabin'], axis=1)

In [None]:
df_numeric['Age'] = df_numeric['Age'].fillna(-1)

In [None]:
plt.figure(figsize=(12, 10))
# df_numeric still contains text columns (only Name, Ticket and Cabin were
# dropped), and recent pandas versions raise on them in corr() instead of
# silently skipping them. Keep the numeric and boolean columns explicitly so
# the Kendall correlation works on old and new pandas alike.
numeric_part = df_numeric.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_part.corr('kendall'), annot=True);

In [None]:
df_numeric.hist(figsize=(20, 12));

In [None]:
sns.pairplot(df_numeric, hue='Survived');