# Python toolboxes and libraries:
    -> Numpy
    -> SciPy
    -> SciKit-Learn
    -> Pandas
    
Variable Notes
pclass: A proxy for socio-economic status (SES)
1st = Upper
2nd = Middle
3rd = Lower

age: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5

sibsp: The dataset defines family relations in this way...
Sibling = brother, sister, stepbrother, stepsister
Spouse = husband, wife (mistresses and fiancés were ignored)

parch: The dataset defines family relations in this way...
Parent = mother, father
Child = daughter, son, stepdaughter, stepson
Some children travelled only with a nanny, therefore parch=0 for them.

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib as mpl
import seaborn as sns

In [8]:
# Read the training CSV file into a DataFrame.
# NOTE(review): the path is absolute and points at a local D: drive — adjust it when running elsewhere.

df = pd.read_csv('D:/Kaggle_Titanic/train.csv')
# preview the first rows of the loaded table
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [12]:
# dtype of the 'Ticket' column on its own, kept in Sp_dtype
Sp_dtype = df['Ticket'].dtype

# dtype of every column in the data frame
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

# Data Frame Attributes

In [26]:
# column labels of the data frame
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [27]:
# list holding both axes: the row index first, then the column labels
df.axes

[RangeIndex(start=0, stop=891, step=1),
 Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
        'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
       dtype='object')]

In [28]:
# number of dimensions (axes) of the data frame
df.ndim

2

In [29]:
# total number of elements (rows x columns)
df.size

10692

In [31]:
# return a tuple representing the total number of rows and columns
df.shape

(891, 12)

In [32]:
# NumPy array representation of the data
# NOTE(review): DataFrame.to_numpy() is, as far as I know, the accessor the pandas docs recommend over .values — confirm before switching
df.values

array([[1, 0, 3, ..., 7.25, nan, 'S'],
       [2, 1, 1, ..., 71.2833, 'C85', 'C'],
       [3, 1, 3, ..., 7.925, nan, 'S'],
       ...,
       [889, 0, 3, ..., 23.45, nan, 'S'],
       [890, 1, 1, ..., 30.0, 'C148', 'C'],
       [891, 0, 3, ..., 7.75, nan, 'Q']], dtype=object)

# Data Frame methods


In [37]:
# generate descriptive statistics for the numerical columns
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [40]:
# summary (count / unique / top / freq) of the object-typed columns
df.describe(include='object')

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Slemen, Mr. Richard James",male,1601,C23 C25 C27,S
freq,1,577,7,4,644


In [48]:
# Largest and smallest value of the Fare column
print(f"max fare: {df['Fare'].max()}")
print(f"min fare: {df['Fare'].min()}")

max fare: 512.3292
min fare: 0.0


In [52]:
# Mean, median and standard deviation of the Fare column
print(f"mean of fare: {df['Fare'].mean()}")
print(f"median of fare: {df['Fare'].median()}")
print(f"standard deviation of fare: {df['Fare'].std()}")

mean of fare: 32.204207968574636
median of fare: 14.4542
standard deviation of fare: 49.6934285971809


In [57]:
# draw four rows at random; no random_state is given, so the rows differ between runs
df.sample(n=4)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
700,701,1,1,"Astor, Mrs. John Jacob (Madeleine Talmadge Force)",female,18.0,1,0,PC 17757,227.525,C62 C64,C
207,208,1,3,"Albimona, Mr. Nassef Cassem",male,26.0,0,0,2699,18.7875,,C
314,315,0,2,"Hart, Mr. Benjamin",male,43.0,1,1,F.C.C. 13529,26.25,,S
817,818,0,2,"Mallet, Mr. Albert",male,31.0,1,1,S.C./PARIS 2079,37.0042,,C


# Using Count Function
How many passengers survived?

In [70]:
# select the Survived column for rows where Survived == 1, then count them
df.loc[df['Survived'] == 1, 'Survived'].count()


342

# Data frames groupby method 

In [72]:
# Mean of every numeric column, per sex.
# numeric_only=True leaves out the text columns (Name, Ticket, Cabin, Embarked);
# without it pandas >= 2.0 raises a TypeError when it tries to average them.
df.groupby(['Sex']).mean(numeric_only=True)

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
female,431.028662,0.742038,2.159236,27.915709,0.694268,0.649682,44.479818
male,454.147314,0.188908,2.389948,30.726645,0.429809,0.235702,25.523893


In [80]:
# keep only the survivors, then count them per sex
df.loc[df['Survived'] == 1].groupby('Sex')[['Survived']].count()

Unnamed: 0_level_0,Survived
Sex,Unnamed: 1_level_1
female,233
male,109


In [83]:
# number of passengers in each Survived group (counted through the Sex column)
df.groupby(by='Survived')[['Sex']].count()

Unnamed: 0_level_0,Sex
Survived,Unnamed: 1_level_1
0,549
1,342


In [87]:
# Passengers with a recorded age, split by survival and sex.
# count() skips missing values, so the groups add up to the 714 known ages
# rather than all 891 rows; use .size() instead for a head count per group.
df.groupby(['Survived', 'Sex'])[['Age']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Age
Survived,Sex,Unnamed: 2_level_1
0,female,64
0,male,360
1,female,197
1,male,93


In [88]:
# total fare paid, per sex
df.groupby(by='Sex')[['Fare']].sum()

Unnamed: 0_level_0,Fare
Sex,Unnamed: 1_level_1
female,13966.6628
male,14727.2865


In [None]:
# using aggregate function