# Exploring and Processing Data

In [1]:
# imports
import pandas as pd
import numpy as np
import os

## Import Data

In [3]:
# read both CSV files, using the PassengerId column as the row index
# (every other read_csv parameter is left at its default)
train_df, test_df = (
    pd.read_csv(csv_name, index_col='PassengerId')
    for csv_name in ("train.csv", "test.csv")
)

## Basic Structure

In [4]:
# use .info() to print a brief summary of the training dataframe:
# index range, non-null count and dtype of every column, and memory usage
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Name        891 non-null object
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Ticket      891 non-null object
Fare        891 non-null float64
Cabin       204 non-null object
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [None]:
# brief .info() summary of the test dataframe
test_df.info()

In [None]:
# Add a 'Survived' column to test_df (train_df already has one; presumably
# test.csv does not -- verify against the file).
# NOTE: -888 is only a placeholder, not a real outcome. Any statistic computed
# over 'Survived' on a frame that includes these rows will be distorted by it.
test_df['Survived'] = -888 # Adding Survived with a default value

In [None]:
# stack the training rows and the test rows into one combined dataframe
df = pd.concat([train_df, test_df], axis='index')

In [None]:
# .head() shows the first rows of the dataframe (5 rows here)
df.head(n=5)

In [None]:
# .tail() shows the last rows of the dataframe (5 rows here)
df.tail(n=5)

In [None]:
# selection using the column name as a string inside [] : picks the single 'Name' column
df['Name']

In [None]:
# selecting multiple columns: pass a list of column name strings inside []
# (note the double brackets -- the inner pair is the list)
df[['Name','Age']]

In [None]:
# indexing : .loc selects by index label
# rows labelled 5 through 10, every column
df.loc[5:10, :]

In [None]:
# selecting discrete columns: the second .loc argument is a list of column labels,
# the first is the row-label slice
df.loc[5:10, ['Survived', 'Fare','Embarked']]

In [None]:
# keep only the rows whose Sex value equals 'male' (boolean mask passed to .loc)
male_passengers = df.loc[df['Sex'].eq('male')]
print('Number of male passengers ' + str(male_passengers.shape[0]))

## Summary Statistics

In [None]:
# use .describe() to get statistics for all numeric columns
# NOTE: df contains the test rows, whose 'Survived' was filled with the -888
# placeholder, so the 'Survived' statistics shown here are not meaningful
df.describe()

In [None]:
# numerical feature : measures of central tendency for Fare
fare = df['Fare']
for label, stat in (
    ('Mean fare ', fare.mean()),      # mean
    ('Median fare ', fare.median()),  # median
):
    print(label + str(stat))

In [None]:
# dispersion measures for Fare, printed one per line as "<label><value>"
fare = df['Fare']
fare_min, fare_max = fare.min(), fare.max()
dispersion_stats = [
    ('Min fare ', fare_min),                        # minimum
    ('Max fare ', fare_max),                        # maximum
    ('Fare range ', fare_max - fare_min),           # range
    ('25 percentile ', fare.quantile(.25)),         # 25 percentile
    ('50 percentile :', fare.quantile(.5)),         # 50 percentile
    ('75 percentile :', fare.quantile(.75)),        # 75 percentile
    ('Variance fare : ', fare.var()),               # variance
    ('Standard deviation fare : ', fare.std()),     # standard deviation
]
for label, value in dispersion_stats:
    print(label + str(value))

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

In [None]:
# box-whisker plot of Fare
# the trailing ';' keeps the Axes repr out of the cell output, matching the
# other plotting cells in this notebook
df.Fare.plot(kind='box');

In [None]:
# categorical column : number of rows for each Sex value
df['Sex'].value_counts()

In [None]:
# categorical column : proportions (normalize=True returns relative frequencies instead of counts)
df['Sex'].value_counts(normalize=True)

In [None]:
# visualize counts : bar chart of the number of passengers in each Pclass
# the trailing ';' keeps the Axes repr out of the cell output, matching the
# other plotting cells in this notebook
df.Pclass.value_counts().plot(kind='bar');

In [None]:
# same bar chart with options -- title : chart title, color : bar color, rot : tick-label rotation
pclass_counts = df['Pclass'].value_counts()
pclass_counts.plot.bar(rot=0, title='Class wise passenger count', color='c');

## Distributions

In [None]:
# histogram of Age; the bins argument controls how many bins are drawn
df['Age'].plot.hist(bins=20, title='histogram for Age', color='c');

In [None]:
# kernel density estimate (kde) plot of Age
df['Age'].plot.kde(title='Density plot for Age', color='c');

In [None]:
# scatter plot for the bi-variate distribution of Age (x) and Fare (y)
df.plot(
    kind='scatter',
    x='Age',
    y='Fare',
    color='c',
    title='scatter plot : Age vs Fare',
);

In [None]:
# same scatter plot; alpha sets the marker transparency so overlapping points stay visible
df.plot(
    kind='scatter',
    x='Age',
    y='Fare',
    color='c',
    title='scatter plot : Age vs Fare',
    alpha=0.1,
);

## Grouping and Aggregations

In [None]:
# group by Sex, then take the median Age within each group
df.groupby('Sex')['Age'].median()

In [None]:
# median Fare and Age for each passenger class
# several columns must be selected with a list (double brackets); indexing a
# GroupBy with a bare tuple of keys is deprecated and fails on recent pandas
df.groupby(['Pclass'])[['Fare', 'Age']].median()

In [None]:
# median Fare for every (Pclass, Embarked) combination
df.groupby(['Pclass', 'Embarked'])['Fare'].median()

## Crosstabs

In [None]:
# crosstab : count of rows for each Sex (rows) / Pclass (columns) pair
pd.crosstab(index=df['Sex'], columns=df['Pclass'])

## Pivots

In [None]:
# pivot table : mean Age for each Sex (rows) / Pclass (columns) pair
df.pivot_table(
    values='Age',
    index='Sex',
    columns='Pclass',
    aggfunc='mean',
)