In [None]:
#imports
import pandas as pd
import numpy as np
import os

### Import data

In [None]:
# Locate the raw CSV files relative to the notebook's parent directory.
raw_data_path = os.path.join(os.path.pardir, 'data', 'raw')
train_data_path, test_data_path = (
    os.path.join(raw_data_path, csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Load both splits, using the PassengerId column as the row index.
train_df, test_df = (
    pd.read_csv(csv_path, index_col='PassengerId')
    for csv_path in (train_data_path, test_data_path)
)

In [None]:
#type
type(train_df)

In [None]:
#use .info() to get brief information about dataframe
train_df.info()

In [None]:
test_df.info()

In [None]:
test_df['Survived'] = -888 #adding survived with default value

In [None]:
df = pd.concat((train_df,test_df),axis=0)

In [None]:
df.info()

In [None]:
# GET the first five rows
df.head()

In [None]:
# head(n) returns the first n rows; here the top 10
df.head(10)

In [None]:
# use tail to get last 5rows
df.tail()

In [None]:
df['Name']

In [None]:
# multiple columns
df[['Name','Age']]

In [None]:
# indexing: use loc for label based indexing
# rows whose index labels run from 5 through 10, all columns
df.loc[5:10]

In [None]:
# selecting a column range by label
# NOTE(review): a label slice follows the frame's column order, so this only
# returns columns if 'Age' is positioned before 'Pclass' in df — TODO confirm
df.loc[5:10,'Age':'Pclass']

In [None]:
# discrete columns: pass a list of column labels
df.loc[5:10,['Age','Pclass']]

In [None]:
# indexing: use iloc for position based indexing
# row positions 5-9 and column positions 3-4 (end positions are excluded)
df.iloc[5:10, 3:5]

In [None]:
# filtering: keep only the rows whose Sex value is 'male'
male_passanger = df[df['Sex'] == 'male']
print(male_passanger)

### Summary statistics

#### Numeric
##### Centrality Measure
- mean/average
- median

##### Dispersion measure (range, percentile, variance, standard deviation)
- How values are spread from central value
- variance
- standard deviation
- Range
- percentile

#### Categorical
  - Total count
  - unique count
  - Category count and proportions


In [None]:
# use describe to get statistic for all numeric columns
df.describe()

In [None]:
# numerical centrality
print("Mean Fare",df.Fare.mean())
print("Median Fare", df.Fare.median())

In [None]:
# dispersion measures for Fare: range, variance, quartiles and standard deviation
print("Min Fare",df.Fare.min())
print("Max Fare", df.Fare.max())
print("Variance Fare",df.Fare.var())
print("25 Percentile", df.Fare.quantile(.25))
print("50 Percentile", df.Fare.quantile(.5))
print("75 Percentile", df.Fare.quantile(.75))
# std() is the standard deviation (the old label read "Standard variation")
print("Standard deviation", df.Fare.std())

In [None]:
%matplotlib inline
# import matplotlib.pyplot as plt

In [None]:
df.Fare.plot(kind='box')
# df.Fare

In [None]:
# categorical
df.describe(include='all')

In [None]:
df.Sex.value_counts()

In [None]:
df.Sex.value_counts(normalize=True)

In [None]:
df.Pclass.value_counts().plot(kind='bar',rot=0,title='Class wise passenger count',color='#d23f2c')

## Exploratory data analysis

### Distributions
#### Univariate (Distribution of one feature at a time)
- Histograms
- Kernel Density Estimation(KDE) Plot

#### Bivariate (Distribution of two features at a time)
- Scatter Plot

### Basic Grouping and Aggregation
### Cross tabs and pivots

## Univariate distribution

In [None]:
df.Age.plot(kind='hist', title='histogram for Age', color='c')

In [None]:
# kernel density estimate of Age (the title previously called it a histogram)
df.Age.plot(kind='kde', title='KDE plot for Age', color='c')

In [None]:
# histogram of Fare (the title previously named Age)
df.Fare.plot(kind='hist', title='histogram for Fare', color='c')

In [None]:
print("age skewness : {0:.2f}".format(df.Age.skew()))
print("fare skewness : {0:.2f}".format(df.Fare.skew()))

## Bivariate Distribution
- Using scatter plot

In [None]:
df.plot.scatter(x='Age',y='Fare',color='c',title='scatter plot:Age vs Fare')

In [None]:
df.plot.scatter(x='Age',y='Fare',color='c',title='scatter plot:Age vs Fare',alpha=0.1)

In [None]:
df.plot.scatter(x='Pclass',y='Fare',color='c',title='scatter plot:Pclass vs Fare',alpha=0.1)

## Grouping/Aggregation

In [None]:
df.groupby('Sex').Age.median()

In [None]:
df.groupby('Pclass').Fare.median()

In [None]:
df.groupby('Sex').Age.median()

In [None]:
# select several columns with a list; the bare 'Fare','Age' tuple form is
# rejected by pandas >= 1.0
df.groupby(['Pclass'])[['Fare','Age']].median()

In [None]:
df.groupby(['Pclass']).agg({'Fare':'mean','Age':'median'})

In [None]:
# Per-column aggregation spec for DataFrameGroupBy.agg.
# Each column maps to a list of (output_name, function) pairs; the nested-dict
# renamer form ({'Fare': {'mean_fare': 'mean'}}) raises SpecificationError in
# pandas >= 1.0. The result has two-level columns: (source column, output name).
aggregation = {
    'Fare': [
        ('mean_fare', 'mean'),
        ('median_fare', 'median'),
    ],
    'Age': [
        ('mean_age', 'mean'),
        ('min_age', 'min'),
    ],
}

In [None]:
df.groupby(['Pclass']).agg(aggregation)

### CrossTabs

In [None]:
pd.crosstab(df.Sex,df.Pclass)

In [None]:
pd.crosstab(df.Sex,df.Pclass).plot(kind='bar')

### Pivot Table

In [None]:
df.pivot_table(index='Sex',columns='Pclass', values='Age', aggfunc='mean')

In [None]:
df.groupby(['Sex','Pclass']).Age.mean()

In [None]:
df.groupby(['Sex','Pclass']).Age.mean().unstack()

## Exploring and processing data III
### Data Munging 
 - involves activities of looking into potential issues
   in data and solving them using appropriate techniques
 - Treating Missing values
 - Working with outliers
#### Data issues
- Missing values 
- Extreme values
- Erroneous Values
### Feature Engineering
 - Derived features
 - Categorical Feature encoding
### Advanced visualization


### Data Munging: Working with Missing Values

In [None]:
df.info()

In [None]:
df[df.Embarked.isnull()]

In [None]:
df.Embarked.value_counts()

In [None]:
# Survived vs. Embarked, leaving out the rows that carry the -888 placeholder
labelled = df[df.Survived != -888]
pd.crosstab(labelled.Survived, labelled.Embarked)

In [None]:
# df.loc[df.Embarked.isnull(),'Embarked'] = 'S'

In [None]:
df.groupby(['Pclass','Embarked']).Fare.median()

In [None]:
# assign the filled column back: fillna(inplace=True) on df.Embarked is a
# chained assignment that does not update df under pandas Copy-on-Write
df['Embarked'] = df['Embarked'].fillna('C')

In [None]:
df[df.Embarked.isnull()]

In [None]:
df.info()

### Treating missing values of Fare value

In [None]:
# checking all columns where fare is Null
df[df.Fare.isnull()]

In [None]:
# median fare among rows with Pclass 3 and Embarked 'S'
third_class_from_s = (df.Pclass == 3) & (df.Embarked == 'S')
median_fare = df.loc[third_class_from_s, 'Fare'].median()
print(median_fare)

In [None]:
# assign the filled column back: fillna(inplace=True) on df.Fare is a
# chained assignment that does not update df under pandas Copy-on-Write
df['Fare'] = df['Fare'].fillna(median_fare)

In [None]:
df.info()

### Treating missing values of Age value

#### Option 1

In [None]:
# set maxmum number of rows to be diaplyed
pd.options.display.max_rows =15 

In [None]:
# get all columns where age is null
df[df.Age.isnull()]

In [None]:
# plotting the Age distribution
# colour shorthand must be lowercase: uppercase single-letter colours such as
# 'C' are rejected by current Matplotlib
df.Age.plot(kind='hist',bins=20,color='c')

In [None]:
# mean 
# demerits mean can be easlity affected by extream values
df.Age.mean()

#### Option 2

In [None]:
# mean of ages according to gender
df.groupby('Sex').Age.mean()

In [None]:
# visualize using BOXplot
df[df.Age.notnull()].boxplot('Age','Sex')

In [None]:
# replace with meadian age
# meadian_age=df.groupby('Sex').Age.transform('median')
# df.Age.fillna(meadian_age,inplace=True)
# df.info()

#### Option 3
- Replace with the median age of the title

In [None]:
# get all names
df.Name

In [None]:
def extract_title(name):
    """Return the lower-cased title embedded in ``name``.

    The title is the text between the first comma and the next period,
    with surrounding whitespace removed.
    """
    after_comma = name.split(',')[1]
    raw_title, _, _ = after_comma.partition('.')
    return raw_title.strip().lower()

In [None]:
# Use function to apply the functionon each Name Value
df.Name.map(lambda x: extract_title(x))

In [None]:
# unique title
df.Name.map(lambda x: extract_title(x)).unique()

In [None]:
def extract_title(name):
    """Return the grouped title for a passenger ``name``.

    The raw title is the text between the first comma and the next period,
    stripped and lower-cased; it is then mapped to one of the groups
    'Mr', 'Mrs', 'Miss', 'Master', 'Sir', 'Lady' or 'Officer'.

    Raises:
        IndexError: if ``name`` contains no comma.
        KeyError: if the raw title is not a key of the mapping.
    """
    title_group = {
        'mr':'Mr',
        'mrs':'Mrs',
        # 'miss' must stay its own group: it was mapped to 'Mrs', which merged
        # the two groups and defeated later checks against the 'Miss' title
        'miss':'Miss',
        'master':'Master',
        'don':'Sir',
        'rev':"Sir",
        'dr':'Officer',
        'mme':'Mrs',
        'ms':'Mrs',
        'major': 'Officer',
        'lady':'Lady',
        'sir':'Sir',
        'mlle':'Miss',
        'col':'Officer',
        'capt':'Officer',
        'the countess':'Lady',
        'jonkheer': 'Sir',
        'dona': 'Lady'
    }
    first_name_with_title = name.split(',')[1]
    title = first_name_with_title.split('.')[0]
    title=title.strip().lower()
    return title_group[title]

In [None]:
# derive the grouped title of every passenger from the Name column
df['Title'] = df['Name'].map(extract_title)

In [None]:
df.head()

In [None]:
df[df.Age.notnull()].boxplot('Age','Title')

In [None]:
# per-row median Age of the row's Title group, aligned to df's index
title_age_medium = df.groupby('Title').Age.transform('median')
# assign the filled column back: fillna(inplace=True) on df.Age is a chained
# assignment that does not update df under pandas Copy-on-Write
df['Age'] = df['Age'].fillna(title_age_medium)

In [None]:
df.info()

In [None]:
df.head()

### Working with Outliers
- Something that is significantly different from common behavior
#### Outlier detection
- Histogram
- Boxplot
- Scatter plot
#### Solution
- Transformation
- Remove
- Binning - create bins and put your values in bins
- imputation

### Detecting and treating outliers

### Age

In [None]:
df.Age.plot(kind='hist',rwidth=0.9,title='histogram for detecting outliers on Age', color='c');

In [None]:
# passengers where Age>=70
df.loc[df.Age>=70]

#### Fare

In [None]:
# histogram of Fare (the title previously named Age)
df.Fare.plot(kind='hist',rwidth=0.9,title='histogram for detecting outliers on Fare', color='c');

In [None]:
df.loc[df.Fare>=400]

In [None]:
df.Fare.plot(kind='box');

In [None]:
# tranformation to reduce skweness
log_fare = np.log(df.Fare + 1.0)

In [None]:
# histogram of the log-transformed fare; the colour shorthand must be
# lowercase because current Matplotlib rejects uppercase 'C'
log_fare.plot(kind='hist',rwidth=0.9, bins=20, color='c')

In [None]:
# Binning
pd.qcut(df.Fare, 4)

In [None]:
# labling our bin
pd.qcut(df.Fare, 4,labels= ['very_low','low','high','very_high']) #discritiztion

In [None]:
# ploting on bar graph
pd.qcut(df.Fare, 4,labels= ['very_low','low','high','very_high']).value_counts().plot(kind='bar', rot=0)

In [None]:
df['Fare_bin']=pd.qcut(df.Fare, 4,labels= ['very_low','low','high','very_high'])

In [None]:
df.head()

### Feature Engineering
- Process of transforming raw data into more representative
  features in order to create better predictive models

In [None]:
df['AgeState']= np.where(df.Age>=18,'Adult','Child')

In [None]:
df.head()

In [None]:
df['AgeState'].value_counts()

In [None]:
# crosstabs for survival state
pd.crosstab(df[df.Survived !=-888].Survived,df[df.Survived !=-888].AgeState)

### Family size

In [None]:
df.head()

In [None]:
df['FamilySize']=df.Parch + df.SibSp +1

In [None]:
# distribution of the FamilySize feature created in the previous cell
# (this cell plotted Fare before, and used the invalid uppercase colour 'C')
df.FamilySize.plot(kind='hist',color='c');

In [None]:
df.loc[df.FamilySize == df.FamilySize.max()]

In [None]:
pd.crosstab(df[df.Survived !=-888].Survived,df[df.Survived !=-888].FamilySize)

### Mother Hood

In [None]:
# 1 for female passengers over 18 with Parch > 0 whose Title is not 'Miss', else 0
is_mother = (
    (df.Sex == 'female')
    & (df.Parch > 0)
    & (df.Age > 18)
    & (df.Title != 'Miss')
)
df['IsMother'] = np.where(is_mother, 1, 0)

In [None]:
df.tail()

In [None]:
pd.crosstab(df[df.Survived != -888].Survived, df[df.Survived != -888].IsMother)

### Categorical Feature encoding
- Process of taking some categorical features and converting them to numerical form
#### Binary encoding
#### Label encoding
#### One hot encoding

In [None]:
df.info()

In [None]:
df.Cabin

In [None]:
df.head()

In [None]:
df.Cabin.unique()

In [None]:
df.loc[df.Cabin=='T']

In [None]:
# treat the cabin value 'T' as missing; np.nan is the supported spelling
# (the np.NaN alias was removed in NumPy 2.0)
df.loc[df.Cabin=='T', 'Cabin'] = np.nan

In [None]:
df.loc[df.Cabin=='T']

In [None]:
#Extract the first character of the Cabin string as the deck letter
def get_deck(cabin):
    """Return the upper-cased first character of ``cabin``, or 'Z' when it is missing.

    Returns a plain ``str``; the earlier ``np.where`` version returned a
    zero-dimensional NumPy array. A missing (null) or empty value yields 'Z'.
    """
    if pd.isnull(cabin):
        return 'Z'
    cabin_text = str(cabin)
    # an empty string has no first character; treat it as an unknown deck
    return cabin_text[0].upper() if cabin_text else 'Z'
# apply get_deck to every Cabin value to build the Deck column
df['Deck'] = df.Cabin.map(get_deck)

In [None]:
df.tail()

In [None]:
df.Deck.value_counts()

In [None]:
pd.crosstab(df[df.Survived !=-888].Survived,df[df.Survived!=-888].Deck)

### Categorical Encoding

In [None]:
# binary encoding of Sex: 1 where Sex is 'male', 0 otherwise
df['IsMale'] = np.where(df.Sex == 'male',1,0)

In [None]:
df.head()

In [None]:
# df['IsAdult'] = np.where(df.Age > 18,1,0)

In [None]:
# df.tail()

In [None]:
df=pd.get_dummies(df,columns=['Deck','Pclass','Title','Fare_bin','Embarked','AgeState'])

In [None]:
df.info()

In [None]:
df.Ticket.value_counts()

### Drop and reorder columns

In [None]:
# drop columns
df.drop(['Cabin','Name','Ticket','Parch','SibSp','Sex'],axis=1,inplace=True)

In [None]:
# move Survived to the front, keeping the other columns in their current order
columns = ['Survived'] + [column for column in df.columns if column != 'Survived']
df = df[columns]
df.info()

### Save processed dataset

In [None]:
# Output locations for the processed train/test CSV files.
processed_data_set = os.path.join(os.path.pardir, 'data', 'processed')
write_train_data, write_test_data = (
    os.path.join(processed_data_set, csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# make sure the output directory exists; to_csv does not create it
os.makedirs(processed_data_set, exist_ok=True)
#train data: rows that carry a real Survived value
df.loc[df.Survived !=-888].to_csv(write_train_data)
#test data: rows with the -888 placeholder, written without the Survived column
columns = [column for column in df.columns if column != 'Survived']
df.loc[df.Survived ==-888,columns].to_csv(write_test_data)

In [None]:
df.info()