# Data Wrangling

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Load data
kashti = sns.load_dataset('titanic')
kashti.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
# Make some copies of data for later use
ks = kashti.copy()
ks1 = kashti.copy()

In [4]:
# Simple math operation on a column
(kashti['age']+6).head(10)

0    28.0
1    44.0
2    32.0
3    41.0
4    41.0
5     NaN
6    60.0
7     8.0
8    33.0
9    20.0
Name: age, dtype: float64

## Dealing with missing values

* In a dataset missing values are either *?*, or *N/A* or *NaN*, or *0* or a blank cell.
* We say a parameter without value in a row is called missing value

> Steps to handle missing values:
1. Try if possible to collect the data for where the values are missing.
2. Drop the column (variable) where the values are missing if it doesn't effect the data or simply remove the row or data entry where the data is missing.
3. Replace the missing values:
    1. How?
        * Average value of entire variable or similar data point
        * Frequency or MODE replacement
        * Replace values based on other functions (e.g. ask data sampler who collected the data)
        * ML algorithm can also be used
        * Leave it like that (in some cases we want to know how many values are missing for analysis)
    
    2. Why?
        * Its better because no data is lost
        * Less accurate

In [5]:
# Where exactly missing values are?
kashti.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [6]:
# Check the shape of the dataset
kashti.shape

(891, 15)

In [7]:
# removes missing values from specific columns using dropna method
kashti.dropna(subset=['deck'], axis=0, inplace=True) # inplace=True modifies the dataframe

In [8]:
# Check for missing values again
kashti.isnull().sum()

survived        0
pclass          0
sex             0
age            19
sibsp           0
parch           0
fare            0
embarked        2
class           0
who             0
adult_male      0
deck            0
embark_town     2
alive           0
alone           0
dtype: int64

In [9]:
# Remove missing values from whole dataframe
kashti = kashti.dropna()

# Check the data
kashti.isnull().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64

In [10]:
# Check rows after removing missing values from whole dataframe
kashti.shape

(182, 15)

## Replace missing values with the average of that column

In [11]:
# Check the missing values in ks1
ks.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [12]:
# Find the average of 'age'
age_mean = ks['age'].mean()
age_mean

29.69911764705882

In [13]:
# Replace missing values in 'age' with its mean
ks['age'] = ks['age'].replace(np.nan, age_mean) # np.nan to use as numpy array

In [14]:
ks.isnull().sum()

survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

## Data Formating

* Data has to be in common standard format
* It ensures data is consistent and understandable
* Doing so make the data easy to gather and to work with, e.g.:
    * Faisalabad or (FSD)
    * Lahore or (LHR)
    * Islamabad or (ISB)
    * Karachi or (KCH)
    * Peshawar or (PEW)
    * Covert 'g' to 'kg' or similar unit for all
    * One standard unit in each column (ft != cm)

In [15]:
# Know the data type and convert it accordingly
kashti.dtypes

survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

In [16]:
# To convert datatype from on format to another
kashti['survived'] = kashti['survived'].astype('float32')
kashti.dtypes

survived        float32
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

In [17]:
# Covert the 'age' from years to days
ks['age'] = ks['age']*365
ks.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,8030.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,13870.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,9490.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,12775.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,12775.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [18]:
# Always rename column afterwards
ks.rename(columns={'age': 'age in days'}, inplace=True)
ks.head()

Unnamed: 0,survived,pclass,sex,age in days,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,8030.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,13870.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,9490.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,12775.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,12775.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


## Data Normalization

The benefits of data normalization are:
* Uniform the data
* They have same impact
* Also for computional efficiency

In [19]:
ks.head()

Unnamed: 0,survived,pclass,sex,age in days,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,8030.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,13870.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,9490.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,12775.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,12775.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [20]:
# Assign 'age in days' & 'fare' to a new variable
ks2 = ks[['age in days', 'fare']]
ks2.head()

Unnamed: 0,age in days,fare
0,8030.0,7.25
1,13870.0,71.2833
2,9490.0,7.925
3,12775.0,53.1
4,12775.0,8.05


* The above data is hard to compare because the values are the in wide range and we need to normalize them
* Normalization change the values to the range of 0-1 (after normalization, both variable will have similar influence on our models)

### Method of Normalization

1. Simple feature Scaling
    * x(new) = x(old) / x(max)
2. Min-Max Scaling
3. Z-score (standard score) -3 to +3
4. Log transformation

In [21]:
# Simple feature scaling
ks2['fare'] = ks2['fare']/ks2['fare'].max()
ks2['age in days'] = ks2['age in days']/ks2['age in days'].max()
ks2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ks2['fare'] = ks2['fare']/ks2['fare'].max()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ks2['age in days'] = ks2['age in days']/ks2['age in days'].max()


Unnamed: 0,age in days,fare
0,0.275,0.014151
1,0.475,0.139136
2,0.325,0.015469
3,0.4375,0.103644
4,0.4375,0.015713


In [22]:
# Make a new copy dataframe with 'age in days' & 'fare'
ks3 = ks[['age in days', 'fare']]

# Min-Max scaling
ks3['fare'] = (ks3['fare']-ks3['fare'].min()) / (ks3['fare'].max()-ks3['fare'].min())
ks3['age in days'] = (ks3['age in days']-ks3['age in days'].min()) / (ks3['age in days'].max()-ks3['age in days'].min())
ks3.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ks3['fare'] = (ks3['fare']-ks3['fare'].min()) / (ks3['fare'].max()-ks3['fare'].min())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ks3['age in days'] = (ks3['age in days']-ks3['age in days'].min()) / (ks3['age in days'].max()-ks3['age in days'].min())


Unnamed: 0,age in days,fare
0,0.271174,0.014151
1,0.472229,0.139136
2,0.321438,0.015469
3,0.434531,0.103644
4,0.434531,0.015713


In [23]:
# Make new copy
ks4 = ks[['age in days', 'fare']]

# Z-score (standard score)
ks4['fare'] = (ks4['fare']-ks4['fare'].mean())/ks4['fare'].std()
ks4['age in days'] = (ks4['age in days']-ks4['age in days'].mean())/ks4['age in days'].std()
ks4.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ks4['fare'] = (ks4['fare']-ks4['fare'].mean())/ks4['fare'].std()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ks4['age in days'] = (ks4['age in days']-ks4['age in days'].mean())/ks4['age in days'].std()


Unnamed: 0,age in days,fare
0,-0.592148,-0.502163
1,0.63843,0.786404
2,-0.284503,-0.48858
3,0.407697,0.420494
4,0.407697,-0.486064


In [24]:
# Make new copy
ks5 = ks[['age in days', 'fare']]

# Log transformation
ks5['fare'] = np.log(ks5['fare'])
ks5['age in days'] = np.log(ks5['age in days'])
ks5.head()

  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ks5['fare'] = np.log(ks5['fare'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ks5['age in days'] = np.log(ks5['age in days'])


Unnamed: 0,age in days,fare
0,8.99094,1.981001
1,9.537484,4.266662
2,9.157994,2.070022
3,9.455245,3.972177
4,9.455245,2.085672


## Binning

* Grouping of values into smaller number of values (bins)
* Convert numeric into categories (young, adult, senior) or 1-16, 17-30 etc
* To have better understanding of groups
    * Low vs mid vs high price

In [25]:
# Make bins (use ks1 dataframe)
bins = [0, 20, 40, 100]
age_groups = ['Bachay', 'Jawan', 'Boorhay']
ks1['age groups'] = pd.cut(ks1['age'], bins, labels=age_groups,
                           include_lowest=True).cat.add_categories('unknown').fillna('unknown') # fill missing value with 'unknown'
ks1.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,age groups
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,Jawan
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,Jawan
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,Jawan
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,Jawan
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,Jawan


In [26]:
ks1.head(20)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,age groups
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,Jawan
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,Jawan
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,Jawan
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,Jawan
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,Jawan
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True,unknown
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True,Boorhay
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False,Bachay
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False,Jawan
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False,Bachay


## Convert categories into dummies

* For example, convert Male-Female to (0, 1)
* Easy to use for computation

In [27]:
pd.get_dummies(ks1['sex'])

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
...,...,...
886,0,1
887,1,0
888,1,0
889,0,1
