# Data Wrangling

In [1]:
import numpy as numpy
import pandas as pd
import seaborn as sns

In [45]:
d = sns.load_dataset('titanic')
d1 = d
d2 = d

In [46]:
d.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [27]:
# simple mathematical operations

(d['age']+6).head()

0    28.0
1    44.0
2    32.0
3    41.0
4    41.0
Name: age, dtype: float64

In [47]:
# Dealing with missing values
d.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [29]:
# replace missing value with mean,frequency,mode
d['age'].mean()
d['deck'].mode()

0    C
Name: deck, dtype: category
Categories (7, object): ['A', 'B', 'C', 'D', 'E', 'F', 'G']

In [48]:
d.shape

(891, 15)

In [49]:
d.dropna(subset=['deck'], axis=0,inplace=True)

In [50]:
d.shape

(203, 15)

In [51]:
d.isnull().sum()

survived        0
pclass          0
sex             0
age            19
sibsp           0
parch           0
fare            0
embarked        2
class           0
who             0
adult_male      0
deck            0
embark_town     2
alive           0
alone           0
dtype: int64

In [52]:
# to drop 
d2 = d.dropna()

In [53]:
d2.shape

(182, 15)

In [54]:
d1.shape

(203, 15)

In [55]:
d1.isnull().sum()

survived        0
pclass          0
sex             0
age            19
sibsp           0
parch           0
fare            0
embarked        2
class           0
who             0
adult_male      0
deck            0
embark_town     2
alive           0
alone           0
dtype: int64

In [58]:
mean = d1['age'].mean()
mean

35.77945652173913

In [60]:
import numpy as np

In [62]:
d1['age'] = d1['age'].replace(np.nan,mean)

In [63]:
d1.isnull().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       2
class          0
who            0
adult_male     0
deck           0
embark_town    2
alive          0
alone          0
dtype: int64

In [64]:
d1.dropna(subset=['embarked'],axis=0, inplace=True)

In [66]:
d1.isnull().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64

## Data Formatting

In [67]:
# types of data 
d.dtypes

survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

In [72]:
#convert data set
d['survived'] = d['survived'].astype('float64')
d.dtypes

survived        float64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

In [79]:
d['age']=d['age']*365
d['age']=d['age'].astype('int64')

In [80]:
d.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1.0,1,female,13870,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1.0,1,female,12775,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
6,0.0,1,male,19710,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1.0,3,female,1460,1,1,16.7,S,Third,child,False,G,Southampton,yes,False
11,1.0,1,female,21170,0,0,26.55,S,First,woman,False,C,Southampton,yes,True


In [82]:
d.rename(columns={'age':'age(days)'}, inplace=True)

In [83]:
d.head()

Unnamed: 0,survived,pclass,sex,age(days),sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1.0,1,female,13870,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1.0,1,female,12775,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
6,0.0,1,male,19710,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1.0,3,female,1460,1,1,16.7,S,Third,child,False,G,Southampton,yes,False
11,1.0,1,female,21170,0,0,26.55,S,First,woman,False,C,Southampton,yes,True


## Data Normalization

In [84]:
d.columns

Index(['survived', 'pclass', 'sex', 'age(days)', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [85]:
d3=d[['age(days)','fare']]

In [86]:
d3.head()

Unnamed: 0,age(days),fare
1,13870,71.2833
3,12775,53.1
6,19710,51.8625
10,1460,16.7
11,21170,26.55


## Methods of Normalization
1- Simple feature scaling

   * x(new) = x(old)/x(max)/
  
2-Min-Max method/

3- z-score(-3 to 3)/

4- Log transformation

In [87]:
# simple feature scalling
d3['fare'] = d3['fare']/d3['fare'].max()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d3['fare'] = d3['fare']/d3['fare'].max()


In [88]:
d3.head()

Unnamed: 0,age(days),fare
1,13870,0.139136
3,12775,0.103644
6,19710,0.101229
10,1460,0.032596
11,21170,0.051822


In [89]:
d3['age(days)']= d3['age(days)']/d3['age(days)'].max()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d3['age(days)']= d3['age(days)']/d3['age(days)'].max()


In [90]:
d3.head()

Unnamed: 0,age(days),fare
1,0.475,0.139136
3,0.4375,0.103644
6,0.675,0.101229
10,0.05,0.032596
11,0.725,0.051822


In [92]:

d4 = d1.copy()



In [93]:
d4.head()

Unnamed: 0,survived,pclass,sex,age(days),sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1.0,1,female,13870,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1.0,1,female,12775,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
6,0.0,1,male,19710,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1.0,3,female,1460,1,1,16.7,S,Third,child,False,G,Southampton,yes,False
11,1.0,1,female,21170,0,0,26.55,S,First,woman,False,C,Southampton,yes,True


In [94]:


d4 = d4[['age(days)','fare']]



In [95]:
d4.head()

Unnamed: 0,age(days),fare
1,13870,71.2833
3,12775,53.1
6,19710,51.8625
10,1460,16.7
11,21170,26.55


In [97]:
#min-max method
# x = x - x(min) / x(max)-x(min)
d4['fare']=(d4['fare']-d4['fare'].min())/(d4['fare'].max()-d4['fare'].min())



In [98]:
d4.head()

Unnamed: 0,age(days),fare
1,13870,0.139136
3,12775,0.103644
6,19710,0.101229
10,1460,0.032596
11,21170,0.051822


In [99]:
# z-score(standard score)
# x = (x-x(mean))/x.std
d4['age(days)']=(d4['age(days)']-d4['age(days)'].mean())/d4['age(days)'].std()

In [100]:
d4.head()

Unnamed: 0,age(days),fare
1,0.158444,0.139136
3,-0.04278,0.103644
6,1.231641,0.101229
10,-2.122098,0.032596
11,1.49994,0.051822


In [101]:
ds = sns.load_dataset('titanic')

In [102]:
# log transformation
ds = ds[['age','fare']]

In [103]:
ds.head()

Unnamed: 0,age,fare
0,22.0,7.25
1,38.0,71.2833
2,26.0,7.925
3,35.0,53.1
4,35.0,8.05


In [104]:
ds['fare']=np.log(ds['fare'])

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [105]:
ds.head()

Unnamed: 0,age,fare
0,22.0,1.981001
1,38.0,4.266662
2,26.0,2.070022
3,35.0,3.972177
4,35.0,2.085672


## Binning

In [121]:

d.shape
d['age'].unique()

array([22.  , 38.  , 26.  , 35.  ,   nan, 54.  ,  2.  , 27.  , 14.  ,
        4.  , 58.  , 20.  , 39.  , 55.  , 31.  , 34.  , 15.  , 28.  ,
        8.  , 19.  , 40.  , 66.  , 42.  , 21.  , 18.  ,  3.  ,  7.  ,
       49.  , 29.  , 65.  , 28.5 ,  5.  , 11.  , 45.  , 17.  , 32.  ,
       16.  , 25.  ,  0.83, 30.  , 33.  , 23.  , 24.  , 46.  , 59.  ,
       71.  , 37.  , 47.  , 14.5 , 70.5 , 32.5 , 12.  ,  9.  , 36.5 ,
       51.  , 55.5 , 40.5 , 44.  ,  1.  , 61.  , 56.  , 50.  , 36.  ,
       45.5 , 20.5 , 62.  , 41.  , 52.  , 63.  , 23.5 ,  0.92, 43.  ,
       60.  , 10.  , 64.  , 13.  , 48.  ,  0.75, 53.  , 57.  , 80.  ,
       70.  , 24.5 ,  6.  ,  0.67, 30.5 ,  0.42, 34.5 , 74.  ])

In [None]:
bins = np.linspace(min(d['age']),max(d['age']), 15000)

age_groups = ['bachay','jawan','booray']

d['age'] = pd.cut(d['age'],bins,labels=age_groups, include_lowest=True)



In [112]:
#get_dummies


d = pd.get_dummies(d['sex'])

In [113]:
d.head()

Unnamed: 0,female,male
1,1,0
3,1,0
6,0,1
10,1,0
11,1,0
