# Data Wrangling

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [4]:
kashti = sns.load_dataset("titanic")
kashti.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [5]:
#mathematic operation
(kashti['age']+1).head(10)

0    23.0
1    39.0
2    27.0
3    36.0
4    36.0
5     NaN
6    55.0
7     3.0
8    28.0
9    15.0
Name: age, dtype: float64

## Dealing with missing values
- types
    1. N/A
    2. NaN 
    3. 0
    4. Blank

In [6]:
# where exactky missing values are
kashti.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [7]:
#how to drop missing values:
kashti.dropna(subset = ['deck'], axis = 0, inplace  = True)
# inplace true means changes will be made in original dataset

In [8]:
kashti.shape

(203, 15)

In [11]:
#all missing values dropped
kashti = kashti.dropna()
kashti.isnull().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64

In [12]:
#now check how much data we are left with
kashti.shape

(182, 15)

## Data Fomratting

In [13]:
#know the datatype
kashti.dtypes

survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

In [14]:
# convert age from years to days
kashti['age'] = kashti['age']*365
kashti.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,13870.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,12775.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,19710.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1,3,female,1460.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False
11,1,1,female,21170.0,0,0,26.55,S,First,woman,False,C,Southampton,yes,True


In [15]:
# convert datatype of any column
kashti['survived'] = kashti['survived'].astype("float64")
kashti.dtypes

survived        float64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

In [18]:
#rename column afterwards
kashti.rename(columns = {"age":"age in days"}, inplace  = True )
kashti.head()

Unnamed: 0,survived,pclass,sex,age in days,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1.0,1,female,13870.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1.0,1,female,12775.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
6,0.0,1,male,19710.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1.0,3,female,1460.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False
11,1.0,1,female,21170.0,0,0,26.55,S,First,woman,False,C,Southampton,yes,True


## Data Normalization

In [20]:
kashti[["age in days","fare"]]

Unnamed: 0,age in days,fare
1,13870.0,71.2833
3,12775.0,53.1000
6,19710.0,51.8625
10,1460.0,16.7000
11,21170.0,26.5500
...,...,...
871,17155.0,52.5542
872,12045.0,5.0000
879,20440.0,83.1583
887,6935.0,30.0000


In [25]:
#simple feature scaling (values come in 0-1 range)
kashti['fare'] = kashti['fare']/kashti['fare'].max()
kashti['age in days'] = kashti['age in days']/kashti['age in days'].max()
kashti[['fare','age in days']]

Unnamed: 0,fare,age in days
1,0.139136,0.4750
3,0.103644,0.4375
6,0.101229,0.6750
10,0.032596,0.0500
11,0.051822,0.7250
...,...,...
871,0.102579,0.5875
872,0.009759,0.4125
879,0.162314,0.7000
887,0.058556,0.2375


In [26]:
#Min - Max Method  
kashti['fare'] = (kashti['fare']-kashti['fare'].min())/(kashti['age in days'].max()-kashti['age in days'].min()) 
kashti['fare']

1      0.140754
3      0.104850
6      0.102407
10     0.032975
11     0.052425
         ...   
871    0.103772
872    0.009873
879    0.164203
887    0.059237
889    0.059237
Name: fare, Length: 182, dtype: float64

In [27]:
# z- score method (range = -3 to 3)
kashti['fare'] = (kashti['fare']-kashti['fare'].mean())/kashti['fare'].std()
kashti['fare']

1     -0.099835
3     -0.337554
6     -0.353732
10    -0.813428
11    -0.684654
         ...   
871   -0.344689
872   -0.966388
879    0.055413
887   -0.639551
889   -0.639551
Name: fare, Length: 182, dtype: float64

In [30]:
#fare value already gone very minimum that's why we'll load again and try different normalization methods
ks  = sns.load_dataset('titanic')
ks.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [32]:
#log transformation
ks['fare'] = np.log(ks['fare'])
ks['fare'].head()

  result = getattr(ufunc, method)(*inputs, **kwargs)


0    0.683603
1    1.450832
2    0.727559
3    1.379314
4    0.735091
Name: fare, dtype: float64

## Binning
- grouping of values into bins

In [35]:
#dummy values to categories 
pd.get_dummies(kashti['sex'])

Unnamed: 0,female,male
1,True,False
3,True,False
6,False,True
10,True,False
11,True,False
...,...,...
871,True,False
872,False,True
879,True,False
887,True,False


In [52]:
import pandas as pd
import numpy as np

# Assuming 'age' is in fractions of a year
# If directly using the age in years, define bins within 1 year
bin_edges = [0, 0.33, 0.66, 1]  # Adjust these as needed for "Bachay", "Jawan", "Burhay"

# Define the age group labels
age_groups = ["Bachay", "Jawan", "Burhay"]

# Create the 'age_group' column using pd.cut()
kashti['age_group'] = pd.cut(kashti['age in days'], bins=bin_edges, labels=age_groups, include_lowest=True)

# Display the results
print(kashti[['age in days', 'age_group']])


     age in days age_group
1         0.4750     Jawan
3         0.4375     Jawan
6         0.6750    Burhay
10        0.0500    Bachay
11        0.7250    Burhay
..           ...       ...
871       0.5875     Jawan
872       0.4125     Jawan
879       0.7000    Burhay
887       0.2375    Bachay
889       0.3250    Bachay

[182 rows x 2 columns]
