# Data Acquisition

In [None]:
import pandas as pd

In [None]:
# Load the Titanic passenger dataset into a DataFrame.
# The file location is kept in a named constant so it can be changed in one
# place (the path looks like a mounted Google Drive folder in Colab; adjust
# it when running elsewhere).
TITANIC_CSV_PATH = '/content/drive/MyDrive/CS240/Titanic.csv'
df_titanic = pd.read_csv(TITANIC_CSV_PATH)

In [None]:
# Preview the first rows of the raw dataset to check that it loaded as expected
df_titanic.head()

Unnamed: 0,index,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [None]:
# Show the dataset dimensions as (number of rows, number of columns)
df_titanic.shape

(891, 16)

**Data Description**
* index: Passenger index
* survived: Survival status (0 = No, 1 = Yes)
* pclass: Passenger class (1 = 1st class, 2 = 2nd class, 3 = 3rd class)
* sex: Passenger’s gender
* age: Passenger’s age
* sibsp: Number of siblings/spouses aboard
* parch: Number of parents/children aboard
* fare: Fare paid for the ticket
* embarked: Port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)
* class: Passenger class as text, equivalent to pclass (First, Second, Third)
* who: Passenger’s category (man, woman, child)
* adult_male: Whether the passenger is an adult male or not (True or False)
* deck: Cabin deck
* embark_town: Port of embarkation (Cherbourg, Queenstown, Southampton)
* alive: Survival status (yes or no)
* alone: Whether the passenger is alone or not (True or False)

# Data Preparation

In [None]:
# Build a copy of the dataset without the redundant 'index' column;
# df_titanic itself is left unchanged.
df_rm_index = df_titanic.drop(columns=['index'])

In [None]:
# Preview the first rows to confirm the 'index' column is gone
df_rm_index.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


# Checking and Handling Duplicate Data

In [None]:
# Show every row that exactly repeats an earlier row
df_rm_index.loc[df_rm_index.duplicated()]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
47,1,3,female,,0,0,7.7500,Q,Third,woman,False,,Queenstown,yes,True
76,0,3,male,,0,0,7.8958,S,Third,man,True,,Southampton,no,True
77,0,3,male,,0,0,8.0500,S,Third,man,True,,Southampton,no,True
87,0,3,male,,0,0,8.0500,S,Third,man,True,,Southampton,no,True
95,0,3,male,,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
870,0,3,male,26.0,0,0,7.8958,S,Third,man,True,,Southampton,no,True
877,0,3,male,19.0,0,0,7.8958,S,Third,man,True,,Southampton,no,True
878,0,3,male,,0,0,7.8958,S,Third,man,True,,Southampton,no,True
884,0,3,male,25.0,0,0,7.0500,S,Third,man,True,,Southampton,no,True


In [None]:
# Count the rows flagged as duplicates of another row
df_rm_index.duplicated().sum()

107

In [None]:
# Remove duplicated rows, keeping one copy of each.
# The result is rebound to the same name instead of using inplace=True, so
# the step reads as an explicit assignment and stays safe to re-run.
df_rm_index = df_rm_index.drop_duplicates()

# Checking and Handling Inconsistent Datatypes

In [None]:
# List the distinct labels used in the 'sex' column
df_rm_index['sex'].unique()

array(['male', 'female'], dtype=object)

In [None]:
# Print the distinct values of every column to spot inconsistent entries.
# The loop variable holds a column name (not a value of 'sex'), so it is
# named accordingly.
for column in df_rm_index.columns:
    print(f'{column}: {df_rm_index[column].unique()}')

survived: [0 1]
pclass: [3 1 2]
sex: ['male' 'female']
age: [22.   38.   26.   35.     nan 54.    2.   27.   14.    4.   58.   20.
 39.   55.   31.   34.   15.   28.    8.   19.   40.   66.   42.   21.
 18.    3.    7.   49.   29.   65.   28.5   5.   11.   45.   17.   32.
 16.   25.    0.83 30.   33.   23.   24.   46.   59.   71.   37.   47.
 14.5  70.5  32.5  12.    9.   36.5  51.   55.5  40.5  44.    1.   61.
 56.   50.   36.   45.5  20.5  62.   41.   52.   63.   23.5   0.92 43.
 60.   10.   64.   13.   48.    0.75 53.   57.   80.   70.   24.5   6.
  0.67 30.5   0.42 34.5  74.  ]
sibsp: [1 0 3 4 2 5 8]
parch: [0 1 2 5 3 4 6]
fare: [  7.25    71.2833   7.925   53.1      8.05     8.4583  51.8625  21.075
  11.1333  30.0708  16.7     26.55    31.275    7.8542  16.      29.125
  13.      18.       7.225   26.       8.0292  35.5     31.3875 263.
   7.8792   7.8958  27.7208 146.5208   7.75    10.5     82.1708  52.
   7.2292  11.2417   9.475   21.      41.5792  15.5     21.6792  17.8
  39.68

In [None]:
 # Recode the 'sex' labels to single-letter codes in one pass:
 # 'male' -> 'M', 'female' -> 'F'
 df_rm_index['sex'] = df_rm_index['sex'].replace({'male': 'M', 'female': 'F'})

In [None]:
# Display the frame to confirm the 'sex' column now holds the recoded labels
df_rm_index

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,M,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,F,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,F,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,F,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,M,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,F,39.0,0,5,29.1250,Q,Third,woman,False,,Queenstown,no,False
887,1,1,F,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,F,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,M,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True
