## Preliminary analysis

In [12]:
import pandas as pd

# Relative path to the Titanic passenger CSV.
url_data = "../datasets/titanic/titanic3.csv"
df = pd.read_csv(url_data)

# Print a quick overview of the frame, one item per line:
#   - first rows
#   - last rows
#   - shape (dimensions)
#   - descriptive statistics from describe()
#   - column names
print(
    df.head(),
    df.tail(),
    df.shape,
    df.describe(),
    df.columns.values,
    sep="\n",
)

   pclass  survived                                             name     sex  \
0       1         1                    Allen, Miss. Elisabeth Walton  female   
1       1         1                   Allison, Master. Hudson Trevor    male   
2       1         0                     Allison, Miss. Helen Loraine  female   
3       1         0             Allison, Mr. Hudson Joshua Creighton    male   
4       1         0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female   

       age  sibsp  parch  ticket      fare    cabin embarked boat   body  \
0  29.0000      0      0   24160  211.3375       B5        S    2    NaN   
1   0.9167      1      2  113781  151.5500  C22 C26        S   11    NaN   
2   2.0000      1      2  113781  151.5500  C22 C26        S  NaN    NaN   
3  30.0000      1      2  113781  151.5500  C22 C26        S  NaN  135.0   
4  25.0000      1      2  113781  151.5500  C22 C26        S  NaN    NaN   

                         home.dest  
0                     St 

#### Counting null and non-null values in a column

In [13]:
# Number of missing (NaN) entries in the "body" column.
df["body"].isna().values.sum()

1188

In [14]:
# Number of present (non-NaN) entries in the "body" column.
df["body"].notna().values.sum()

121

### Reasons for missing data

- Extraction of data
- Collection of data
   
### Methods for handling missing data

In [15]:
# Catalogue of ways to handle NaN values. None of these calls passes
# inplace=True, so each one returns a new object and `df` itself is left
# unchanged; only the result of #9 is kept (in `a`).

# 1. Drop rows in which all columns are NaN
df.dropna(axis=0, how="all")

# 2. Drop rows in which at least one column is NaN
df.dropna(axis=0, how="any")

# 3. Drop columns in which all values are NaN
df.dropna(axis=1, how="all")

# 4. Drop columns in which at least one value is NaN
df.dropna(axis=1, how="any")

# 5. Replace every NaN in the DataFrame with a new value
df.fillna(0)

# 6. Replace every NaN in a single column (Series) with a new value
df["body"].fillna(0)

# 7. Replace NaN values with the mean of the column
df["age"].fillna(df["age"].mean())

# 8. Forward fill: each NaN takes the previous known value.
#    fillna(method="ffill") is deprecated in recent pandas; ffill() is the
#    direct equivalent.
df["age"].ffill()

# 9. Backward fill: each NaN takes the next known value.
#    bfill() replaces the deprecated fillna(method="bfill").
a = df["age"].bfill()

28.5