# SLU15 - Working With Real Data: Exercises notebook

## 1 About the data

In this exercise we will clean the data from the well-known Titanic dataset.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Titanic CSV, using the 'PassengerId' column as the row index,
# and preview the first three rows.
titanic = pd.read_csv('data/titanic.csv', index_col='PassengerId')
titanic.head(n=3)

In [None]:
# Work on a copy so that `titanic` itself is left untouched.
X = titanic.copy()
# pop() removes 'Survived' from X and returns it, leaving only features in X.
y = X.pop('Survived')

The first thing we want to do is to check the dtypes of our features.

In [None]:
# List the dtype of every feature column.
X.dtypes

## 2 Cleaning the Titanic data

### Exercise 1: Scaling numerical values (graded)

As stated in the learning notebook, having variables in different scales may prevent our estimator from learning.

The first thing we will do is to scale all numerical variables.

In [None]:
from sklearn.preprocessing import MinMaxScaler

def scale_numerical_features(X, range_min=0, range_max=1):
    """
    Scales all numerical variables in the dataframe (exercise stub).

    As provided, the body copies X and then raises NotImplementedError at
    the first placeholder. Each placeholder is meant to be replaced by
    following the step-by-step comments in the body.

    Args:
        X (pd.DataFrame): a dataframe containing all features
        range_min (int or float): lower bound meant for the scaler's
            feature_range, defaults to 0
        range_max (int or float): upper bound meant for the scaler's
            feature_range, defaults to 1
    Returns:
        X_s (pd.DataFrame): a scaled copy of X, with all features
    Raises:
        NotImplementedError: while any placeholder is still in place

    """
    X_s = X.copy()
    # use select_dtypes().columns to list the numerical columns
    # cols = ... 
    # YOUR CODE HERE
    raise NotImplementedError()
    # initialize the min_max_scaler, with range_min and range_max
    # as parameters for the feature_range (see MinMaxScaler docs)
    # min_max_scaler = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    # use min_max_scaler to scale all variables
    # X_s[cols] = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    return X_s

In [None]:
"""Check that the solution is correct."""
# Called with the default range_min=0 and range_max=1.
X_scaled = scale_numerical_features(X)

# Every non-object column must have its minimum at or above 0 ...
assert all(X_scaled.select_dtypes(exclude='object').min() >= 0)
# ... and its maximum at or below 1.
assert all(X_scaled.select_dtypes(exclude='object').max() <= 1)

### Exercise 2: Binary categorical variables (graded)

Now, we want to make sure we encode the binary categorical variables. We will use `df.nunique()` to identify them.

In [None]:
# Number of distinct values in each object-dtype column; a column with
# exactly two distinct values is a binary categorical variable.
(X_scaled.select_dtypes(include='object')
         .nunique())

In [None]:
# Show how many rows carry each distinct value of 'Sex'.
X_scaled['Sex'].value_counts()

Let's save the counts for later use, while testing our solution.

In [None]:
# Counts per 'Sex' value before encoding, kept for testing the solution later.
sex_string_counts = X_scaled['Sex'].value_counts()

In [None]:
def encode_sex(X):
    """
    Converts 'Sex' from string to int, using an explicit mapping
    (exercise stub).

    As provided, the body copies X and then raises NotImplementedError at
    the first placeholder. Per the inline instructions, 'male' should be
    mapped to 1 and 'female' to 0.

    Args:
        X (pd.DataFrame): a dataframe containing all features
    Returns:
        X_e (pd.DataFrame): a copy of X, with Sex encoded as int
    Raises:
        NotImplementedError: while any placeholder is still in place

    """
    X_e = X.copy()
    # create a dictionary mapping the current values to int values
    # 'male' should be mapped to 1 and 'female' to 0
    # sex_to_int = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    # change Sex column from strings to ints, using the dictionary
    # X_e = ... or X_e['Sex'] = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    return X_e

In [None]:
"""Check that the solution is correct."""
X_encoded = encode_sex(X_scaled)
# Counts per 'Sex' value after encoding.
# NOTE(review): no assertion is visible in this cell; presumably the graded
# checks compare these counts with sex_string_counts -- confirm.
sex_int_counts = X_encoded['Sex'].value_counts()


### Exercise 3: Low-cardinality categorical variables (graded)

We will encode the `Embarked` variable as categorical with three possible, non-ordered values.

In [None]:
def encode_embarked(X):
    """
    Converts 'Embarked' to the categorical type (exercise stub).

    As provided, the body copies X and then raises NotImplementedError at
    the placeholder. Once the placeholder is replaced, the function also
    prints whether the resulting categorical is ordered and which
    categories it holds.

    Args:
        X (pd.DataFrame): a dataframe containing all features
    Returns:
        X_cat (pd.DataFrame): a copy of X, with Embarked as categorical
    Raises:
        NotImplementedError: while the placeholder is still in place
    """
    X_cat = X.copy()
    # change Embarked to type category
    # X_cat = ... or X_cat['Embarked'] = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    # the .cat accessor used below requires 'Embarked' to be categorical
    print('Is ordered: {}'.format(X_cat['Embarked'].cat.ordered))
    print('Categories: {}'.format(X_cat['Embarked'].cat.categories.values))
    return X_cat

In [None]:
"""Check that the solution is correct."""
X_cat = encode_embarked(X_encoded)

# 'Embarked' must now be stored with the pandas category dtype.
assert X_cat['Embarked'].dtype == 'category'

Please note that, despite the fact there's no order in our categories, we can call `codes` on our categorical variable.

In [None]:
# Show the integer codes behind the categories for three randomly sampled rows.
X_cat['Embarked'].cat.codes.sample(n=3)

### Exercise 4: Dummy-encoding (graded)

The higher-cardinality features we have in our dataset are, for the most part, identifiers, so we will drop them for now.

To keep a sense of people travelling together and family groups, we will keep the family name as a feature and drop the rest.

To manage dimensionality, we only keep family names that occur more than once; otherwise we mark passengers as travelling alone.

In [None]:
def dummy_encode(X):
    """
    Creates dummies for categorical features (exercise stub).

    As provided, the body copies X, adds a categorical 'Surname' column
    through extract_surname, and then raises NotImplementedError at the
    first placeholder. Per the inline instructions, the remaining steps
    drop 'Name', 'Ticket' and 'Cabin' and create dummy variables for
    'Embarked' and 'Surname'.

    Args:
        X (pd.DataFrame): a dataframe containing all features
    Returns:
        X_dummies (pd.DataFrame): a dataframe, with dummies
    Raises:
        NotImplementedError: while any placeholder is still in place
    """
    X_dummies = X.copy()
    # 'Surname' is derived from 'Name', so this runs before 'Name' is dropped
    X_dummies = extract_surname(X_dummies)
    # drop features 'Name', 'Ticket' and 'Cabin'
    # X_dummies = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    # list containing the names of the categorical columns, i.e.,
    # 'Embarked' and 'Surname'
    # cats = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    # create dummy variables for all categorical columns
    # X_dummies = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    return X_dummies

def extract_surname(X):
    """
    Adds a categorical 'Surname' column derived from 'Name'.

    The surname is the text in front of the first comma of 'Name'.
    Surnames are then filtered by keep_families_only before the column
    is converted to the category dtype.

    Args:
        X (pd.DataFrame): a dataframe containing all features,
            including 'Name'
    Returns:
        X_fam (pd.DataFrame): a copy of X, with a categorical 'Surname'
    """
    frame = X.copy()
    # non-greedy match: everything up to (not including) the first comma
    surnames = frame['Name'].str.extract('^(.+?),', expand=False)
    frame = keep_families_only(frame.assign(Surname=surnames))
    as_category = frame['Surname'].astype('category')
    return frame.assign(Surname=as_category)

def keep_families_only(X):
    """
    Keeps surnames shared by several passengers; marks the rest 'Alone'.

    A surname is kept when it occurs more than once in the 'Surname'
    column. Every other row gets 'Alone' as its surname. The input
    dataframe is not modified.

    Args:
        X (pd.DataFrame): a dataframe containing a 'Surname' column
    Returns:
        X_fam (pd.DataFrame): a copy of X, with family surnames only
    """
    result = X.copy()
    occurrences = result['Surname'].value_counts()
    # surnames that appear on at least two rows
    shared_names = occurrences.loc[occurrences > 1].index
    travels_alone = ~result['Surname'].isin(shared_names)
    result.loc[travels_alone, 'Surname'] = 'Alone'
    return result

In [None]:
"""Check that the solution is correct."""
X_dummies = dummy_encode(X_cat)

# Expected (rows, columns) after dropping columns and adding dummies.
# NOTE(review): presumably tied to the contents of data/titanic.csv and to
# the surname threshold used in keep_families_only -- confirm if either changes.
assert X_dummies.shape == (183, 43)

### Exercise 5: Ordinal variables

Finally, we will deal with `Pclass`, or passenger class.

We will have to encode it as a categorical variable with an explicit order.

In [None]:
def encode_class(X):
    """
    Encodes 'Pclass' as an ordinal feature (exercise stub).

    As provided, the body copies X, defines the category order, and then
    raises NotImplementedError at the first placeholder. Per the inline
    instructions, the remaining steps convert 'Pclass' to a category,
    apply `order` as its ordered categories, and replace the column with
    its integer codes.

    Args:
        X (pd.DataFrame): a dataframe containing all features
    Returns:
        X_class (pd.DataFrame): a dataframe, with 'Pclass' ordinal
    Raises:
        NotImplementedError: while any placeholder is still in place
    """
    X_class = X.copy()
    # category order to apply to 'Pclass'
    # NOTE(review): presumably listed from lowest to highest class -- confirm
    order = ['third', 'second', 'first']
    # change 'Pclass' dtype to category
    # X_class = ... or X_class['Pclass'] = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    # set 'Pclass' categories as ordered in 'order'
    # X_class = ... or X_class['Pclass'] = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    print('Is ordered: {}'.format(X_class['Pclass'].cat.ordered))
    print('Categories: {}'.format(X_class['Pclass'].cat.categories.values))
    # change from categories to codes using Series.cat.codes
    # X_class = ... or X_class['Pclass'] = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    return X_class

In [None]:
"""Check that the solution is correct."""
X_class = encode_class(X_dummies)
# Counts per 'Pclass' value before encoding (taken from X_dummies) ...
class_string_counts = X_dummies['Pclass'].value_counts()
# ... and after encoding (taken from X_class).
# NOTE(review): no assertion is visible in this cell; presumably the two
# count series are compared by checks not shown here -- confirm.
class_int_counts = X_class['Pclass'].value_counts()