# BLU02 - Exercises Notebook

In [25]:
import hashlib # for grading

import os
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

## 1 Read the Programs data (graded)

In this first exercise, we aim to create a single dataframe, combining all programs from all seasons.

With a caveat though: **we want to include seasons from the year 1950 onwards**.

In [2]:
def read_season(folder_path, file_name):
    """Load a single season file into a dataframe.

    Parameters
    ----------
    folder_path : str
        Folder that contains the season files.
    file_name : str
        Name of the CSV file inside ``folder_path`` (no directory part).

    Returns
    -------
    pd.DataFrame
        The content of the CSV, parsed with the pandas defaults.
    """
    season_csv = os.path.join(folder_path, file_name)
    season = pd.read_csv(season_csv)
    return season

def read_programs(folder_path):
    """Build one programs dataframe from the season files of 1950 onwards.

    Parameters
    ----------
    folder_path : str
        Folder holding the season CSV files. The file name is expected to
        start with the season's first year followed by a dash (presumably
        names like ``1950-51.csv`` -- confirm against the data folder).

    Returns
    -------
    pd.DataFrame
        All selected seasons concatenated, without the GUID column, without
        duplicated rows, and indexed by ProgramID.
    """
    files = os.listdir(folder_path)
    # Create a list with the name of all files containing programs from
    # 1950 inclusive and onwards (just the filename, no complete path.)
    # files_from_1950: List[str] = ...
    ### BEGIN SOLUTION
    # Keep only season CSVs *before* converting the year: stray entries such
    # as .DS_Store or .ipynb_checkpoints would otherwise crash int().
    # Sorting makes the row order independent of the os.listdir order.
    files_from_1950 = sorted(
        f for f in files
        if f.endswith('.csv')
        and f.split('-')[0].isdigit()
        and int(f.split('-')[0]) >= 1950
    )
    ### END SOLUTION
    # Create a list with the dataframes
    # seasons: List[pd.DataFrame] = ...
    ### BEGIN SOLUTION
    seasons = [read_season(folder_path, f) for f in files_from_1950]
    ### END SOLUTION
    # Use pd.concat to create a single dataframe.
    # programs: pd.DataFrame = ...
    ### BEGIN SOLUTION
    programs = pd.concat(seasons, axis=0, ignore_index=True)
    ### END SOLUTION
    # Drop the column GUID.
    # programs = ...
    ### BEGIN SOLUTION
    programs = programs.drop(columns='GUID')
    ### END SOLUTION
    ## Remove Duplicated lines.
    # programs = ...
    ### BEGIN SOLUTION
    programs = programs.drop_duplicates()
    ### END SOLUTION
    # Set the index to be the column ProgramID
    ### BEGIN SOLUTION
    programs = programs.set_index('ProgramID')
    ### END SOLUTION
    return programs

# Load every season from 1950 onwards; the path is relative to the notebook's working directory.
programs = read_programs(os.path.join('data', 'programs'))

In [3]:
# Seasons are compared against strings such as '1950-51', so min/max are lexicographic.
assert programs['Season'].min() == '1950-51'
assert programs['Season'].max() == '2016-17'
# ProgramID must have been moved from a column to the index.
assert programs.index.name == 'ProgramID'
assert programs.shape == (7341, 2)

## 2 Read the Concerts data (graded)

Although we list all transformations step-by-step for the sake of clarity, we expect you to use method chaining.

In [4]:
def make_concerts(file_path):
    """Read the concerts CSV and derive Date, Hour and Minute.

    Parameters
    ----------
    file_path : str
        Path to the concerts CSV. It must contain the columns GUID, Date
        and Time.

    Returns
    -------
    pd.DataFrame
        The concerts without the GUID and Time columns, without duplicated
        rows and without rows whose Time was missing. Date holds
        ``datetime.date`` objects; Hour and Minute come from Time.
    """
    # Read concerts data and drop the GUID column.
    # concerts: pd.DataFrame = ...
    ### BEGIN SOLUTION
    concerts = pd.read_csv(file_path)
    concerts = concerts.drop(columns=['GUID'])
    ### END SOLUTION
    # Remember to_datetime? We need it here to parse the columns Date and Time.
    # Use pd.to_datetime(...).dt.date for the Date, then use the same logic
    # to create the columns Hour and Minute from the Time column.
    ### BEGIN SOLUTION
    # Parse Time once and reuse it for both derived columns.
    parsed_time = pd.to_datetime(concerts['Time'])
    concerts = concerts.assign(
        Date=pd.to_datetime(concerts['Date']).dt.date,
        Hour=parsed_time.dt.hour,
        Minute=parsed_time.dt.minute,
    )
    ### END SOLUTION
    ## Remove Duplicated lines.
    ### BEGIN SOLUTION
    concerts = concerts.drop_duplicates()
    ### END SOLUTION
    ## Remove all lines with empty Time column. Then also drop the Time column.
    ### BEGIN SOLUTION
    # Rows without a Time are removed only here, after Hour/Minute were
    # derived, so those rows carried missing Hour/Minute until this point.
    concerts = concerts.dropna(subset=["Time"])
    concerts = concerts.drop("Time", axis = 1)
    ### END SOLUTION

    return concerts

# Build the cleaned concerts dataframe; the path is relative to the notebook's working directory.
concerts = make_concerts(os.path.join('data','concerts.csv'))

In [5]:
# Shape once GUID and Time are dropped and duplicates / rows without Time are removed.
assert concerts.shape == (20812, 8)
# Date must hold datetime.date objects (not Timestamps) for these comparisons.
assert concerts.Date.min() == datetime.date(1842, 12, 7)
assert concerts.Date.max() == datetime.date(2017, 7, 7)
# The most frequent start time is 20:00.
assert concerts['Hour'].mode().values[0] == 20
assert concerts['Minute'].mode().values[0] == 0
# Positional spot checks: these depend on the row order being left untouched.
assert list(concerts.iloc[1537][['Hour', 'Minute']].values) == [20,30]
assert list(concerts.iloc[1201][['Hour', 'Minute']].values) == [20,15]
assert set(concerts.columns) == set([
    'ProgramID', 'ConcertID', 'EventType', 'Location', 'Venue', 'Date', 'Hour', 'Minute'
])

## 3 Combine Programs and Concerts data (graded)

Let's combine both dataframes into a single dataset, using an inner join.

In [6]:
# Remember that you want to join on the index of one of the dataframes.
# Join only the concerts with valid ProgramIDs
# nyp = ...
### BEGIN SOLUTION
# programs is indexed by ProgramID, so the ProgramID column of concerts is
# matched against that index; the inner join keeps only concerts whose
# ProgramID exists in programs.
nyp = concerts.join(programs, on='ProgramID', how='inner')
### END SOLUTION

In [7]:
# 8 concert columns plus the 2 program columns (Orchestra, Season).
assert nyp.shape == (12943, 10)
assert set(nyp.columns) == set([
    'ProgramID', 'ConcertID', 'EventType', 'Location', 'Venue',
    'Date', 'Hour', 'Minute', 'Orchestra', 'Season'
])

## 4 Read Works and Soloists data (graded)

We will read the two remaining pieces of data. 

Again, although the steps are described one by one, we encourage you to use method chaining.

In [8]:
def read_works(file_path):
    """Read the works CSV and keep only real works with a movement.

    Parameters
    ----------
    file_path : str
        Path to the works CSV. It must provide the isInterval column and
        the six columns selected below.

    Returns
    -------
    pd.DataFrame
        Rows that are not intervals, restricted to ProgramID, WorkID,
        ComposerName, WorkTitle, Movement and ConductorName, without
        duplicates and without rows whose Movement is missing.
    """
    # Read the works data.
    # works: pd.DataFrame = ...
    ### BEGIN SOLUTION
    works = pd.read_csv(file_path)
    ### END SOLUTION
    # Remove the Intervals (attention to the values in the isInterval column).
    # works: pd.DataFrame = ...
    ### BEGIN SOLUTION
    # assumes isInterval is parsed as a boolean column -- TODO confirm
    works = works.loc[~works['isInterval']]
    ### END SOLUTION
    # Select the columns ProgramID, WorkID, ComposerName, WorkTitle, Movement and ConductorName.
    ### BEGIN SOLUTION
    selected = [
        'ProgramID', 'WorkID',
        'ComposerName', 'WorkTitle', 'Movement',
        'ConductorName',
    ]
    works = works[selected]
    ### END SOLUTION
    ## Remove Duplicated lines.
    # works: pd.DataFrame = ...
    ### BEGIN SOLUTION
    works = works.drop_duplicates()
    ### END SOLUTION
    ## Remove all lines with empty Movement column.
    # works: pd.DataFrame = ...
    ### BEGIN SOLUTION
    works = works.dropna(subset=['Movement'])
    ### END SOLUTION

    return works


def read_soloists(file_path):
    """Read the soloists CSV without its identifier columns.

    Parameters
    ----------
    file_path : str
        Path to the soloists CSV. It must contain the columns GUID and
        MovementID.

    Returns
    -------
    pd.DataFrame
        The soloists without GUID and MovementID and without duplicated rows.
    """
    # Read the soloists data and drop GUID and MovementID Columns
    ### BEGIN SOLUTION
    raw_soloists = pd.read_csv(file_path)
    soloists = raw_soloists.drop(['GUID', 'MovementID'], axis=1)
    ### END SOLUTION
    ## Remove Duplicated lines.
    # soloists: pd.DataFrame = ...
    ### BEGIN SOLUTION
    soloists = soloists.drop_duplicates()
    ### END SOLUTION
    return soloists


# Load both remaining tables; the paths are relative to the notebook's working directory.
works = read_works('data/works.csv')
soloists = read_soloists('data/soloists.csv')

In [9]:
# works: intervals removed, six selected columns, no duplicates, Movement present.
assert works.shape == (24320, 6)
assert set(works.columns) == set([
    'ProgramID', 'WorkID', 'ComposerName', 'WorkTitle', 'Movement', 'ConductorName'
])

# soloists: GUID and MovementID dropped, no duplicates.
assert soloists.shape == (50292, 5)
assert set(soloists.columns) == set([
   'ProgramID', 'WorkID', 'SoloistName', 'SoloistInstrument', 'SoloistRole'
])

## 5 Combine Works and Soloists (graded)

Like we did for Programs and Concerts, now we combine Works and Soloists.

In [10]:
# Combine both dataframes, again using an inner type of join. A work is identified by the pair
# ProgramID, WorkID
# works_and_soloists : pd.DataFrame = ....
### BEGIN SOLUTION
# Both key columns are needed: a soloist row belongs to a work only when
# WorkID and ProgramID match together.
works_and_soloists = works.merge(soloists, on=['WorkID', 'ProgramID'], how='inner')
### END SOLUTION

In [11]:
# 6 work columns + 5 soloist columns - 2 shared key columns = 9.
assert works_and_soloists.shape == (23578, 9)
assert set(works_and_soloists.columns) == set(
    [
        'ProgramID', 'WorkID', 'ComposerName', 'WorkTitle', 'Movement',
        'ConductorName', 'SoloistName', 'SoloistInstrument', 'SoloistRole'
    ]
)

## 6 Combine everything (graded)

The final goal here is to create a single dataframe.

In [12]:
# Combine works_and_soloists and nyp into a single dataframe.
# You need to figure out the common column shared between the two dataframes
# nyp_merged = ...
### BEGIN SOLUTION
# ProgramID is the only column the two dataframes share.
nyp_merged = nyp.merge(works_and_soloists, on='ProgramID', how='inner')
### END SOLUTION

In [13]:
# 10 columns from nyp + 9 from works_and_soloists - the shared ProgramID = 18.
assert nyp_merged.shape == (27725, 18)
assert set(nyp_merged.columns) == set(
    [
       'ProgramID', 'ConcertID', 'EventType', 'Location', 'Venue', 'Date',
       'Hour', 'Minute', 'Orchestra', 'Season', 'WorkID', 'ComposerName', 'WorkTitle',
       'Movement', 'ConductorName', 'SoloistName', 'SoloistInstrument',
       'SoloistRole'
    ]
)

## 7 Final transformations (graded)

Now, we perform the train-test split.

We also perform some final transformations on both datasets:

* Include some date features: Year, Month, Day and Weekday
* Create a new feature, ComposerLastName from ComposerName column. 
* Filter out rows with a location that appears fewer than 10 times in the DataFrame.
* Drop ProgramID, ConcertID, WorkID, Date and Season

In [14]:
def append_date_features(df):
    """Return a copy of ``df`` with Year, Month, Day and Weekday columns.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a Date column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pd.DataFrame
        A new dataframe (the input is left untouched) with the four date
        features appended. Weekday follows the pandas convention
        (Monday is 0).
    """
    # Use assign to create multiple datetime columns in one step
    ### BEGIN SOLUTION
    # Parse the Date column once and reuse it for every feature; assign
    # already returns a new dataframe, so no explicit copy is needed.
    dates = pd.to_datetime(df['Date'])
    df = df.assign(
        Year=dates.dt.year,
        Month=dates.dt.month,
        Day=dates.dt.day,
        Weekday=dates.dt.weekday
    )
    ### END SOLUTION
    return df

def append_composer_last_name(df):
    """Return a copy of ``df`` with a ComposerLastName column.

    The last name is everything before the first comma of ComposerName;
    a name without a comma is kept whole.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a ComposerName column of strings (missing values are
        allowed and stay missing).

    Returns
    -------
    pd.DataFrame
        A new dataframe; the input is not modified.
    """
    ### BEGIN SOLUTION
    # assign returns a new frame, so the caller's dataframe is not mutated;
    # the .str accessor propagates missing names instead of raising.
    df = df.assign(ComposerLastName=df['ComposerName'].str.split(',').str[0])
    ### END SOLUTION
    return df

def preprocess_data(df, min_location_count=10):
    """Apply the final feature engineering and filtering steps.

    Parameters
    ----------
    df : pd.DataFrame
        The merged dataset. It must contain Date, ComposerName, Location,
        ProgramID, ConcertID, WorkID and Season.
    min_location_count : int, default 10
        Rows whose Location appears fewer than this many times are removed.

    Returns
    -------
    pd.DataFrame
        A new dataframe with the date features and ComposerLastName added,
        rare locations filtered out and the identifier columns dropped.
    """
    # You should follow these exact steps:
    #   1 - Include some date features: Year, Month, Day and Weekday
    #       (Hour and Minute are not created here; they come with the input).
    #   2 - Create a new feature, ComposerLastName from ComposerName column.
    #   3 - Filter out rows that have a location that appears fewer than 10 times in the DataFrame.
    #   4 - Drop ProgramID, ConcertID, WorkID, Season, Date
    #
    ### BEGIN SOLUTION
    df = df.pipe(
                append_date_features
            ).pipe(
                append_composer_last_name
            ).groupby(
                'Location'
            ).filter(
                # keep a location only when it has enough rows
                lambda x: x.shape[0] >= min_location_count
            ).drop(
                columns=['ProgramID', 'ConcertID', 'WorkID', 'Season', 'Date']
            )
    ### END SOLUTION
    return df


# Run the full preprocessing on the merged dataset; nyp_merged itself is kept for reference.
nyp_preprocessed = preprocess_data(nyp_merged)

In [15]:
# 18 input columns + 5 new features - 5 dropped columns = 18.
assert nyp_preprocessed.shape == (27571, 18)
assert set(nyp_preprocessed.columns) == {
       'EventType', 'Location', 'Venue', 'Orchestra',
       'ComposerName', 'WorkTitle', 'Movement', 'ConductorName', 'SoloistName',
       'SoloistInstrument', 'SoloistRole', 'Year', 'Month', 'Day', 'Hour',
       'Minute', 'Weekday', 'ComposerLastName'
}
# Every remaining location must occur at least 10 times.
assert nyp_preprocessed.groupby('Location').size().min() == 10
# Spot checks on the derived ComposerLastName feature.
assert nyp_preprocessed.ComposerLastName.value_counts().loc['Mozart'] == 512
assert nyp_preprocessed.ComposerLastName.value_counts().loc['Gershwin'] == 1673
assert nyp_preprocessed.ComposerLastName.nunique() == 236

# The house prices dataset

A dataset containing several characteristics of several houses and their selling price 

* LotFrontage: Linear feet of street connected to property
* LotArea: Lot size in square feet
* OverallQual: Rates the overall material and finish of the house
       10  Very Excellent
       9	Excellent
       8	Very Good
       7	Good
       6	Above Average
       5	Average
       4	Below Average
       3	Fair
       2	Poor
       1	Very Poor
* OverallCond: Rates the overall condition of the house

       10	Very Excellent
       9	Excellent
       8	Very Good
       7	Good
       6	Above Average	
       5	Average
       4	Below Average	
       3	Fair
       2	Poor
       1	Very Poor
* MasVnrArea: Masonry veneer area in square feet
* BsmtFinSF1: Type 1 finished square feet
* BsmtUnfSF: Unfinished square feet of basement area
* TotalBsmtSF: Total square feet of basement area
* 1stFlrSF: First Floor square feet
* 2ndFlrSF: Second floor square feet
* LowQualFinSF: Low quality finished square feet (all floors)
* GrLivArea: Above grade (ground) living area square feet
* BsmtFullBath: Basement full bathrooms
* BsmtHalfBath: Basement half bathrooms
* FullBath: Full bathrooms above grade
* HalfBath: Half baths above grade
* BedroomAbvGr: Bedrooms above grade (does NOT include basement bedrooms)
* KitchenAbvGr: Kitchens above grade
* TotRmsAbvGrd: Total rooms above grade (does not include bathrooms)
* Fireplaces: Number of fireplaces
* GarageCars: Size of garage in car capacity
* GarageArea: Size of garage in square feet
* WoodDeckSF: Wood deck area in square feet
* OpenPorchSF: Open porch area in square feet
* EnclosedPorch: Enclosed porch area in square feet
* 3SsnPorch: Three season porch area in square feet
* ScreenPorch: Screen porch area in square feet
* PoolArea: Pool area in square feet
* MiscVal: $Value of miscellaneous feature 
* SellingDate: Date when the house was sold
* BuildingDate: Date when the house was built
* RemodAddDate: Remodel date (same as construction date if no remodeling or additions)
* SalePrice: The house price at the selling date (our target variable)

Let's read the csv and create our train-test-split

In [16]:
def house_price_dataset(file_path='data/housePrices.csv'):
    """Read the house prices CSV with its three date columns parsed.

    Parameters
    ----------
    file_path : str, default 'data/housePrices.csv'
        Location of the CSV. The default keeps the original behaviour of
        reading the file shipped with the notebook.

    Returns
    -------
    pd.DataFrame
        The dataset, with SellingDate, BuildingDate and RemodAddDate
        parsed as datetimes.
    """
    return pd.read_csv(
        file_path,
        parse_dates=[
            'SellingDate',
            'BuildingDate',
            'RemodAddDate'
        ]
    )

dataset = house_price_dataset()
# A fixed random_state keeps the split (and every number derived from it) reproducible.
dataset_train, dataset_test = train_test_split(dataset, random_state=0)
# Separate the target (SalePrice) from the features for both splits.
X_train = dataset_train.drop(columns='SalePrice')
y_train = dataset_train.SalePrice
X_test = dataset_test.drop(columns='SalePrice')
y_test = dataset_test.SalePrice

## 8 Build a DateTransformer transformer (graded)

There's a simple transformer that can be useful, from time to time, when modeling.

What we want is to build a transformer that transforms dates into timedeltas.

Usually, when you have features that are dates, you compute a time delta between the feature and a given reference date.

For example, imagine that your clients have a loyalty period that ends on a given date. When your model makes predictions, one feature you can use is the number of days until the end of the loyalty period, i.e. the date when the loyalty period ends minus the date when your model is running.

In the house prices dataset, the selling date will be the reference date, since we want to predict the house price at the selling date. For instance, two houses with the exact same features can vary in price if the construction year is different. So we should feed this information into the model, which means converting the other dates using our transformer.

Hint: Result should be integers

In [17]:
class DateTransformer(BaseEstimator, TransformerMixin):
    """Turn datetime columns into day differences to a reference date column.

    Each column in ``datetime_cols`` is replaced by the number of days
    between it and ``ref_date_col`` (column minus reference, so a date
    before the reference yields a negative number). The reference column
    itself is removed from the output.
    """

    # Implement the __init__ method.
    # Our DateTransformer must be able to receive two parameters: 
    # datetime_cols: a list, that contains the datetime cols that should be converted
    # ref_date_col - indicates the name of the column that should be used as reference date,
    ### BEGIN SOLUTION
    def __init__(self, datetime_cols, ref_date_col):
        # Parameters are stored unchanged under their own names.
        self.ref_date_col = ref_date_col
        self.datetime_cols = datetime_cols
    ### END SOLUTION
        
    # There's no need for a fit method in this case, it does nothing.
    # We should be able to call fit without any explicit parameters.
    # Meaning: we should be able to call transformer.fit().
    ### BEGIN SOLUTION
    def fit(self, X=None, y=None):
        """Do nothing and return the transformer itself."""
        return self
    ### END SOLUTION

    # Transform should transform all datetime columns into the difference in days to the reference date.
    # The reference date column should be dropped. 
    ### BEGIN SOLUTION
    def transform(self, X):
        """Return a copy of ``X`` with day deltas and without the reference column.

        Parameters
        ----------
        X : pd.DataFrame
            Must contain ``ref_date_col`` and every column listed in
            ``datetime_cols``, all as datetime columns.

        Returns
        -------
        pd.DataFrame
            A new dataframe; ``X`` is not modified.
        """
        X_ = X.copy()
        for col in self.datetime_cols:
            X_[col] = (X_[col] - X_[self.ref_date_col]).dt.days
        # Drop the configured reference column rather than a hard-coded name,
        # so the transformer works for any ref_date_col.
        return X_.drop(columns=self.ref_date_col)
    ### END SOLUTION

In [18]:
X_train_transformed = DateTransformer(
    datetime_cols=['BuildingDate', 'RemodAddDate'], 
    ref_date_col='SellingDate'
).fit_transform(X_train)
# Deltas are "column minus SellingDate", hence negative for houses built before the sale.
assert X_train_transformed.BuildingDate.min() == -49008
assert X_train_transformed.BuildingDate.max() == -1
# The reference column must be gone and the deltas must be integers.
assert 'SellingDate' not in X_train_transformed.columns
assert X_train_transformed.dtypes.BuildingDate == np.dtype('int64')
assert X_train_transformed.dtypes.RemodAddDate == np.dtype('int64')

You might be wondering why we have to implement it as a Transformer Class, and not using functions.
You'll understand the reason in the next section - so we can tie them all together in a `Pipeline`.

## 9 Building the pipeline (graded)

Finally, we want to use the two transformers together and run a linear regression on top.

We want to convert the dates to time deltas relative to the selling date.

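We want to bring all features to a comparable scale, using `sklearn.preprocessing.RobustScaler()` (it centers each feature on its median and scales it by the interquartile range, so outliers have less influence than with `StandardScaler`).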

We want to estimate the SalePrice using a Linear Regression.

Standardization of datasets is a common requirement for many machine learning estimators implemented in scikit-learn; they might behave badly if the individual features do not more or less look like standard normally distributed data: Gaussian with zero mean and unit variance.

In practice we often ignore the shape of the distribution and just transform the data to center it by removing the mean value of each feature, then scale it by dividing non-constant features by their standard deviation.

For instance, many elements used in the objective function of a learning algorithm (such as the RBF kernel of Support Vector Machines or the l1 and l2 regularizers of linear models) assume that all features are centered around zero and have variance in the same order. If a feature has a variance that is orders of magnitude larger than others, it might dominate the objective function and make the estimator unable to learn from other features correctly as expected.



In [24]:
# Summary statistics of the numeric training features; note how much their scales differ.
X_train.describe()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,MasVnrArea,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,...,Fireplaces,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal
count,1051.0,1051.0,1051.0,1051.0,1051.0,1051.0,1051.0,1051.0,1051.0,1051.0,...,1051.0,1051.0,1051.0,1051.0,1051.0,1051.0,1051.0,1051.0,1051.0,1051.0
mean,56.516651,10165.033302,6.095147,5.594672,102.087536,564.330162,1046.601332,1159.809705,351.509039,6.424358,...,0.627022,1.760228,470.736441,97.31589,44.084681,21.963844,3.058991,15.811608,2.394862,28.690771
std,33.228422,6319.536187,1.365732,1.120939,179.72347,442.621653,418.2101,380.318077,437.579545,52.580304,...,0.648656,0.744233,209.647682,124.988613,62.975199,60.397025,27.657847,56.470192,39.015517,190.231875
min,0.0,1491.0,1.0,2.0,0.0,0.0,0.0,334.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,40.0,7500.0,5.0,5.0,0.0,217.5,792.5,876.0,0.0,0.0,...,0.0,1.0,336.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,61.0,9505.0,6.0,5.0,0.0,463.0,990.0,1077.0,0.0,0.0,...,1.0,2.0,477.0,0.0,24.0,0.0,0.0,0.0,0.0,0.0
75%,78.0,11635.0,7.0,6.0,161.0,808.0,1282.5,1382.5,736.0,0.0,...,1.0,2.0,576.0,177.5,64.5,0.0,0.0,0.0,0.0,0.0
max,182.0,115149.0,10.0,9.0,1600.0,2336.0,3200.0,3228.0,2065.0,572.0,...,3.0,4.0,1390.0,728.0,547.0,386.0,508.0,480.0,738.0,3500.0


In [28]:
# Create a pipeline including:
#   1 - 'date_converter', DateTransformer(['BuildingDate', 'RemodAddDate'], ref_date_col='SellingDate')
#   2 - 'robust_scaler', RobustScaler() with the default parameters
#   3 - 'model', LinearRegression
# The step names matter: the grading cell looks the steps up by name.
### BEGIN SOLUTION
pipeline = Pipeline(
    [
        (
            'date_converter',
            DateTransformer(
                ['BuildingDate', 'RemodAddDate'],
                ref_date_col='SellingDate'
            )
        ),
        ('robust_scaler', RobustScaler()),
        ('model', LinearRegression()),
    ]
)
### END SOLUTION


# Fit on the training split only, then evaluate on the held-out test split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print('MAE: {}'.format(mae))

MAE: 20737.060193147765


In [32]:
# The steps are looked up by name, so the names used in the pipeline must match exactly.
assert type(pipeline) == Pipeline
assert type(pipeline.named_steps['date_converter']) == DateTransformer
assert type(pipeline.named_steps['robust_scaler']) == RobustScaler
# get_params exposes the constructor arguments of the DateTransformer step.
assert pipeline.named_steps['date_converter'].get_params()['ref_date_col'] == 'SellingDate'
assert set(
    pipeline.named_steps['date_converter'].get_params()['datetime_cols']
) == {'BuildingDate', 'RemodAddDate'}
assert type(pipeline.named_steps['model']) == LinearRegression 

## 10. Access the coefficients from the pipeline (ungraded)

Now we want to obtain the coefficients from the model, to understand which features have the most predictive power.

In [33]:
#coefs = ....
### BEGIN SOLUTION
# The fitted LinearRegression is the step named 'model'; coef_ only exists after pipeline.fit.
coefs = pipeline.named_steps['model'].coef_
### END SOLUTION

In [23]:
# One coefficient per feature that reaches the model (SellingDate is dropped by the date converter).
assert coefs.shape == (30,), 'Wrong number of coefficients. Did you select the features correctly?'

Exercises complete, congratulations! You are about to become a certified data wrangler.