# BLU02 - Exercise Notebook

In [None]:
import hashlib
import json

import os
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, FunctionTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

### Movies dataset
In the first part of the exercise notebook, we'll be working with data about movies. We have data about Oscar nominees from 1928 to 2023 in the folder `data/oscars` and data about movies from IMDB in the file `data/imdb_top_1000.csv`. We'll also use the files `data/gross_top_200.csv` and `data/budget_top_500.csv` with movie revenue and budget data.

## Exercise 1 - Read the Oscars data

Complete the function `read_nominees` which should:
- read all the Oscar nominees files from the year 1960 onwards from the provided folder path (e.g. the `data/oscars` folder)
- combine all the information into one dataframe called `nominees`
- drop the `ceremony` column
- remove the missing values
- return the resulting dataframe `nominees`.

You can use the provided helper function `read_year` which reads one file given the path and the file name.

In [None]:
def read_year(folder_path, file_name):
    """Read a single nominees CSV file into a dataframe.

    The file location is built by joining ``folder_path`` and ``file_name``;
    the first column of the CSV is used as the dataframe index.
    """
    csv_path = os.path.join(folder_path, file_name)
    year_frame = pd.read_csv(csv_path, index_col=0)
    return year_frame

def read_nominees(folder_path):
    """Exercise 1 stub (not implemented yet).

    Intended behaviour, per the exercise text: read every Oscar nominees
    file from the year 1960 onwards found in ``folder_path``, combine them
    into one dataframe called ``nominees``, drop the ``ceremony`` column,
    remove the missing values and return ``nominees``.

    Until the placeholder is replaced, calling this raises
    ``NotImplementedError``.
    """
    # YOUR CODE HERE
    raise NotImplementedError()
    return nominees

# Build the nominees dataframe from the `data/oscars` folder.
nominees = read_nominees(os.path.join('data', 'oscars'))

In [None]:
# Exercise 1 checks: result type, shape, ceremony-year range and no missing values.
assert isinstance(nominees, pd.DataFrame), 'The result should be a dataframe.'
assert nominees.shape == (7116,6), 'The shape of nominees is not correct.'
assert nominees['year_ceremony'].min() == 1960, 'Files from 1960 should be included.'
# The upper bound gets its own message so a failure here is not mistaken for the 1960 check.
assert nominees['year_ceremony'].max() == 2023, 'Files up to 2023 should be included.'
assert nominees.isna().sum().sum() == 0, 'Did you drop the missing values?'

## Exercise 2 - Read the IMDB Ratings data

Complete the function `read_ratings` which should:
- read data from a file like `data/imdb_top_1000.csv`
- drop the `director`, `star1`, `star2`, `star3`, and `star4` columns
- convert the `runtime` column to integer
- remove the lines with no `metascore` info
- return the resulting dataframe `top_rated`.

In [None]:
def read_ratings(file_path): 
    """Exercise 2 stub (not implemented yet).

    Intended behaviour, per the exercise text: read a file like
    ``data/imdb_top_1000.csv``, drop the ``director``, ``star1``, ``star2``,
    ``star3`` and ``star4`` columns, convert ``runtime`` to integer, remove
    the rows with no ``metascore`` info and return the result as
    ``top_rated``.

    Until the placeholder is replaced, calling this raises
    ``NotImplementedError``.
    """
    # YOUR CODE HERE
    raise NotImplementedError()
    return top_rated

# Build the ratings dataframe from the IMDB top-1000 file.
top_rated = read_ratings(os.path.join('data','imdb_top_1000.csv'))

In [None]:
# Exercise 2 checks: result type, shape, column names, integer runtime with
# its expected min/max, and no missing metascore values.
assert isinstance(top_rated, pd.DataFrame), 'The function should return a dataframe.'
assert top_rated.shape == (843, 5), 'The shape of the top_rated is not correct.'
assert set(top_rated.columns) == set(['film', 'metascore', 'no_votes', 'rating', 'runtime']), 'The column names are not correct.'
assert pd.api.types.is_integer_dtype(top_rated.runtime), 'The runtime column should be an integer.'
assert top_rated.runtime.min() == 64, 'The runtime column is not correct.'
assert top_rated.runtime.max() == 321, 'The runtime column is not correct.'
assert top_rated.metascore.isna().sum() == 0, 'Did you remove the na values?'

## Exercise 3 - Combine Oscars and Ratings data

Combine the `nominees` and `top_rated` dataframes into a single dataset called `best_rated_nominees` using an inner join.

In [None]:
# Exercise 3 placeholder: assign the inner join of `nominees` and `top_rated`
# to `best_rated_nominees`, then remove the `raise` below.
# best_rated_nominees = ...
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Exercise 3 checks: shape and column names of the joined dataframe.
assert best_rated_nominees.shape == (1919, 10), 'The shape of the resulting dataframe is not correct.'
assert set(best_rated_nominees.columns) == set(['year_film', 'year_ceremony', 'category', 'name', 'film', 'winner',
       'runtime', 'rating', 'metascore', 'no_votes']), 'The columns of the resulting dataframe are not correct.'

## Exercise 4 - Read top grossing and budget film data

We will read the two remaining pieces of data, the files `gross_top_200.csv` and `budget_top_500.csv`. Complete the two functions below.

The function `read_gross` should:
- read the provided file, like `data/gross_top_200.csv`
- remove the `year` column
- convert the `gross` column to integer
- return the resulting dataframe `top_grossing`.

The function `read_budget` should:
- read the provided file, like `data/budget_top_500.csv`
- drop the `runtime`, `theaters`, and `year` columns
- convert the `budget_rank` column to integer
- return the resulting dataframe `top_budget`.

Try to use method chaining.

In [None]:
def read_gross(file_path):
    """Exercise 4 stub (not implemented yet).

    Intended behaviour, per the exercise text: read a file like
    ``data/gross_top_200.csv``, remove the ``year`` column, convert the
    ``gross`` column to integer and return the result as ``top_grossing``.
    The exercise suggests using method chaining.

    Until the placeholder is replaced, calling this raises
    ``NotImplementedError``.
    """
    # top_grossing: pd.DataFrame = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    return top_grossing

def read_budget(file_path):
    """Exercise 4 stub (not implemented yet).

    Intended behaviour, per the exercise text: read a file like
    ``data/budget_top_500.csv``, drop the ``runtime``, ``theaters`` and
    ``year`` columns, convert the ``budget_rank`` column to integer and
    return the result as ``top_budget``. The exercise suggests using method
    chaining.

    Until the placeholder is replaced, calling this raises
    ``NotImplementedError``.
    """
    # YOUR CODE HERE
    raise NotImplementedError()
    return top_budget

# Load the gross and budget dataframes used by Exercises 5 and 6.
top_grossing = read_gross('data/gross_top_200.csv')
top_budget = read_budget('data/budget_top_500.csv')

In [None]:
# Exercise 4 checks: shape, column names and integer dtype for both dataframes.
assert top_grossing.shape == (200, 3), 'The shape of the top_grossing dataframe is not correct.'
assert set(top_grossing.columns) == set(['gross_rank', 'film', 'gross']), 'The column names in top_grossing are not correct.'
assert pd.api.types.is_integer_dtype(top_grossing.gross), 'The gross column should be an integer.'
assert top_budget.shape == (500, 3), 'The shape of the top_budget dataframe are not correct.'
assert set(top_budget.columns) == set(['budget_rank', 'film', 'production_cost']), 'The column names in the top_budget are not correct.'
assert pd.api.types.is_integer_dtype(top_budget.budget_rank), 'The budget_rank column should be an integer.'

## Exercise 5 - Combine the top grossing and budget films

Combine the `top_grossing` and `top_budget` dataframes into a single dataset called `top_grossing_budget` using an inner join.

In [None]:
# Exercise 5 placeholder: assign the inner join of `top_grossing` and
# `top_budget` to `top_grossing_budget`, then remove the `raise` below.
# top_grossing_budget = 
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Exercise 5 checks: shape and column names of the joined dataframe.
assert top_grossing_budget.shape == (129, 5), 'The shape of the resulting dataframe is not correct.'
assert set(top_grossing_budget.columns) == set(['budget_rank', 'film', 'production_cost',
                                                'gross_rank', 'gross']), 'The column names are not correct.'

## Exercise 6 - Combine everything

Combine the `best_rated_nominees` and `top_grossing_budget` dataframes into one dataframe called `top_films` using an inner join.

In [None]:
# Exercise 6 placeholder: assign the inner join of `best_rated_nominees` and
# `top_grossing_budget` to `top_films`, then remove the `raise` below.
# top_films = ...
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Exercise 6 checks: shape and column names of the fully combined dataframe.
assert top_films.shape == (184, 14), 'The shape of the resulting dataframe is not correct.'
assert set(top_films.columns) == set(['year_film', 'year_ceremony', 'category', 'name', 'film', 'winner',
       'runtime', 'rating', 'metascore', 'no_votes', 'budget_rank',
       'production_cost', 'gross_rank', 'gross']), 'The columns of the resulting dataframes are not correct.'

## Exercise 7 - Final transformations

We will perform some final transformations on the `top_films` dataframe. You need to implement three functions for that.

The function `transform_winner` should transform the `winner` column of the given dataframe into integer, then return the dataframe.

The function `create_rating_rank` should create a new column `rating_rank` in the given dataframe. The `rating_rank` should be the rank of the rating (i.e. the rating values are ordered in ascending order and the rank is the order in which they appear). The values should be integers and the lowest rank should be 1.

The function `preprocess_data` should:
- use pandas pipes
- take the given dataframe
- transform the `winner` column to integer using the `transform_winner` function
- filter out rows with films that appear fewer than 10 times in the given dataframe
- keep only the columns `film`, `runtime`, `category`, `rating`, `metascore`, `no_votes`, `winner`, `production_cost`, `budget_rank`, `gross`, `gross_rank` in the given dataframe
- create the `rating_rank` column using the `create_rating_rank` function
- sort the resulting dataframe by `rating` in ascending order
- return the dataframe

In [None]:
def transform_winner(df):
    """Exercise 7 stub (not implemented yet).

    Intended behaviour, per the exercise text: transform the ``winner``
    column of the given dataframe into integer and return the dataframe.
    The stub works on a copy (``df_``) so the caller's dataframe is left
    untouched.

    Until the placeholder is replaced, calling this raises
    ``NotImplementedError``.
    """
    df_ = df.copy()
    # df_ = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    return df_

def create_rating_rank(df):
    """Exercise 7 stub (not implemented yet).

    Intended behaviour, per the exercise text: add a ``rating_rank`` column
    holding the rank of ``rating`` when the values are ordered ascending.
    The ranks should be integers and the lowest rank should be 1. The stub
    works on a copy (``df_``) of the given dataframe.

    Until the placeholder is replaced, calling this raises
    ``NotImplementedError``.
    """
    df_ = df.copy()
    # df_['rating_rank'] = 
    # YOUR CODE HERE
    raise NotImplementedError()
    return df_

def preprocess_data(df):
    """Exercise 7 stub (not implemented yet).

    Intended behaviour, per the exercise text, using pandas pipes:

    1. convert ``winner`` to integer with ``transform_winner``;
    2. filter out rows of films that appear less than 10 times;
    3. keep only ``film``, ``runtime``, ``category``, ``rating``,
       ``metascore``, ``no_votes``, ``winner``, ``production_cost``,
       ``budget_rank``, ``gross`` and ``gross_rank``;
    4. add ``rating_rank`` with ``create_rating_rank``;
    5. sort by ``rating`` in ascending order and return the dataframe.

    Until the placeholder is replaced, calling this raises
    ``NotImplementedError``.
    """
    # YOUR CODE HERE
    raise NotImplementedError()
    return df

# Run the full preprocessing on the combined films dataframe.
top_films_preprocessed = preprocess_data(top_films)

In [None]:
# Exercise 7 checks: shape, column names, hashed content of the `film` column
# (which also pins the row order), and integer dtype of `rating_rank`.
assert top_films_preprocessed.shape == (59, 12), 'The shape of the resulting dataframe is not correct.'
assert set(top_films_preprocessed.columns) == {'category', 'film', 'runtime', 'rating', 'metascore', 'no_votes',
       'budget_rank', 'production_cost', 'gross_rank', 'gross', 'rating_rank','winner'}, \
       'The columns of the resulting dataframe are not correct.'
assert hashlib.sha256(json.dumps(list(top_films_preprocessed.film.to_numpy())).encode()).hexdigest() == \
'9cc9972180ded18502e377320d6671cb7f58b4795052a60480f8c92f8eb37e1a', 'The content of the resulting datframe is not correct.'
assert pd.api.types.is_integer_dtype(top_films_preprocessed.rating_rank), 'The rating_rank should be an integer.'
# NOTE(review): this assertion hashes the `film` column although its message
# refers to `rating_rank`, so the values of `rating_rank` are never verified.
# Confirm which column the expected hash was computed from before changing it.
assert hashlib.sha256(json.dumps(''.join([str(i) for i in top_films_preprocessed.film.to_numpy()])).encode()
 ).hexdigest() == '78e339e414421abca749fc1f3bce8cdeb7f7001e4a71cbf2c868a67d8e1f8fe7', \
'The rating_rank column is not correct.'

### The house prices dataset

In the following exercises we will use a dataset with characteristics of houses and their selling price. These are the column names and their meaning.

* LotFrontage: Linear feet of street connected to property
* LotArea: Lot size in square feet
* OverallQual: Rates the overall material and finish of the house   
  (10 - Very Excellent, 9 - Excellent, 8 - Very Good, 7 - Good, 6 - Above Average, 5 - Average, 4 - Below Average, 3 - Fair, 2 - Poor, 1 - Very Poor)
  
* OverallCond: Rates the overall condition of the house   
  (10 - Very Excellent, 9 - Excellent, 8 - Very Good, 7 - Good, 6 - Above Average, 5 - Average, 4 - Below Average, 3 - Fair, 2 - Poor, 1 - Very Poor)
  
* MasVnrArea: Masonry veneer area in square feet
* BsmtFinSF1: Type 1 finished square feet
* BsmtUnfSF: Unfinished square feet of basement area
* TotalBsmtSF: Total square feet of basement area
* 1stFlrSF: First Floor square feet
* 2ndFlrSF: Second floor square feet
* LowQualFinSF: Low quality finished square feet (all floors)
* GrLivArea: Above grade (ground) living area square feet
* BsmtFullBath: Basement full bathrooms
* BsmtHalfBath: Basement half bathrooms
* FullBath: Full bathrooms above grade
* HalfBath: Half baths above grade
* BedroomAbvGr: Bedrooms above grade (does NOT include basement bedrooms)
* KitchenAbvGr: Kitchens above grade
* TotRmsAbvGrd: Total rooms above grade (does not include bathrooms)
* Fireplaces: Number of fireplaces
* GarageCars: Size of garage in car capacity
* GarageArea: Size of garage in square feet
* WoodDeckSF: Wood deck area in square feet
* OpenPorchSF: Open porch area in square feet
* EnclosedPorch: Enclosed porch area in square feet
* 3SsnPorch: Three season porch area in square feet
* ScreenPorch: Screen porch area in square feet
* PoolArea: Pool area in square feet
* MiscVal: Value (in dollars) of miscellaneous feature
* SellingDate: Date when the house was sold
* BuildingDate: Date when the house was built
* RemodAddDate: Remodel date (same as construction date if no remodeling or additions)
* SalePrice: The house price at the selling date (our target variable)

Let's read the csv and create the train-test-split.

In [None]:
def read_house_price_dataset():
    """Load ``data/housePrices.csv`` with its three date columns parsed.

    ``SellingDate``, ``BuildingDate`` and ``RemodAddDate`` are parsed as
    dates using the year-day-month format ``%Y-%d-%m``.
    """
    date_columns = ['SellingDate', 'BuildingDate', 'RemodAddDate']
    csv_path = 'data/housePrices.csv'
    return pd.read_csv(csv_path, parse_dates=date_columns, date_format="%Y-%d-%m")

# Load the dataset and split it into train and test parts; random_state=0
# keeps the split reproducible, which the hash-based checks below rely on.
house_price_dataset = read_house_price_dataset()
house_price_dataset_train, house_price_dataset_test = train_test_split(house_price_dataset, random_state=0)
# Separate the features from the target column `SalePrice` for each part.
X_train = house_price_dataset_train.drop(columns='SalePrice')
y_train = house_price_dataset_train.SalePrice
X_test = house_price_dataset_test.drop(columns='SalePrice')
y_test = house_price_dataset_test.SalePrice

## Exercise 8 - Build a function transformer
One thing that influences the house price at selling date is the time since the house was built or remodeled. We're going to create new features with this information.

The house price dataset contains three date columns - the `SellingDate`, the `BuildingDate`, and the `RemodAddDate`. Create a transformer that calculates the difference in days between a reference column and any other given datetime columns. The new features should be integers.

We're going to take the easy way here and use the `FunctionTransformer`. First define a function called `dif_days` that
- takes three arguments - the dataframe, the name of the reference column, and a list of other datetime columns
- calculates the difference in days between the dates in the reference column and the other datetime columns
- the new features should be stored in columns with the same name appended with `_dif_days`
- drops the datetime columns (not the reference column)
- returns the transformed dataframe

Then, use the `FunctionTransformer` and the `dif_days` function to define a transformer called `dif_days_transformer`.

Hint: use [.dt](https://pandas.pydata.org/pandas-docs/stable/user_guide/basics.html#dt-accessor) to find datetime properties.

In [None]:
def dif_days(df='house_price_dataset', ref_col='SellingDate', time_cols=['BuildingDate', 'RemodAddDate']):
    """Exercise 8 stub (not implemented yet).

    Intended behaviour, per the exercise text: for each column in
    ``time_cols``, compute the difference in days between the dates in
    ``ref_col`` and that column, store it as an integer in a new column
    named ``<column>_dif_days``, drop the original ``time_cols`` (but keep
    ``ref_col``) and return the transformed dataframe.

    NOTE(review): the default for ``df`` is the string
    ``'house_price_dataset'``, not a dataframe, so ``df.copy()`` fails if
    the function is called without ``df``. ``time_cols`` defaults to a list
    that is shared between calls, so an implementation should not mutate it.

    Until the placeholder is replaced, calling this with a dataframe raises
    ``NotImplementedError``.
    """
    df_ = df.copy()
    # YOUR CODE HERE
    raise NotImplementedError()
    return df_

# Exercise 8 placeholder: define `dif_days_transformer` with
# `FunctionTransformer` and the `dif_days` function, then remove the `raise` below.
# dif_days_transformer = ...
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Exercise 8 checks: apply the function transformer to the training features,
# then verify shape, column names (the reference column `SellingDate` is
# kept), integer dtype and hashed values of the two new columns.
X_train_transformed = dif_days_transformer.fit_transform(X_train)
assert X_train_transformed.shape == (1051,31), 'The shape of the resulting dataframe is not correct.'
assert set(X_train_transformed.columns) == {'1stFlrSF', '2ndFlrSF', '3SsnPorch', 'BedroomAbvGr', 'BsmtFullBath',
 'BsmtHalfBath', 'BsmtUnfSF', 'BuildingDate_dif_days', 'EnclosedPorch', 'Fireplaces', 'FullBath', 'GarageArea',
 'GarageCars', 'GrLivArea', 'HalfBath', 'KitchenAbvGr', 'LotArea', 'LotFrontage', 'LowQualFinSF', 'MasVnrArea',
 'MiscVal', 'OpenPorchSF', 'OverallCond', 'OverallQual', 'PoolArea', 'RemodAddDate_dif_days', 'ScreenPorch',
 'SellingDate', 'TotRmsAbvGrd', 'TotalBsmtSF', 'WoodDeckSF'}, 'The column names are not correct.'
assert pd.api.types.is_integer_dtype(X_train_transformed.RemodAddDate_dif_days), \
 'The new features should be integers.'
assert pd.api.types.is_integer_dtype(X_train_transformed.BuildingDate_dif_days), \
 'The new features should be integers.'
assert hashlib.sha256(json.dumps(''.join([str(i) for i in X_train_transformed.BuildingDate_dif_days.to_numpy()])
).encode()).hexdigest() == '1061e0ff164b38138a483dcc9d658d355ac7f94b1eb5fb079425fa521491e951',\
'The transformed columns are not correct.'
assert hashlib.sha256(json.dumps(''.join([str(i) for i in X_train_transformed.RemodAddDate_dif_days.to_numpy()])
).encode()).hexdigest() == '82aad4370b44add830240b37f38c1140c0d09879fa48b069c8e7cfe9cfa721e7',\
'The transformed columns are not correct.'

## Exercise 9 - Build a transformer class
Now build the same transformer as a class. The transformer should be called `DateTransformer` and have three methods, `__init__`, `fit`, and `transform`.

The `__init__` method should initialize two parameters
- `ref_col`, a string - the name of the reference column
- `time_cols`, a list of strings - the names of the other datetime columns

The `fit` method should not do anything, just return `self`, so that we are able to call it. Make sure it takes all the required arguments.

The `transform` method should
- calculate the difference in days between the dates in the reference column and the other datetime columns
- the new features should be stored in columns with the same name appended with `_dif_days`
- drop the datetime columns and the reference column
- return the transformed dataframe

In [None]:
class DateTransformer(TransformerMixin, BaseEstimator):
    """Exercise 9 stub (not implemented yet).

    Intended behaviour, per the exercise text:

    - ``__init__`` stores ``ref_col`` (name of the reference column) and
      ``time_cols`` (list of names of the other datetime columns);
    - ``fit`` does nothing and returns ``self``;
    - ``transform`` computes, for each column in ``time_cols``, the
      difference in days between ``ref_col`` and that column, stores it in
      ``<column>_dif_days``, drops both the datetime columns and the
      reference column, and returns the transformed dataframe.

    Until the placeholder is replaced, the ``raise`` below runs while the
    class body is being executed, so defining the class itself fails with
    ``NotImplementedError``.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
# Exercise 9 checks: same as for the function transformer, except that the
# reference column `SellingDate` must also be dropped (30 columns, not 31).
X_train_transformed_2 = DateTransformer(time_cols=['BuildingDate', 'RemodAddDate'], ref_col='SellingDate'
                                     ).fit_transform(X_train)
assert X_train_transformed_2.shape == (1051,30), 'The shape of the resulting dataframe is not correct.'
assert set(X_train_transformed_2.columns) == {'1stFlrSF', '2ndFlrSF', '3SsnPorch', 'BedroomAbvGr', 'BsmtFullBath',
 'BsmtHalfBath', 'BsmtUnfSF', 'BuildingDate_dif_days', 'EnclosedPorch', 'Fireplaces', 'FullBath', 'GarageArea',
 'GarageCars', 'GrLivArea', 'HalfBath', 'KitchenAbvGr', 'LotArea', 'LotFrontage', 'LowQualFinSF', 'MasVnrArea',
 'MiscVal', 'OpenPorchSF', 'OverallCond', 'OverallQual', 'PoolArea', 'RemodAddDate_dif_days', 'ScreenPorch',
 'TotRmsAbvGrd', 'TotalBsmtSF', 'WoodDeckSF'}, 'The column names are not correct.'
assert pd.api.types.is_integer_dtype(X_train_transformed_2.RemodAddDate_dif_days), \
 'The new features should be integers.'
assert pd.api.types.is_integer_dtype(X_train_transformed_2.BuildingDate_dif_days), \
 'The new features should be integers.'
assert hashlib.sha256(json.dumps(''.join([str(i) for i in X_train_transformed_2.BuildingDate_dif_days.to_numpy()])
).encode()).hexdigest() == '1061e0ff164b38138a483dcc9d658d355ac7f94b1eb5fb079425fa521491e951',\
'The transformed columns are not correct.'
assert hashlib.sha256(json.dumps(''.join([str(i) for i in X_train_transformed_2.RemodAddDate_dif_days.to_numpy()])
).encode()).hexdigest() == '82aad4370b44add830240b37f38c1140c0d09879fa48b069c8e7cfe9cfa721e7',\
'The transformed columns are not correct.'

## Exercise 10 - Build the pipeline

Finally, we want to use our new transformer in a pipeline. Build a pipeline called `house_price_pipeline` with three steps:
- step 1 called `date_converter` using the `DateTransformer` which transforms the `BuildingDate` and `RemodAddDate` columns with reference to the `SellingDate` column
- step 2 called `robust_scaler` using the sklearn `RobustScaler`
- step 3 called `model` using the sklearn `LinearRegression`.

In [None]:
# Exercise 10 placeholder: build `house_price_pipeline` with the steps
# `date_converter`, `robust_scaler` and `model`, then remove the `raise` below.
# house_price_pipeline =
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Exercise 10 checks: fit the pipeline on the training data and predict on
# the test data, then verify the pipeline type, step names and order, step
# types, DateTransformer parameters and the number of fitted coefficients.
house_price_pipeline.fit(X_train, y_train)
y_pred = house_price_pipeline.predict(X_test)

assert isinstance(house_price_pipeline,Pipeline), 'Did you define the pipeline correctly?'
assert list(house_price_pipeline.named_steps.keys())==['date_converter', 'robust_scaler', 'model']
assert isinstance(house_price_pipeline.named_steps['date_converter'],DateTransformer), 'The date_converter step is not correct.'
assert house_price_pipeline.named_steps['date_converter'].get_params()['ref_col'] == 'SellingDate', \
 'The parameters of the DateTransformer are not correct.'
assert set(house_price_pipeline.named_steps['date_converter'].get_params()['time_cols']
          ) == {'BuildingDate', 'RemodAddDate'}, 'The parameters of the DateTransformer are not correct.'
assert isinstance(house_price_pipeline.named_steps['robust_scaler'],RobustScaler), 'The robust_scaler step is not correct.'
assert isinstance(house_price_pipeline.named_steps['model'],LinearRegression), 'The model step is not correct.'
assert house_price_pipeline.named_steps['model'].coef_.shape == (30,), 'Wrong number of coefficients. Did you select the features correctly?'

Exercises complete, congratulations! You are about to become a certified data wrangler.