In [None]:
import pandas as pd
import matplotlib.pyplot as plt

### Q: Timestamp Feature Extraction (non-graded)

One of the easiest, and most abundant, sources of information in a time series is the time of each observation. From the timestamp we can extract features such as the second, minute, hour and day of each observation.

In [None]:
from sklearn.base import TransformerMixin


class TimestampFeatureExtractor(TransformerMixin): 
    """
    Transformer that extracts one component of the timestamps stored in 
    a pandas dataframe column and writes it into another column.
    
    
    Parameters
    ----------
    source_column : str
        The name of the column of the pandas dataframes used in 
        `TimestampFeatureExtractor.transform` from where we will 
        extract the timestamp feature. The column must hold datetime 
        values (it is read through the pandas `.dt` accessor).
        
    unit_name : 'second', 'minute', 'hour', 'day'
        The name of the feature we want to extract: 
            - 'second': seconds component of the time value.
            - 'minute': minutes component of the time value.
            - 'hour': hours component of the time value.
            - 'day': day of the month of the time value.
            
    target_column : str
        The name of the column where the feature values will 
        be written.
        
    copy : boolean (default: True)
        If True, the pandas dataframe passed into 
        `TimestampFeatureExtractor.transform` is copied before applying 
        any operation and the copy is returned. If False, the dataframe 
        is modified in place and returned.
    """
    # Components `transform` can extract. Each name is also the attribute 
    # that is read from the pandas `.dt` accessor.
    _SUPPORTED_UNITS = ('second', 'minute', 'hour', 'day')

    def __init__(self, source_column, unit_name, target_column, copy=True): 
        # Only store the parameters; validation is deferred to `transform`.
        self.source_column = source_column
        self.unit_name = unit_name
        self.target_column = target_column
        self.copy = copy
    
    def fit(self, X, *_): 
        """
        Do nothing: this transformer has no state to learn.
        
        Parameters
        ----------
        X : pandas.DataFrame
            Ignored.
        
        
        Returns
        -------
        self : TimestampFeatureExtractor
        """
        return self
    
    def transform(self, X, *_): 
        """
        Write the requested timestamp component into the target column.
        
        Parameters
        ----------
        X : pandas.DataFrame
            Dataframe containing `source_column`.
        
        
        Returns
        -------
        X_new : pandas.DataFrame
            A copy of X (or X itself when `copy` is False) with 
            `target_column` added or overwritten.
        
        
        Raises
        ------
        ValueError
            If `unit_name` is not one of 'second', 'minute', 'hour', 'day'.
        """
        # Validate first so that an invalid unit never touches the data.
        if self.unit_name not in self._SUPPORTED_UNITS:
            raise ValueError(
                "unit_name must be one of {}, got {!r}".format(
                    self._SUPPORTED_UNITS, self.unit_name
                )
            )
        
        # When copy == True, operate on a copy so the caller's dataframe 
        # is left untouched.
        if self.copy:
            X = X.copy()
        
        # `.dt.second`, `.dt.minute`, `.dt.hour` and `.dt.day` share the 
        # names used for `unit_name`, so a single lookup covers all units.
        X[self.target_column] = getattr(X[self.source_column].dt, self.unit_name)
        return X

In [None]:
# One row per second from 2018-02-01 00:00:00 to 2018-02-28 00:00:00 (inclusive).
# The lowercase 's' alias is used because the uppercase 'S' alias is 
# deprecated in recent pandas releases.
df = pd.DataFrame({
    'datetime': pd.date_range(start='February 1 2018', end='February 28 2018', freq='s')
})

In [None]:
# Extract the seconds component of 'datetime' into 'datetime_seconds' 
# and print a reproducible 10-row sample of the transformed frame.
t = TimestampFeatureExtractor(
    source_column='datetime',
    unit_name='second',
    target_column='datetime_seconds',
    copy=True,
)
t.fit(df)
print(t.transform(df).sample(n=10, random_state=10))

Expected output:
```
                   datetime  datetime_seconds
1235354 2018-02-15 07:09:14                14
1355668 2018-02-16 16:34:28                28
1594321 2018-02-19 10:52:01                 1
1848764 2018-02-22 09:32:44                44
1670650 2018-02-20 08:04:10                10
1381401 2018-02-16 23:43:21                21
899763  2018-02-11 09:56:03                 3
1041075 2018-02-13 01:11:15                15
613274  2018-02-08 02:21:14                14
2196823 2018-02-26 10:13:43                43

```

In [None]:
# Extract the minutes component of 'datetime' into 'datetime_minutes' 
# and print a reproducible 10-row sample of the transformed frame.
t = TimestampFeatureExtractor(
    source_column='datetime',
    unit_name='minute',
    target_column='datetime_minutes',
    copy=True,
)
t.fit(df)
print(t.transform(df).sample(n=10, random_state=20))

Expected output:
```
                   datetime  datetime_minutes
996783  2018-02-12 12:53:03                53
2136413 2018-02-25 17:26:53                26
346832  2018-02-05 00:20:32                20
1232036 2018-02-15 06:13:56                13
2154432 2018-02-25 22:27:12                27
1172351 2018-02-14 13:39:11                39
2297325 2018-02-27 14:08:45                 8
1994796 2018-02-24 02:06:36                 6
489748  2018-02-06 16:02:28                 2
2092819 2018-02-25 05:20:19                20
```

In [None]:
# Extract the hours component of 'datetime' into 'datetime_hours' 
# and print a reproducible 10-row sample of the transformed frame.
t = TimestampFeatureExtractor(
    source_column='datetime',
    unit_name='hour',
    target_column='datetime_hours',
    copy=True,
)
t.fit(df)
print(t.transform(df).sample(n=10, random_state=60))

Expected output:
```
                   datetime  datetime_hours
1329323 2018-02-16 09:15:23               9
360966  2018-02-05 04:16:06               4
533492  2018-02-07 04:11:32               4
1412048 2018-02-17 08:14:08               8
1995861 2018-02-24 02:24:21               2
1258924 2018-02-15 13:42:04              13
1777769 2018-02-21 13:49:29              13
1637317 2018-02-19 22:48:37              22
562282  2018-02-07 12:11:22              12
1650510 2018-02-20 02:28:30               2
```

### Q: Cyclical Feature Encoding (non-graded)

In [None]:
from sklearn.base import TransformerMixin
import pandas as pd
import numpy as np

In [None]:
class CyclicalFeatureEncoding(TransformerMixin):
    """
    Transformer that encodes a cyclical numeric column as a sine/cosine pair.
    
    Each value v of the source column is mapped to the angle 
    2 * pi * v / num_divisions, and the sine and cosine of that angle are 
    written to the columns "<target_column> (sin)" and "<target_column> (cos)".
    
    
    Parameters
    ----------
    source_column : str
        The name of the column holding the cyclical values 
        (e.g. minutes or hours).
        
    num_divisions : int
        The number of divisions in one full cycle 
        (e.g. 60 for minutes, 24 for hours).
        
    target_column : str
        Prefix used to name the two output columns.
        
    copy : boolean (default: True)
        If True, `transform` works on and returns a copy of the dataframe. 
        If False, the dataframe is modified in place and returned.
    """
    def __init__(self, source_column, num_divisions, target_column, copy=True):
        self.source_column = source_column
        self.target_column = target_column
        self.num_divisions = num_divisions
        self.copy = copy
    
    def fit(self, X, *_):
        """
        Do nothing: this transformer has no state to learn.
        
        Returns
        -------
        self : CyclicalFeatureEncoding
        """
        return self
    
    def transform(self, X, *_):
        """
        Add the sine and cosine encodings of the source column.
        
        Parameters
        ----------
        X : pandas.DataFrame
            Dataframe containing `source_column`.
        
        
        Returns
        -------
        X_new : pandas.DataFrame
            A copy of X (or X itself when `copy` is False) with the 
            "<target_column> (sin)" and "<target_column> (cos)" columns 
            added or overwritten.
        """
        # When copy == True, operate on a copy so the caller's dataframe 
        # is left untouched.
        if self.copy:
            X = X.copy()
        
        # Position of every value on the unit circle, in radians.
        angle = 2 * np.pi * X[self.source_column] / self.num_divisions
        
        X["{} (sin)".format(self.target_column)] = np.sin(angle)
        X["{} (cos)".format(self.target_column)] = np.cos(angle)
        return X

In [None]:
# Extract the minutes, then encode them on a 60-division cycle and 
# print a reproducible 10-row sample of the result.
t = TimestampFeatureExtractor(
    source_column='datetime',
    unit_name='minute',
    target_column='datetime_minutes',
    copy=True,
)
c = CyclicalFeatureEncoding(
    source_column='datetime_minutes',
    num_divisions=60,
    target_column='datetime_minutes',
)
print(
    c.fit_transform(t.fit_transform(df)).sample(n=10, random_state=50)
)

Expected output:
```

                   datetime  datetime_minutes  datetime_minutes (sin)  \
1462307 2018-02-17 22:11:47                11                0.913545   
640727  2018-02-08 09:58:47                58               -0.207912   
737055  2018-02-09 12:44:15                44               -0.994522   
576656  2018-02-07 16:10:56                10                0.866025   
53823   2018-02-01 14:57:03                57               -0.309017   
1910070 2018-02-23 02:34:30                34               -0.406737   
1190855 2018-02-14 18:47:35                47               -0.978148   
2067682 2018-02-24 22:21:22                21                0.809017   
1596263 2018-02-19 11:24:23                24                0.587785   
2179607 2018-02-26 05:26:47                26                0.406737   

         datetime_minutes (cos)  
1462307                0.406737  
640727                 0.978148  
737055                -0.104528  
576656                 0.500000  
53823                  0.951057  
1910070               -0.913545  
1190855                0.207912  
2067682               -0.587785  
1596263               -0.809017  
2179607               -0.913545  
```

In [None]:
# Extract the hours, then encode them on a 24-division cycle and 
# print a reproducible 10-row sample of the result.
# Note: the hours are written to a column called 'datetime_minutes'; 
# this is the column name that appears in the expected output.
t = TimestampFeatureExtractor(
    source_column='datetime',
    unit_name='hour',
    target_column='datetime_minutes',
    copy=True,
)
c = CyclicalFeatureEncoding(
    source_column='datetime_minutes',
    num_divisions=24,
    target_column='datetime_minutes',
)
print(
    c.fit_transform(t.fit_transform(df)).sample(n=10, random_state=51)
)

Expected output:
```
                   datetime  datetime_minutes  datetime_minutes (sin)  \
1794146 2018-02-21 18:22:26                18               -1.000000   
2298896 2018-02-27 14:34:56                14               -0.500000   
314733  2018-02-04 15:25:33                15               -0.707107   
2256784 2018-02-27 02:53:04                 2                0.500000   
834027  2018-02-10 15:40:27                15               -0.707107   
1246054 2018-02-15 10:07:34                10                0.500000   
2006258 2018-02-24 05:17:38                 5                0.965926   
863632  2018-02-10 23:53:52                23               -0.258819   
1799372 2018-02-21 19:49:32                19               -0.965926   
474919  2018-02-06 11:55:19                11                0.258819   

         datetime_minutes (cos)  
1794146           -1.836970e-16  
2298896           -8.660254e-01  
314733            -7.071068e-01  
2256784            8.660254e-01  
834027            -7.071068e-01  
1246054           -8.660254e-01  
2006258            2.588190e-01  
863632             9.659258e-01  
1799372            2.588190e-01  
474919            -9.659258e-01 
```

### Q: Setting a transformation pipeline (non-graded)

Create a scikit-learn pipeline with four TimestampFeatureExtractor instances. Read the comments to find out which pipeline steps you should create. Do not forget that a [pipeline](http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html) step is defined as a tuple *(NAME OF THE STEP, TRANSFORMER/PREDICTOR)*.

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Create a scikit-learn pipeline with the following named steps. Each step 
# is a tuple (NAME OF THE STEP, TRANSFORMER).

# 'seconds_extractor': a TimestampFeatureExtractor instance that extracts the 
# seconds from the 'datetime' column and writes them in 'datetime_second'
seconds_extractor_tuple = (
    'seconds_extractor',
    TimestampFeatureExtractor('datetime', 'second', 'datetime_second'),
)

# 'minutes_extractor': a TimestampFeatureExtractor instance that extracts the 
# minutes from the 'datetime' column and writes them in 'datetime_minute'
minutes_extractor_tuple = (
    'minutes_extractor',
    TimestampFeatureExtractor('datetime', 'minute', 'datetime_minute'),
)

# 'hours_extractor': a TimestampFeatureExtractor instance that extracts the 
# hours from the 'datetime' column and writes them in 'datetime_hour'
hours_extractor_tuple = (
    'hours_extractor',
    TimestampFeatureExtractor('datetime', 'hour', 'datetime_hour'),
)

# 'days_extractor': a TimestampFeatureExtractor instance that extracts the 
# day from the 'datetime' column and writes it in 'datetime_day'
days_extractor_tuple = (
    'days_extractor',
    TimestampFeatureExtractor('datetime', 'day', 'datetime_day'),
)


# Finally, create a pipeline with the previous tuples. The step order 
# determines the order of the columns appended to the dataframe.
timestamp_features_extraction_pipeline = Pipeline([
    seconds_extractor_tuple,
    minutes_extractor_tuple,
    hours_extractor_tuple,
    days_extractor_tuple,
])

In [None]:
# Fit the pipeline, run it on the full frame and keep a reproducible 
# 10-row sample of the output for display.
timestamp_features_extraction_pipeline.fit(df)
transformed = (
    timestamp_features_extraction_pipeline
    .transform(df)
    .sample(n=10, random_state=20)
)
print(transformed)

Expected output:
```
                   datetime  datetime_second  datetime_minute  datetime_hour  \
996783  2018-02-12 12:53:03                3               53             12   
2136413 2018-02-25 17:26:53               53               26             17   
346832  2018-02-05 00:20:32               32               20              0   
1232036 2018-02-15 06:13:56               56               13              6   
2154432 2018-02-25 22:27:12               12               27             22   
1172351 2018-02-14 13:39:11               11               39             13   
2297325 2018-02-27 14:08:45               45                8             14   
1994796 2018-02-24 02:06:36               36                6              2   
489748  2018-02-06 16:02:28               28                2             16   
2092819 2018-02-25 05:20:19               19               20              5   

         datetime_day  
996783             12  
2136413            25  
346832              5  
1232036            15  
2154432            25  
1172351            14  
2297325            27  
1994796            24  
489748              6  
2092819            25  
```

### Q: Handling different number of days per month (non-graded)

As we know by now, in order to use cyclical feature encoding, we need to know the number of divisions in one full cycle (for example, 60 for minutes or 24 for hours).

Not all months in the year have the same number of days. For example, January has more days than February. In order to take that difference into account when computing the sine and cosine for the day, we need to use the number of days in the given month.

In [None]:
class CyclicalDayInMonthFeatureEncoding(TransformerMixin):
    """
    Transformer that encodes the day of the month as a sine/cosine pair, 
    using the actual number of days of each row's month as the cycle length.
    
    For a timestamp with day of month d in a month with n days, the angle 
    is 2 * pi * d / n. Its sine and cosine are written to the columns 
    "<target_column> (sin)" and "<target_column> (cos)".
    
    
    Parameters
    ----------
    source_column : str
        The name of the datetime column from which both the day of the 
        month and the number of days in the month are read (through the 
        pandas `.dt` accessor).
        
    target_column : str
        Prefix used to name the two output columns.
        
    copy : boolean (default: True)
        If True, `transform` works on and returns a copy of the dataframe. 
        If False, the dataframe is modified in place and returned.
    """
    def __init__(self, source_column, target_column, copy=True): 
        self.source_column = source_column
        self.target_column = target_column
        self.copy = copy
    
    def fit(self, X, *_):
        """
        Do nothing: this transformer has no state to learn.
        
        Returns
        -------
        self : CyclicalDayInMonthFeatureEncoding
        """
        return self
    
    def transform(self, X, *_):
        """
        Add the sine and cosine encodings of the day of the month.
        
        Parameters
        ----------
        X : pandas.DataFrame
            Dataframe containing the datetime column `source_column`.
        
        
        Returns
        -------
        X_new : pandas.DataFrame
            A copy of X (or X itself when `copy` is False) with the 
            "<target_column> (sin)" and "<target_column> (cos)" columns 
            added or overwritten.
        """
        # When copy == True, operate on a copy so the caller's dataframe 
        # is left untouched.
        if self.copy:
            X = X.copy()
        
        timestamps = X[self.source_column].dt
        
        # The cycle length varies per row: it is the number of days of 
        # the month each timestamp falls in.
        angle = 2 * np.pi * timestamps.day / timestamps.days_in_month
        
        X["{} (sin)".format(self.target_column)] = np.sin(angle)
        X["{} (cos)".format(self.target_column)] = np.cos(angle)
        return X

In [None]:
# Run the extraction pipeline, then encode the day of the month using the 
# 'datetime' column and print a reproducible 10-row sample of the result.
df_ = timestamp_features_extraction_pipeline.transform(df)
c = CyclicalDayInMonthFeatureEncoding(
    source_column='datetime',
    target_column='datetime_day',
)
print(
    c.fit_transform(df_).sample(n=10, random_state=10)
)

Expected output:
```
                   datetime  datetime_second  datetime_minute  datetime_hour  \
1235354 2018-02-15 07:09:14               14                9              7   
1355668 2018-02-16 16:34:28               28               34             16   
1594321 2018-02-19 10:52:01                1               52             10   
1848764 2018-02-22 09:32:44               44               32              9   
1670650 2018-02-20 08:04:10               10                4              8   
1381401 2018-02-16 23:43:21               21               43             23   
899763  2018-02-11 09:56:03                3               56              9   
1041075 2018-02-13 01:11:15               15               11              1   
613274  2018-02-08 02:21:14               14               21              2   
2196823 2018-02-26 10:13:43               43               13             10   

         datetime_day  datetime_day (sin)  datetime_day (cos)  
1235354            15           -0.222521           -0.974928  
1355668            16           -0.433884           -0.900969  
1594321            19           -0.900969           -0.433884  
1848764            22           -0.974928            0.222521  
1670650            20           -0.974928           -0.222521  
1381401            16           -0.433884           -0.900969  
899763             11            0.623490           -0.781831  
1041075            13            0.222521           -0.974928  
613274              8            0.974928           -0.222521  
2196823            26           -0.433884            0.900969  
```

In [None]:
# Same encoding as in the previous cell, sampled with a different seed.
print(
    c.fit_transform(df_).sample(n=10, random_state=20)
)

Expected output:
```
                   datetime  datetime_second  datetime_minute  datetime_hour  \
996783  2018-02-12 12:53:03                3               53             12   
2136413 2018-02-25 17:26:53               53               26             17   
346832  2018-02-05 00:20:32               32               20              0   
1232036 2018-02-15 06:13:56               56               13              6   
2154432 2018-02-25 22:27:12               12               27             22   
1172351 2018-02-14 13:39:11               11               39             13   
2297325 2018-02-27 14:08:45               45                8             14   
1994796 2018-02-24 02:06:36               36                6              2   
489748  2018-02-06 16:02:28               28                2             16   
2092819 2018-02-25 05:20:19               19               20              5   

         datetime_day  datetime_day (sin)  datetime_day (cos)  
996783             12        4.338837e-01           -0.900969  
2136413            25       -6.234898e-01            0.781831  
346832              5        9.009689e-01            0.433884  
1232036            15       -2.225209e-01           -0.974928  
2154432            25       -6.234898e-01            0.781831  
1172351            14        1.224647e-16           -1.000000  
2297325            27       -2.225209e-01            0.974928  
1994796            24       -7.818315e-01            0.623490  
489748              6        9.749279e-01            0.222521  
2092819            25       -6.234898e-01            0.781831  
```