![](CRISP_DM.png)

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
# for plotting
%matplotlib inline
import matplotlib as mpl
import plotly.graph_objects as go
import matplotlib.pyplot as plt
mpl.rcParams['figure.figsize'] = (16, 10)
# dataframe option
pd.set_option('display.max_rows', 200)

In [None]:
# Load the processed, semicolon-separated timeline table.
df_analyse = pd.read_csv(
    '../data/processed/COVID_small_sync_timeline_table.csv',
    sep=';',
)
# Every column after the first one is treated as a country series.
country_list = df_analyse.columns[1:]
# Preview the first rows ordered by ascending date; the sorted frame is only
# displayed here, df_analyse itself keeps the row order of the file.
df_analyse.sort_values(by='date').head()

## 70.i Functions

In [None]:
# same helper as in the last notebook, to ease plotting of different data series
def quick_plot(x_in, df_input, y_scale='log', slider=False):
    """ Quick basic plot for quick static evaluation of a time series

        you can push selective columns of your data frame by .iloc[:,[0,6,7,8]]

        Parameters:
        ----------
        x_in : array
            array of date time object, or array of numbers; shared x values
            for every plotted column
        df_input : pandas dataframe
            the plotting matrix where each column is plotted
            the name of the column will be used for the legend
        y_scale: str
            y-axis scale as 'log' or 'linear'
        slider: bool
            True or False for x-axis slider

        Returns:
        ----------
        None
            the figure is rendered directly via fig.show()
    """
    fig = go.Figure()

    # one line trace per column, labelled with the column name
    for each in df_input.columns:
        fig.add_trace(go.Scatter(x=x_in, y=df_input[each], name=each, opacity=1.0))

    fig.update_layout(autosize=True, width=800, height=800, xaxis_title='Timeline in Days',
                      yaxis_title='Confirmed infected people (Source:Johns-hopkins csse)',
                      font=dict(family="PT Sans, monospace", size=14, color="#860303"))
    fig.update_yaxes(type=y_scale)
    fig.update_xaxes(tickangle=-45, nticks=20, tickfont=dict(size=12, color="#860303"))
    if slider:
        # range slider below the x-axis for interactive zooming
        fig.update_layout(xaxis_rangeslider_visible=True)
    fig.show()

In [None]:
# function to calculate the mean absolute percentage error (MAPE)
def MAPE(y_true, y_pred):
    """Return the mean absolute percentage error of y_pred against y_true, in percent.

    Parameters
    ----------
    y_true : array-like
        Reference (observed) values; they form the denominator of the
        relative error.
    y_pred : array-like
        Predicted values, broadcastable against y_true.

    Returns
    -------
    float
        mean(|y_true - y_pred| / |y_true|) * 100, computed over the entries
        whose reference value is non-zero. NaN if no such entry exists.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    abs_err = np.abs(y_true - y_pred)
    denom = np.broadcast_to(np.abs(y_true), abs_err.shape)

    # A zero reference value makes the relative error undefined (division by
    # zero -> inf/nan), so those entries are left out of the average.
    valid = denom != 0
    if not valid.any():
        return np.nan
    return np.mean(abs_err[valid] / denom[valid]) * 100

In [None]:
# Plot the columns from position 3 up to (excluding) the last one against the
# date column, on a logarithmic y-axis and with the x-axis range slider enabled.
quick_plot(
    df_analyse['date'],
    df_analyse.iloc[:, 3:-1],
    y_scale='log',
    slider=True,
)

## 70.ii Fitting a polynomial curve
*This function is from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*
* Polynomial interpolation approximates a function with a polynomial of degree `n_degree` by using ridge regression; see the example on [scikit-learn.org](https://scikit-learn.org/stable/auto_examples/linear_model/plot_polynomial_interpolation.html#sphx-glr-auto-examples-linear-model-plot-polynomial-interpolation-py).

In [None]:
# Take the first 27 rows (positions 0-26) and the columns from position 3 up to,
# but excluding, the last one. reset_index() keeps the previous index as an
# extra 'index' column in the new frame.
df_check = df_analyse.iloc[:27, 3:-1].reset_index()
# show the first 20 rows of the subset
df_check.head(20)

### 70.ii.a. Usage of `*args` and `**kwargs`

In [None]:
# factory for a polynomial regression pipeline
def Poly_Reg(degree=2, **kwargs):
    """Build a pipeline that expands the input into polynomial features and
    fits a linear regression on them.

    Parameters
    ----------
    degree : int
        Degree passed to PolynomialFeatures (default 2).
    **kwargs
        Forwarded unchanged to the LinearRegression constructor.

    Returns
    -------
    The (unfitted) pipeline created by make_pipeline.
    """
    feature_expansion = PolynomialFeatures(degree)
    regressor = LinearRegression(**kwargs)
    return make_pipeline(feature_expansion, regressor)

In [None]:
# Stack the selected country columns into one long Series: unstack() on a frame
# with a plain column index yields a Series indexed by (column name, row label).
# Sorting on level 1 then orders the values by their original row label, so the
# countries are interleaved row by row.
y = (
    df_check[['Germany', 'Italy', 'US', 'Spain', 'Brazil']]
    .unstack()
    .sort_index(axis=0, level=1)
)

In [None]:
# peek at the first five entries of the stacked series
y.head(5)

In [None]:
# Hold out the last 28 samples as test data (to expose overfitting); every
# earlier sample is used for fitting. The split is contiguous: slicing the
# training part with -test_points-1 would drop one sample that belongs to
# neither set and shift the test x positions by one against y_test.
test_points=28
y_train=y[:-test_points]
y_test=y[-test_points:]

In [None]:
# Regressor inputs as column vectors: the running sample position scaled by 1/4.
# The test positions continue directly after the last training position.
# NOTE(review): the 4.0 looks like "samples per day", but five countries are
# stacked into y - confirm the intended x scale.
X_train = (np.arange(0, len(y_train)) / 4.0).reshape(-1, 1)
X_test = (np.arange(len(y_train), len(y_train) + test_points) / 4.0).reshape(-1, 1)

In [None]:
# Compare polynomial fits of increasing degree: in-sample fit on the training
# samples versus the forecast on the held-out tail (shaded region).
fig, ax1 = plt.subplots(1, 1)

# observed values, on the same 1/4-scaled x positions as X_train / X_test
ax1.scatter(np.arange(len(y))/4, y, color='black')
# shade everything after the training samples, i.e. the part not used for fitting
ax1.axvspan(len(y_train)/4, len(y)/4, facecolor='y', alpha=0.5)

# x positions of the fitted curve are the same for every degree
X_plot=np.concatenate((X_train, X_test), axis=None)

for degree in [1,3,7,15]:
    # fit once per degree and reuse the fitted pipeline for both predictions
    model=Poly_Reg(degree).fit(X_train, y_train)
    y_hat_insaple=model.predict(X_train)
    y_hat_test=model.predict(X_test)

    y_plot=np.concatenate((y_hat_insaple, y_hat_test), axis=None)

    # MAPE takes (y_true, y_pred): the observations are the reference values
    mape_train=MAPE(y_train, y_hat_insaple)
    mape_test=MAPE(y_test, y_hat_test)

    # fixed one-decimal formatting; slicing the string representation would
    # cut large values (e.g. 1234.5 -> '123') and misreport the error
    ax1.plot(X_plot, y_plot, label='degree={0}'.format(degree)+
                 '     MAPE train:  ' + '{0:.1f}'.format(mape_train)
                 +'    MAPE test    ' + '{0:.1f}'.format(mape_test))

ax1.set_ylim(100, 1500000)
ax1.set_yscale('log')
ax1.legend(loc='best',prop={'size': 16});