This is an attempt to build a class based on the ARIMA model I made for the COVID project. I want it to be a piece of code that I can reuse for arbitrary jurisdictions in the Johns Hopkins data. A jurisdiction in that data is identified by 3 fields: Admin2, Province_State, and Country_Region.

In [53]:
def data_prep(file_name, place,
              country_codes_file=r'data/country_codes_edited.csv'):
    """Read data off disk and aggregate it so it is ready for class Stationary

    Arg:
        file_name - path of the CSV holding the case data; the code below
            reads the columns Date_, Admin2, Province_State, Country_Region,
            Confirmed and Deaths from it
        place - 'USA' groups on every key column including Alpha_3; any other
            value groups on the same key columns without Alpha_3
        country_codes_file - path of the CSV that maps Country_Region to its
            'ISO3166-1-Alpha-3' code (defaults to the path that used to be
            hard-coded here)
    Returns:
        DataFrame with one row per combination of the grouping columns,
        holding the summed Confirmed and Deaths values
    """

    # Read in dfs
    cases_df = pd.read_csv(file_name)
    country_codes_df = pd.read_csv(country_codes_file)

    # Rename the 3 letter code column and keep only the two columns we need
    country_codes_df = country_codes_df.rename(
        columns={'ISO3166-1-Alpha-3': 'Alpha_3'}
    )[['Country_Region', 'Alpha_3']]

    # Merge the 2 dfs (inner join on whatever columns they have in common)
    df_cc = cases_df.merge(country_codes_df)

    # List of columns we want to be able to access on
    columns_ = ['Date_', 'Admin2', 'Province_State', 'Country_Region', 'Alpha_3']

    # Columns that get summed within each group
    value_columns = ['Confirmed', 'Deaths']

    # NOTE(review): the original comment on the non-USA branch read
    # "Every country. Admin2 is a USA only field", yet columns_[:-1] drops
    # Alpha_3 and still groups on Admin2. groupby discards rows whose key is
    # NaN by default, so rows without an Admin2 value would be left out of
    # both branches - confirm which column was meant to be dropped. The keys
    # are kept as they were so existing callers see the same columns.
    if place == 'USA':
        group_columns = columns_
    else:
        group_columns = columns_[:-1]

    # Select the value columns with a list: selecting with a bare tuple
    # (['Confirmed', 'Deaths'] without the inner brackets) is rejected by
    # current pandas releases.
    return df_cc.groupby(group_columns, as_index=False)[value_columns].sum()

In [54]:
class Stationary:
    """Differences one column of a dataframe and reports/graphs whether it looks stationary"""

    def __init__(self, df, column):
        """Initialize the instance with the df
        Arg:
            df - input dataframe, assumes the dataframe is cleaned and ready to go
            column - column of interest
        Returns:
            None
        """

        # Attributes
        self.df = df
        self.column = column

    def difference(self, dataset):
        """Create the first-order difference of the input
        Arg:
            dataset - one dimensional sequence of numbers (list, array or Series)
        Returns:
            Series of len(dataset) - 1 values where element i is
            dataset[i + 1] - dataset[i]; empty when there are fewer than 2 values
        """

        # Work on the raw values so the subtraction is positional and does not
        # depend on whatever index labels the input carries
        values = pd.Series(dataset).to_numpy()

        return pd.Series(values[1:] - values[:-1])

    def load_prepare(self):
        """Extract the column of interest, convert it to float, and difference it
        Returns:
            Series of differences labelled with the dataframe's index from the
            second row onwards
        """

        series = self.df[self.column]

        # float64 rather than float32: float32 cannot represent every integer
        # above 2**24, which would corrupt differences of large counts
        X = series.values.astype('float64')

        # difference the data; go through self so subclasses can override it
        stationary = self.difference(X)
        stationary.index = series.index[1:]

        # return the series
        return stationary

    def check_stationary(self, stationary, save_path=r'data/stationary.csv'):
        """Print ADF statistics, plot the series, and save it to disk
        Arg:
            stationary - the series to test, e.g. the output of load_prepare
            save_path - where the series is written as CSV (defaults to the
                path that used to be hard-coded here)
        Returns:
            None
        """

        # check if stationary
        result = adfuller(stationary)
        print('ADF Statistic: %f' % result[0])
        print('p-value: %f' % result[1])
        print('Critical Values:')
        for key, value in result[4].items():
            print('\t%s: %.3f' % (key, value))

        # plot differenced data
        stationary.plot()
        plt.show()

        # save
        stationary.to_csv(save_path)

    def acf_pacf_plots(self):
        """ACF and PACF plots of the column of interest as stored, i.e. not differenced"""

        series = self.df[self.column]

        # Two stacked panels: ACF on top, PACF underneath
        _, (acf_ax, pacf_ax) = plt.subplots(2, 1, figsize=(16, 6))
        plot_acf(series, ax=acf_ax)
        plot_pacf(series, ax=pacf_ax)
        plt.show()

In [69]:
if __name__ == '__main__':

    # Libraries that data_prep and Stationary look up as globals
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from statsmodels.tsa.stattools import adfuller
    from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

    # Default size for every figure
    plt.rcParams['figure.figsize'] = (12, 8)

    # Aggregate the case data, then wrap its Deaths column for the checks
    df = data_prep('all_df.csv', 'USA')
    stationary_usa = Stationary(df, 'Deaths')

    # The remaining steps are disabled for now:
    #
    #     # Run class Stationary
    #     series = stationary_usa.load_prepare()
    #
    #     # This just gives you back some statistics on the differenced dataset
    #     series = stationary_usa.check_stationary(series)
    #
    #     # This gives you information on the UNDIFFERENCED dataset
    #     stationary_usa.acf_pacf_plots()



In [70]:
# Preview the first rows of the aggregated frame returned by data_prep
df.head()

Unnamed: 0,Date_,Admin2,Province_State,Country_Region,Alpha_3,Confirmed,Deaths
0,2020-03-22,Abbeville,South Carolina,US,USA,1.0,0.0
1,2020-03-22,Acadia,Louisiana,US,USA,0.0,0.0
2,2020-03-22,Accomack,Virginia,US,USA,1.0,0.0
3,2020-03-22,Ada,Idaho,US,USA,11.0,0.0
4,2020-03-22,Adair,Iowa,US,USA,1.0,0.0


In [71]:
# Echo the instance; the output below is the default object repr
stationary_usa

<__main__.Stationary at 0x22a022ecfd0>

In [72]:
# Keep only the rows whose Province_State is New York, then preview them
is_new_york = df['Province_State'] == 'New York'
df_ny = df.loc[is_new_york]
df_ny.head()

Unnamed: 0,Date_,Admin2,Province_State,Country_Region,Alpha_3,Confirmed,Deaths
27,2020-03-22,Albany,New York,US,USA,123.0,0.0
42,2020-03-22,Allegany,New York,US,USA,2.0,0.0
277,2020-03-22,Bronx,New York,US,USA,0.0,0.0
282,2020-03-22,Broome,New York,US,USA,3.0,1.0
434,2020-03-22,Cattaraugus,New York,US,USA,0.0,0.0


In [73]:
# Check what kind of object the boolean-mask filter produced
type(df_ny)

pandas.core.frame.DataFrame