Color maps that assign a fixed, well-distributed color to each country, region and province, so that every graph uses the same colors.

In [1]:
# Fixed colors so that every graph stays consistent across the notebook.
# Each map goes from a place name to a matplotlib color name.

# Countries.
# NOTE(review): "Ireland" and "Brazil" share "blueviolet".
colorMapWorld = {
    "United States": "red", "Italy": "blue", "Spain": "green",
    "France": "black", "United Kingdom": "purple",
    "Iran": "brown", "Belgium": "pink", "Germany": "orange",
    "China": "gray", "Netherlands": "yellow",
    "Turkey": "cyan", "Russia": "olive", "San Marino": "magenta",
    "Andorra": "gold", "Sweden": "lime",
    "Ireland": "blueviolet", "Luxembourg": "khaki", "Iceland": "lightcoral",
    "Switzerland": "indigo", "Gibraltar": "sandybrown", "Isle of Man": "teal",
    "Brazil": "blueviolet", "Qatar": "crimson",
}

# Italian regions.
# "Veneto" used to be listed twice ("yellow", then "sandybrown"); in a dict
# literal the last value wins, so only the effective entry is kept here.
# NOTE(review): "yellow" is now unused and there is no "P.A. Bolzano" key -
# confirm whether the second "Veneto" was meant to be another region.
# NOTE(review): "Trentino Alto Adige" and "P.A. Trento" share "blueviolet".
colorMapRegions = {
    "Lombardia": "red", "Campania": "blue", "Abruzzo": "green",
    "Molise": "black", "Piemonte": "purple",
    "Basilicata": "brown", "Sardegna": "pink", "Sicilia": "orange",
    "Valle D'Aosta": "gray", "Veneto": "sandybrown",
    "Lazio": "cyan", "Umbria": "olive", "Marche": "magenta",
    "Toscana": "gold", "Emilia-Romagna": "lime",
    "Trentino Alto Adige": "blueviolet", "Friuli Venezia Giulia": "khaki",
    "Calabria": "lightcoral",
    "Puglia": "indigo", "Liguria": "teal", "P.A. Trento": "blueviolet",
}

# Italian provinces.
# NOTE(review): "Calabria", "Puglia", "Veneto", "Liguria" and "P.A. Trento"
# are also keys of colorMapRegions - confirm they belong in this map.
# NOTE(review): "Cuneo" and "P.A. Trento" share "blueviolet".
colorMapProvinces = {
    "Reggio nell'Emilia": "red", "Roma": "blue", "Genova": "green",
    "Bergamo": "black", "Brescia": "purple",
    "Cremona": "brown", "Milano": "pink", "Monza e della Brianza": "orange",
    "Torino": "gray", "Verona": "yellow",
    "Lodi": "cyan", "Mantova": "olive", "Pavia": "magenta",
    "Sondrio": "gold", "Alessandria": "lime",
    "Cuneo": "blueviolet", "Trento": "khaki", "Calabria": "lightcoral",
    "Puglia": "indigo", "Veneto": "sandybrown", "Liguria": "teal",
    "P.A. Trento": "blueviolet", "Lecco": "crimson",
}

Function to plot the growth of a value starting from a specific number of cases. The countries whose values are plotted are passed to the function as a list.

In [1]:
def plotGrowthDataFromFirstCases(dataset, countries_list, column_reference,
                               graph_title, starting_number, y_axis_max_limit = -1):
    """Plot the growth of a column for several countries on one graph.

    For every country only the rows where the column has reached
    ``starting_number`` are kept, and they are plotted against their
    position (0, 1, 2, ...) so that the curves share a common origin.

    Parameters:
    dataset -> dataframe the data is read from; it must contain a
        'Country' column and the column named by ``column_reference``
    countries_list -> list of the countries to plot
    column_reference -> name of the column under analysis
    graph_title -> title shown above the graph
    starting_number -> minimum value of the column from which the growth
        is plotted
    y_axis_max_limit -> upper limit of the y axis; values <= 0 (the
        default is -1) leave the automatic range untouched
    """
    for country in countries_list:
        country_rows = dataset[dataset['Country'] == country]
        country_rows = country_rows[country_rows[column_reference] >= starting_number]
        line_style = {'label': country}
        # Countries without an entry in colorMapWorld no longer abort the
        # whole plot with a KeyError: they get matplotlib's default color.
        if country in colorMapWorld:
            line_style['color'] = colorMapWorld[country]
        plt.plot(range(len(country_rows)), country_rows[column_reference], **line_style)
    plt.xticks(rotation = 90)
    plt.legend()
    plt.title(graph_title)
    if y_axis_max_limit > 0:
        plt.ylim(0, y_axis_max_limit)
    plt.show()

Define a function to evaluate the number of cases per million inhabitants.

In [13]:
def evaluateRatePerNumberOfMillion(dataset, ref_column, new_column_name):
    """Add a "per million inhabitants" column to the input dataset.

    The new column holds the reference column divided by the population
    and multiplied by one million.

    Parameters:
    dataset -> input dataframe; it must have a 'Population' column with
        the total number of inhabitants. It is modified in place.
    ref_column -> name of the column to normalise
    new_column_name -> name of the column where the result is stored

    Returns the same dataframe object that was passed in.
    """
    per_inhabitant = dataset[ref_column] / dataset['Population']
    dataset[new_column_name] = per_inhabitant * 1000000
    return dataset

Define a function that creates a new dataframe containing the growth rate for each country. The time interval between two evaluations can be specified, expressed in days.

In [15]:
def evaluateGrowthRate(dataset, number_of_days, reference_column):
    """Build a date x country dataframe with the growth of a column.

    The dates go from the minimum to the maximum value of the 'Date'
    column of the input, in steps of ``number_of_days`` days. For every
    country and every date after the first one the stored value is

        (value at the date - value at the previous date)
            / last value of the country in the dataset * 100

    i.e. the increment is expressed as a percentage of the country's last
    row in ``dataset``, not of the previous value.
    NOTE(review): confirm that dividing by the last value (rather than by
    the previous value) is the intended definition of "growth".

    A cell is left at 0 when either date has no row for the country or
    when the previous value is not greater than 0. The first row is always
    0 because it has no previous date.

    Parameters:
    dataset -> dataframe to read from; it must contain the columns
        'Date', 'Country' and ``reference_column``. It is not modified.
    number_of_days -> distance in days between two evaluated dates
    reference_column -> name of the column whose growth is evaluated

    Returns a new dataframe indexed by date with one column per country.
    """
    minimum_date = dataset['Date'].min()
    maximum_date = dataset['Date'].max()
    list_of_country = dataset['Country'].unique()

    date_list = pd.date_range(start = minimum_date, end = maximum_date,
                              freq = str(number_of_days) + 'D')
    # start from a matrix of zeros: cells that cannot be evaluated stay at 0
    zero_data = np.zeros(shape=(len(date_list), len(list_of_country)))
    df_growth = pd.DataFrame(zero_data, index = date_list, columns = list_of_country)

    # 'YYYY-MM-DD' label of every date, used to match the rows of the dataset
    date_labels = [str(stamp)[:10] for stamp in date_list.values]

    for country in list_of_country:
        # filter the country once instead of once per date
        country_values = dataset.loc[dataset['Country'] == country]
        if country_values.empty:
            continue
        # last value of the country, used as denominator
        total_value = country_values[reference_column].iloc[-1]
        # the masks are built from country_values itself, so no alignment
        # against the full dataset index is needed
        country_dates = country_values['Date']

        for previous_date, current_date in zip(date_labels, date_labels[1:]):
            current_rows = country_values.loc[country_dates == current_date]
            previous_rows = country_values.loc[country_dates == previous_date]
            if len(current_rows) == 0 or len(previous_rows) == 0:
                continue
            current_value = current_rows[reference_column].values[0]
            previous_value = previous_rows[reference_column].values[0]
            # only evaluate once the series has started
            if previous_value > 0:
                growth_value = (current_value - previous_value) / total_value * 100
                df_growth.loc[current_date, country] = growth_value
    return df_growth

Define a function to create a matrix of values, with one row per date and one column per country.

In [5]:
def get_matrix_dataset(dataset, column_to_consider):
    """
    Turn a row-organised dataset into a date x country matrix.

    The result has one row per day between the minimum and the maximum
    value of the 'Date' column, one column per distinct 'Country', and
    each cell holds the value of ``column_to_consider`` for that day and
    country. Cells without a matching row stay at 0.

    Parameters:
    dataset -> dataframe organised in rows; it must contain the columns
        'Date', 'Country' and ``column_to_consider``. It is not modified,
        so the function can be called repeatedly on the same dataframe.
    column_to_consider -> name of the column whose values fill the matrix

    Returns a new dataframe indexed by date.
    """
    minimum_date = dataset['Date'].min()
    maximum_date = dataset['Date'].max()
    country_list = dataset['Country'].unique()
    # Index a copy by date. The previous in-place set_index removed the
    # 'Date' column from the caller's dataframe, so a second call failed.
    dataset_by_date = dataset.set_index('Date')

    date_list = pd.date_range(start = minimum_date, end = maximum_date,
                              freq = '1' + 'D')
    zero_data = np.zeros(dtype=int, shape=(len(date_list), len(country_list)))
    df = pd.DataFrame(zero_data, index = date_list, columns = country_list)

    for country in country_list:
        # filter the country once instead of once per date
        country_rows = dataset_by_date.loc[dataset_by_date['Country'] == country]
        for date in date_list:
            date = str(date)[:10]
            try:
                value = country_rows.loc[date, column_to_consider]
                df.loc[date, country] = value
            except Exception:
                # Best effort, as before: a day without a usable row for
                # this country simply keeps its 0. Unlike a bare except,
                # this lets KeyboardInterrupt/SystemExit through.
                pass
    return df

Define a function to evaluate the moving average.

In [2]:
def evaluate_moving_average(dataset, period, time_series = True):
    '''
    Compute the moving average of every column of a date x country matrix.

    Each cell of the result is the mean of the current row and of the
    ``period - 1`` rows before it, for the same column. Missing values are
    skipped inside a window, as a plain mean would do. The first ``period``
    rows of the input are dropped, so the result has ``period`` fewer rows.

    Three defects of the previous implementation are fixed here:
    - the averaging window was taken ``period`` rows before the date it was
      stored under, which left the first rows empty and lagged the rest;
    - the values were written with chained indexing (df.iloc[i][col] = v),
      which may assign to a temporary copy instead of the dataframe;
    - the input dataframe was re-indexed in place.
    NOTE(review): the window includes the current row, following the
    original inline comment ("previous n-1 cell and the current cell").
    If the current row must be excluded, shift the result by one row.

    Parameters:
    dataset -> input matrix with one column per country. It is not
        modified.
    period -> number of samples used for each average.
    time_series -> True when the dates are the index of ``dataset``;
        False when they are stored in a 'Date' column.

    Returns a new dataframe indexed by 'Date' with the same columns.
    '''
    if time_series:
        # the dates are already the index: only give it the expected name
        values = dataset.rename_axis(index='Date')
    else:
        values = dataset.set_index('Date')

    # min_periods=1 keeps the skip-missing behaviour of a plain mean; every
    # window that survives the truncation below is complete anyway
    averages = values.rolling(window=period, min_periods=1).mean()
    return averages.iloc[period:]

Plot for the lockdown moving average

In [2]:
def peakChecker(column,peakIndex):
    """
    Tell whether a candidate peak is followed by at least one more day.

    Inputs:
    column-> the dataset column under analysis (for example the column of
        one country); only its index is inspected
    peakIndex-> index label of the supposed peak

    Output->
    peakIndex itself when the label one day after it is present in the
    index of the column, otherwise 0 (the peak is the last day available,
    so it may not have been reached yet)
    """
    following_day = peakIndex + datetime.timedelta(days=1)
    return peakIndex if following_day in column.index else 0



      
def plotWithLockdown(top_ten_moving_average,df_lockdown_states,df_moving_average_,name):
    """
    Draw one subplot per country with its moving average, a vertical red
    line at the beginning of the lockdown and a counter annotation.

    Inputs:
    top_ten_moving_average-> sequence with the names of the countries to plot
    df_lockdown_states-> dataset with the lockdown information; the columns
        'Country' and 'Beginning Date' are read
    df_moving_average_-> dataset with the moving average, one column per country
    name-> label of the Y axis of every subplot

    Output->
    Nothing is returned; the subplots are drawn on a 5x2 grid, filled left
    to right and top to bottom. After the tenth country the position wraps
    around to the first subplot.
    """
    fig, axs = plt.subplots(5, 2,figsize=(15,20))
    fig.tight_layout(pad=7.0)  # extra padding keeps the subplots apart

    for d in range(len(top_ten_moving_average)):
        country = top_ten_moving_average[d]
        # row d // 2 (wrapping after the fifth row), column d % 2
        ax = axs[(d // 2) % 5, d % 2]

        lockdown_date = df_lockdown_states['Beginning Date'][df_lockdown_states['Country']==country]
        series = df_moving_average_[country]
        # rows where the series reaches its maximum; the first one is the peak
        peak = series[series == series.max()]

        # peakChecker returns 0 when the peak is the last available day
        if peakChecker(series, peak.index[0]) != 0:
            # number of rows from the lockdown date to the peak date
            day_counter = len(series[lockdown_date.iloc[0]:peak.index[0]])
        else:
            day_counter = 'Peak not reached yet'

        ax.plot(df_moving_average_.index, series, color=colorMapWorld[country], label='')
        ax.set_title(country)
        ax.set(xlabel='Dates', ylabel=name)
        ax.axvline(x=lockdown_date.iloc[0], color='red')
        ax.tick_params(labelrotation=45)
        ax.annotate('Counter from the lockdown: '+str(day_counter),
                    xy=(10, 170), xycoords='axes points',
                    size=10, ha='left', va='top')
        

Plot creator:
This function draws the plots for Italy's data.

In [3]:

def condition_plot(x,y,title):
    """
    Plot one line per region/province on a new figure and show it.

    Input:
    x-> the range of days covered by the data, e.g. range(1,30) for a month
    y-> a list of dataset rows. The first element of each row is the name of
        the region/province, the remaining elements are its values, one per
        day, e.g. 'Abruzzo,0,20,...,2000'
    title-> a list of two elements: the quantity that is plotted and the kind
        of area, e.g. ['Number of deaths','regions']. When the second element
        is 'regions' the colors come from colorMapRegions, otherwise from
        colorMapProvinces.

    Output: nothing is returned; the graph is shown and the figure cleared
    """
    plt.figure()

    for i in range(len(y)):
        row = y[i]
        palette = colorMapRegions if title[1]=='regions' else colorMapProvinces
        plt.plot(x, row[1:], color=palette[row[0]], label=row[0])

    plt.legend(bbox_to_anchor=(1, 1.05))
    plt.xlabel('Days')
    plt.ylabel(title[0])
    plt.title(title[0]+' for '+title[1])

    plt.show()
    plt.clf()




Top ten regions/provinces: this function builds the list of the top ten regions/provinces.

In [None]:
def top_ten_list(dataset,name_column):
    """
    Select the rows whose last value is among the ten highest.

    Input:
    dataset-> the dataframe with the data to rank, one row per region/province
    name_column-> positional index of the column that holds the name of the
        region/province. Every returned row starts at this column, so its
        first element is the name and the following ones are the values.

    Output:
    A list of dataset rows (name followed by values), in the same order as
    in the dataset. Rows tied with the tenth highest value are all kept, so
    the list can be longer than ten. An empty dataset gives an empty list.

    The previous implementation started both row loops at ``name_column``,
    a column offset, and therefore skipped the first rows of the dataset
    whenever the name was not in column 0. All rows are considered now.
    """
    rows = [dataset.iloc[i, name_column:] for i in range(dataset.shape[0])]

    # value of the last day for every row, highest first; .iloc makes the
    # positional access explicit whatever the column labels are
    last_day = sorted((row.iloc[-1] for row in rows), reverse=True)
    best_values = last_day[0:10]

    return [row for row in rows if row.iloc[-1] in best_values]

Moving mean for Italy's data

In [3]:
def moving_mean(data,window_size):
    """
    Compute the moving averages of a sequence of values.

    Input:
    data-> the values to average. When the first element is a string it is
        treated as the name of the row (region/province) and the averages
        are computed over the remaining elements only.
    window_size-> the size of the moving mean

    Output:
    A list with len(data) - window_size + 1 elements (empty when that number
    is not positive). Without a name every element is the mean of one
    window. With a name the first element is the name itself, followed by
    the means of the windows of the numeric values.

    The previous implementation summed each window without its first
    element but still divided by window_size, so every average was too
    small; the whole window is summed now.
    """
    values = list(data)
    window_count = len(values) - window_size + 1
    if window_count <= 0:
        return []

    has_name = isinstance(values[0], str)
    moving_averages = []
    for start in range(window_count):
        if has_name and start == 0:
            # this slot belongs to the window that would contain the name:
            # carry the name through instead of averaging it
            moving_averages.append(values[0])
            continue
        this_window = values[start : start + window_size]
        moving_averages.append(sum(this_window) / window_size)

    return moving_averages

def moving_mean_with_names(top_ten,windows_size):
    """
    Compute the moving averages of every row and label them with its name.

    Input:
    top_ten-> list of dataset rows, each one holding the name of the
        region/province followed by its values
    windows_size-> the size of the moving mean

    Output:
    A list with one list of moving averages per row; the first element of
    each inner list is replaced by the name of the region/province
    """
    averages_per_row = [moving_mean(row, windows_size) for row in top_ten]

    # put the name of each row in front of its averages
    for averages, row in zip(averages_per_row, top_ten):
        averages[0] = row[0]
    return averages_per_row

Autocorrelation

In [3]:
def autocorrelationPlot(top_ten_moving_average,df_moving_average_,total,lag):
    """
    Draw the autocorrelation (or partial autocorrelation) of every series
    on a 5x2 grid of subplots.

    Inputs:
    top_ten_moving_average-> sequence with the names of the series to plot
        (countries or Italian regions)
    df_moving_average_-> dataset with one column per name
    total-> when true plot_acf is used and the titles end with
        " autocorrelation"; otherwise plot_pacf is used and the titles end
        with " partial autocorrelation"
    lag-> value passed as ``lags`` to the plotting function

    Output->
    Nothing is returned; the subplots are filled left to right and top to
    bottom, wrapping around to the first one after the tenth name.

    The color is looked up in colorMapWorld first and in colorMapRegions
    otherwise. The previous implementation obtained this fallback with a
    bare ``except`` around the plotting call, which also retried (and
    masked) genuine plotting errors.
    """
    fig, axs = plt.subplots(5, 2,figsize=(15,20))
    fig.tight_layout(pad=7.0)  # extra padding keeps the subplots apart

    if total:
        plot_function = plot_acf
        title_suffix = " autocorrelation"
    else:
        plot_function = plot_pacf
        title_suffix = " partial autocorrelation"

    for d in range(len(top_ten_moving_average)):
        series_name = top_ten_moving_average[d]
        if series_name in colorMapWorld:
            color = colorMapWorld[series_name]
        else:
            # a name missing from both maps still raises KeyError here
            color = colorMapRegions[series_name]

        # row d // 2 (wrapping after the fifth row), column d % 2
        plot_function(df_moving_average_[series_name], color=color, label= '',
                      lags=lag, ax=axs[(d // 2) % 5, d % 2],
                      title=series_name + title_suffix)
 

def decompositionPlot(top_ten_moving_average,df_moving_average_):
    """
    Plot the additive seasonal decomposition of every selected series.

    Inputs:
    top_ten_moving_average-> sequence with the names of the series to plot
    df_moving_average_-> dataset with one column per name

    Output->
    Nothing is returned; one decomposition figure is drawn per name. The
    default figure size in rcParams is set to 18x8 as a side effect.
    """
    # period used for every decomposition: (number of rows - 1) / 2, truncated
    half_length = int((len(df_moving_average_)-1)/2)

    for d in range(len(top_ten_moving_average)):
        rcParams['figure.figsize'] = 18, 8
        series = df_moving_average_[top_ten_moving_average[d]]
        seasonal_decompose(series, model='additive', period=half_length).plot()

        # inert string literal: an alternative, hand-made layout kept for reference
        '''
        fig, axes = plt.subplots(4, 1, sharex=True)
        decomposition.observed.plot(ax=axes[0], legend=False, color=colorMapWorld[top_ten_moving_average[d]])
        axes[0].set_ylabel('Observed')
        decomposition.trend.plot(ax=axes[1], legend=False, color=colorMapWorld[top_ten_moving_average[d]])
        axes[1].set_ylabel('Trend')
        decomposition.seasonal.plot(ax=axes[2], legend=False, color=colorMapWorld[top_ten_moving_average[d]])
        axes[2].set_ylabel('Seasonal')
        decomposition.resid.plot(ax=axes[3], legend=False, color=colorMapWorld[top_ten_moving_average[d]])
        axes[3].set_ylabel('Residual')
        '''
        
            
        
        

# Miscellaneous

This code is for when you download the csv files to your PC one file at a time.
It is of no use for this notebook: do not run it, it will not work.
There is also scrap code, kept here in case I need it again.

In [None]:

# Scrap snippets kept for reference only. Both are wrapped in string
# literals, so none of the code below is executed when this cell runs.

# Snippet 1: prints the sizes of two datasets and the values of
# 'denominazione_provincia' that appear in only one of them.
'''
print(len(population_time_line_dataset))
print(len(time_line_dataset))
difference =pd.Index(time_line_dataset['denominazione_provincia'].tolist()).symmetric_difference( population_time_line_dataset['denominazione_provincia'])
print(difference)
'''

# Snippet 2: for every csv file of a local folder, stores its 'totale_casi'
# column in time_line_dataset under a name taken from the file name.
# NOTE(review): the files are listed from "./dati-province\\" but read from
# "./\\" - confirm the path before reusing this snippet.
'''
for filename in listdir("./dati-province\\"):
   
    temp_data= pd.read_csv("./\\"+filename)

    time_line_dataset[filename[-12:-4]]=temp_data['totale_casi']
'''