Function to plot data about the growth starting from a specific number of cases. The countries for which the values are plotted are passed to the function as a list.

In [4]:
def plotGrowthDataFromFirstCases(dataset, countries_list, column_reference,
                               graph_title, starting_number, y_axis_max_limit = -1):
    """Draw one growth curve per country, keeping only the rows whose
        value has reached a minimum threshold.
        For every country the rows with column_reference >= starting_number
        are selected and plotted against their position (0, 1, 2, ...) in
        that selection, so all the curves start from x = 0.
        Parameters:
        dataset -> is the dataframe the data are read from; it must have a
            'Country' column and the column named by column_reference
        countries_list -> is the list of countries to draw, one line each
        column_reference -> is the name of the column plotted on the y axis
        graph_title -> is the title shown above the graph
        starting_number -> is the threshold a row must reach (>=) to be plotted
        y_axis_max_limit -> is the upper bound of the y axis; any value <= 0
            (default -1) leaves the automatic range
    """
    for country in countries_list:
        #rows belonging to the country under exam
        rows_of_country = dataset[dataset['Country'] == country]
        #keep only the rows that reached the threshold
        reached_threshold = rows_of_country[column_reference] >= starting_number
        rows_to_plot = rows_of_country[reached_threshold]
        #x axis is simply the position of each row in the selection
        x_positions = range(len(rows_to_plot))
        plt.plot(x_positions, rows_to_plot[column_reference], label = country)
    plt.xticks(rotation = 90)
    plt.legend()
    plt.title(graph_title)
    #a non positive limit means "do not constrain the y axis"
    if y_axis_max_limit > 0:
        plt.ylim(0, y_axis_max_limit)
    plt.show()

Define a function to evaluate the number of cases per million inhabitants.

In [13]:
def evaluateRatePerNumberOfMillion(dataset, ref_column, new_column_name):
    """Add to the input dataset a column holding the values of a reference
       column expressed per million inhabitants.
       The dataset is modified in place and the same object is returned.
       Parameters:
       dataset -> is the input dataframe; it must have a "Population" column
           with the total number of inhabitants
       ref_column -> is the name of the column to normalize
       new_column_name -> is the name of the column where the result is stored
    """
    #ratio between the reference value and the population of each row
    share_of_population = dataset[ref_column] / dataset['Population']
    #scale the ratio to one million inhabitants and store it in the dataset
    dataset[new_column_name] = share_of_population * 1000000
    return dataset

Define a function that creates a new dataframe containing the growth rate for each country. It is possible to specify the time interval to consider, expressed in days.

In [12]:
def evaluateGrowthRate(dataset, number_of_days, reference_column):
    """Evaluate the percentage growth of a column, country by country,
       sampling the input dataset every number_of_days days.
       The sampled dates go from the minimum to the maximum value of the
       'Date' column. The value returned is a new dataframe with the
       sampled dates as index and one column per country; each cell is the
       percentage growth with respect to the previous sampled date.
       A cell is left to 0 when: it belongs to the first sampled date, the
       country has no row for the current or the previous sampled date, or
       the previous value is not greater than 0.
       When a country has more rows for the same date the first one is used.
       Parameters:
       dataset -> is the dataframe to gather the data from; it must have the
           columns 'Date', 'Country' and the one named by reference_column
       number_of_days -> is the distance in days between two sampled dates
       reference_column -> is the column of the dataset used to evaluate
           the growth
    """
    #get the minimum and maximum date to consider
    minimum_date = dataset['Date'].min()
    maximum_date = dataset['Date'].max()
    #create the list of country to consider
    list_of_country = dataset['Country'].unique()
    #list of the dates on which the growth is evaluated
    date_list = pd.date_range(start = minimum_date, end = maximum_date,
                              freq = str(number_of_days) + 'D')
    #create a new dataframe with country as columns and date as index
    #filled with 0 values
    zero_data = np.zeros(shape=(len(date_list),len(list_of_country)))
    df_growth = pd.DataFrame(zero_data, index = date_list, columns = list_of_country)
    #the rows are looked up through the 'YYYY-MM-DD' text of each sampled date
    date_labels = [str(raw_date)[:10] for raw_date in date_list.values]
    for country in list_of_country:
        #the rows of the country do not depend on the date, so they are
        #selected once per country instead of once per date
        country_values = dataset.loc[dataset['Country'] == country]
        #value of the country on each sampled date (None when missing)
        sampled_values = [_first_value_on_date(country_values, date_label, reference_column)
                          for date_label in date_labels]
        #walk on the consecutive pairs (previous, current); the first date is
        #never a "current" one because it has no previous value
        for current_date, previous_value, current_value in zip(
                date_list[1:], sampled_values, sampled_values[1:]):
            #skip the pair if one of the two values is not available
            if previous_value is None or current_value is None:
                continue
            #verifies that the previous value is greater than 0 to avoid
            #division errors
            if previous_value > 0:
                #evaluate the percentage growth between the 2 consecutive values
                growth_value = (current_value - previous_value) / previous_value * 100
                df_growth.loc[current_date, country] = growth_value
    #return the new dataframe with the outcomes
    return df_growth


def _first_value_on_date(country_rows, date_label, column):
    """Return the first value of column among the rows dated date_label,
       or None when there is no such row.
    """
    #the mask is built from the same rows it is applied to, so it does not
    #rely on index alignment and works even with repeated index labels
    matching_rows = country_rows.loc[country_rows['Date'] == date_label]
    if len(matching_rows) == 0:
        return None
    return matching_rows[column].values[0]

Define a function to create a matrix of values, with the dates as rows and the countries as columns.

In [5]:
def get_matrix_dataset(dataset, column_to_consider):
    """
    This function produces a new dataframe where each column represents
    a country, each row represents a date (one per day, from the minimum
    to the maximum value of the 'Date' column) and each cell holds the
    value of 'column_to_consider' for that date and country.
    Cells for which the lookup does not give a single storable value
    (e.g. the country has no row for that date) are left to 0.
    The input dataset is not modified.
    Parameters:
    dataset -> is the dataset organized in rows; it must have the columns
        'Date', 'Country' and the one named by column_to_consider
    column_to_consider -> is the column whose values fill the matrix
    """
    #get the minimum and maximum date to consider
    minimum_date = dataset['Date'].min()
    maximum_date = dataset['Date'].max()
    #create the list of country to consider
    country_list = dataset['Country'].unique()
    #index the rows by Date on a new dataframe: doing it in place would
    #strip the 'Date' column from the caller's dataset and make a second
    #call on the same dataset fail
    rows_by_date = dataset.set_index('Date')
    #create a new dataframe with the list of country in each column
    #and the list of date as index
    date_list = pd.date_range(start = minimum_date, end = maximum_date,
                              freq = '1' + 'D')
    zero_data = np.zeros(dtype=int, shape=(len(date_list),len(country_list)))
    df = pd.DataFrame(zero_data, index = date_list, columns = country_list)
    for country in country_list:
        #the rows of the country do not depend on the date, so they are
        #selected once per country instead of once per date
        country_rows = rows_by_date.loc[rows_by_date['Country'] == country]
        for date in date_list:
            #the rows are looked up through the 'YYYY-MM-DD' text of the date
            date_label = str(date)[:10]
            try:
                value = country_rows.loc[date_label, column_to_consider]
                df.loc[date, country] = value
            except (KeyError, ValueError):
                #KeyError: no row for this date/country. ValueError is kept
                #for lookups that cannot be stored in a single cell
                #(presumably repeated dates for a country). In both cases
                #the cell keeps its 0, as before
                continue
    return df