In [1]:
from utils.disaster_data_utils import *
import numpy as np
df = build_dataframe()
df = build_clean_dataframe(df)

  warn("Workbook contains no default style, apply openpyxl's default")


# Feature Engineering

For each country, for each year, we want one feature vector containg good predictors for the content of the speech they give that year.


In [3]:
df.columns

Index(['Dis No', 'Year', 'Seq', 'Glide', 'Disaster Group', 'Disaster Subgroup',
       'Disaster Type', 'Disaster Subtype', 'Disaster Subsubtype',
       'Event Name', 'Country', 'ISO', 'Region', 'Continent', 'Location',
       'Origin', 'Associated Dis', 'Associated Dis2', 'OFDA Response',
       'Appeal', 'Declaration', 'AID Contribution ('000 US$)', 'Dis Mag Value',
       'Dis Mag Scale', 'Latitude', 'Longitude', 'Local Time', 'River Basin',
       'Start Year', 'Start Month', 'Start Day', 'End Year', 'End Month',
       'End Day', 'Total Deaths', 'No Injured', 'No Affected', 'No Homeless',
       'Total Affected', 'Reconstruction Costs ('000 US$)',
       'Reconstruction Costs, Adjusted ('000 US$)',
       'Insured Damages ('000 US$)', 'Insured Damages, Adjusted ('000 US$)',
       'Total Damages ('000 US$)', 'Total Damages, Adjusted ('000 US$)', 'CPI',
       'Adm Level', 'Admin1 Code', 'Admin2 Code', 'Geo Locations',
       'causes climate change sentiment'],
      dtype='object

In [4]:
def build_feature_vector_v1(df, country, year) -> np.array:
    '''
    Returns a feature vector for the given country in the given year using the information present in the provided dataframe.

            Parameters:
                    df (pd.DataFrame): The dataframe containing the disaster data
                    country (string): The country to build the feature vector for
                    year (int): The year to build the feature vector for

            Returns:
                    vector (np.array): a feature vector for the given country in the given year
    '''
    row = df[(df['Country'] == country) & (df['Year'] == year)]
    num_disasters = len(row)
    num_deaths = row['Total Deaths'].sum()
    num_deaths_per_disaster = num_deaths / num_disasters
    num_deaths_at_biggest_disaster = row['Total Deaths'].max()
    vector = np.array([num_disasters, num_deaths, num_deaths_per_disaster, num_deaths_at_biggest_disaster])
    return vector

In [5]:
build_feature_vector_v1(df, 'Indonesia', 2005)

array([  6.        , 322.        ,  53.66666667, 143.        ])

# Create Training Data

- 1. From what year onwards are we going to use the data?
  - I.e. from what year onwards is the data complete / accuracte?
  - i.e. from what year onwards is climate change a theme that governments talk about?
- 

In [6]:
# X{array-like, sparse matrix} of shape (n_samples, n_features)
def create_feature_matrix(df, years, feature_vector_builder = build_feature_vector_v1) -> np.array:
    """
    Create a feature matrix for a given DataFrame and list of years.

    Parameters:
    - df (pd.DataFrame): The input DataFrame containing data for different countries and years.
    - years (list): A list of years for which feature vectors should be created.
    - feature_vector_builder (callable): A function used to build feature vectors for each country and year.
                                         Default is build_feature_vector_v1.

    Returns:
    - np.array: A 2D numpy array representing the feature matrix, where each row corresponds to a country-year pair.

    Example:
    ```
    from utils.disaster_data_utils import *
    import numpy as np
    df = build_dataframe()
    df = build_clean_dataframe(df)
    feature_matrix = create_feature_matrix(df, np.arange(2000, 2005))
    ```
    """
    result = []
    for country in df['Country'].unique():
        for year in years:
            feature_vector = build_feature_vector_v1(df, country, year)
            result.append(feature_vector)
    return np.array(result)


In [7]:
train_years = np.arange(2019, 2021)
X_train = create_feature_matrix(df, train_years)
X_train.shape

  num_deaths_per_disaster = num_deaths / num_disasters


(452, 4)

In [8]:
# # 5 Steps to use the Scikit-learn Estimator API:

# # 1. Select model and import it
# from sklearn.linear_model import LinearRegression

# # 2. Select model hyperparameters
# model = LinearRegression(fit_intercept=True) # select model hyperparameters

# # 3. Arrange data in feature matrix (or vector if just 1 feature) and Target array
# X = x["Log GDP per capita"].values
# Y = x["Healthy life expectancy at birth"].values

# # 4. Fit model to data, [:, np.newaxis] is used to increase a dimension of X making it a column vector,
# # which is the expected input for fit function.
# model.fit(X[:, np.newaxis], Y)

# # 5. Apply model
# # TODO: we could also fit do predictions on the original X data. But it is better practice to show model performance on unseen data as done below.
# xfit = np.linspace(6, 12, 2) # these values (6 and 12) were chosen by inspecting, visually, the datapoints in the plot
# xfit = np.linspace(min(X), max(X), 2) # suggestion of a more general alternative

# yfit = model.predict(xfit[:, np.newaxis])

# # Plot
# plt.scatter(X, Y, c='black')
# plt.plot(xfit, yfit, c='red');

# Linear Regression