In [1]:
from utils.disaster_data_utils import *
import numpy as np
df = build_clean_dataframe()

  warn("Workbook contains no default style, apply openpyxl's default")


# Feature Engineering

For each country, for each year, we want one feature vector containg good predictors for the content of the speech they give that year.


In [2]:
df.columns

Index(['Country', 'Total Deaths', 'Year', 'Disaster Type'], dtype='object')

In [3]:
def build_feature_vector_v1(df, country, year) -> np.array:
    '''
    Returns a feature vector for the given country in the given year using the information present in the provided dataframe.

            Parameters:
                    df (pd.DataFrame): The dataframe containing the disaster data
                    country (string): The country to build the feature vector for
                    year (int): The year to build the feature vector for

            Returns:
                    vector (np.array): a feature vector for the given country in the given year
    '''
    row = df[(df['Country'] == country) & (df['Year'] == year)]
    num_disasters = len(row)
    num_deaths = row['Total Deaths'].sum()
    num_deaths_per_disaster = num_deaths / num_disasters if num_disasters > 0 else 0
    num_deaths_at_biggest_disaster = row['Total Deaths'].max()
    vector = np.array([num_disasters, num_deaths, num_deaths_per_disaster, num_deaths_at_biggest_disaster])
    return vector

In [4]:
build_feature_vector_v1(df, 'Indonesia', 2005)

array([  5. , 322. ,  64.4, 143. ])

# Create Training Data

- 1. From what year onwards are we going to use the data?
  - I.e. from what year onwards is the data complete / accuracte?
  - i.e. from what year onwards is climate change a theme that governments talk about?
- 

In [5]:
# X{array-like, sparse matrix} of shape (n_samples, n_features)
def create_feature_matrix(df, years, feature_vector_builder = build_feature_vector_v1) -> np.array:
    """
    Create a feature matrix for a given DataFrame and list of years.

    Parameters:
    - df (pd.DataFrame): The input DataFrame containing data for different countries and years.
    - years (list): A list of years for which feature vectors should be created.
    - feature_vector_builder (callable): A function used to build feature vectors for each country and year.
                                         Default is build_feature_vector_v1.

    Returns:
    - np.array: A 2D numpy array representing the feature matrix, where each row corresponds to a country-year pair.

    Example:
    ```
    from utils.disaster_data_utils import *
    import numpy as np
    df = build_dataframe()
    df = build_clean_dataframe(df)
    feature_matrix = create_feature_matrix(df, np.arange(2000, 2005))
    ```
    """
    result = []
    for country in df['Country'].unique():
        for year in years:
            row = df[(df['Country'] == country) & (df['Year'] == year)]
            if len (row) != 0:
                last_row = row
                feature_vector = feature_vector_builder(df, country, year)
                result.append(feature_vector)
            
    return np.array(result)


In [6]:
train_years = np.arange(2019, 2021)
X_train = create_feature_matrix(df, train_years)
X_train.shape

(197, 4)

In [7]:
X_train

array([[1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00],
       [1.20000000e+01, 2.30700000e+03, 1.92250000e+02, 1.90000000e+03],
       [1.00000000e+01, 2.31600000e+03, 2.31600000e+02, 1.92200000e+03],
       [2.00000000e+00, 2.00000000e+00, 1.00000000e+00, 1.00000000e+00],
       [1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00],
       [3.00000000e+00, 7.16000000e+02, 2.38666667e+02, 4.00000000e+02],
       [1.00000000e+00, 1.68700000e+03, 1.68700000e+03, 1.68700000e+03],
       [6.00000000e+00, 2.60000000e+02, 4.33333333e+01, 1.14000000e+02],
       [2.00000000e+00, 2.83000000e+02, 1.41500000e+02, 2.57000000e+02],
       [2.00000000e+00, 1.40000000e+01, 7.00000000e+00, 8.00000000e+00],
       [1.00000000e+00, 3.90000000e+01, 3.90000000e+01, 3.90000000e+01],
       [1.10000000e+01, 5.55000000e+02, 5.04545455e+01, 3.00000000e+02],
       [9.00000000e+00, 4.59000000e+02, 5.10000000e+01, 2.80000000e+02],
       [4.00000000e+00, 8.40000000e+01, 2.10000000e

# Train: Linear Regression

In [8]:
from sklearn.linear_model import LinearRegression

model = LinearRegression(fit_intercept=True) 
X_train = X_train
# TODO: create proper y targets based on speech data.
# let's first try to overfit to verify we have implemented everything correctly
## create a y vector with a one if there are more than 10 disasters, 0 otherwise
Y_train = np.array([1 if x > 10 else 0 for x in X_train[:, 0]])
# Y_train = np.random.rand(X_train.shape[0]).reshape(-1, 1)
# TODO: fix Nan Values
model.fit(X_train, Y_train)


In [14]:
# create random y vector
y1 = np.random.rand(X_train.shape[1]).reshape(1, -1)
y1[0] = 11
y2 = np.random.rand(X_train.shape[1]).reshape(1, -1)
y2[0] = 2
print(model.predict(y1))
print(model.predict(y2)) # seems to work as expeted; giving higher output for samples with more disasters


[0.38573963]
[-0.00185137]
