In [1]:
from utils.disaster_data_utils import *
import numpy as np
df = build_clean_dataframe()
df.head()


  warn("Workbook contains no default style, apply openpyxl's default")


Unnamed: 0,Country,Total Deaths,Year,Disaster Type
0,Cabo Verde,11000.0,1900,Drought
1,India,1250000.0,1900,Drought
6,Canada,76.0,1903,Mass movement (dry)
12,Canada,18.0,1905,Mass movement (dry)
16,Belgium,6.0,1906,Flood


# Feature Engineering

For each country, for each year, we want one feature vector containg good predictors for the content of the speech they give that year.


In [2]:
df.columns

Index(['Country', 'Total Deaths', 'Year', 'Disaster Type'], dtype='object')

In [3]:
def build_feature_vector_v1(df, country, year) -> np.array:
    '''
    Returns a feature vector for the given country in the given year using the information present in the provided dataframe.

            Parameters:
                    df (pd.DataFrame): The dataframe containing the disaster data
                    country (string): The country to build the feature vector for
                    year (int): The year to build the feature vector for

            Returns:
                    vector (np.array): a feature vector for the given country in the given year
    '''
    row = df[(df['Country'] == country) & (df['Year'] == year)]
    num_disasters = len(row)
    num_deaths = row['Total Deaths'].sum()
    num_deaths_per_disaster = num_deaths / num_disasters if num_disasters > 0 else 0
    num_deaths_at_biggest_disaster = row['Total Deaths'].max()
    vector = np.array([num_disasters, num_deaths, num_deaths_per_disaster, num_deaths_at_biggest_disaster])
    return vector

In [4]:
build_feature_vector_v1(df, 'Indonesia', 2005)

array([  5. , 322. ,  64.4, 143. ])

# Create Training Data

- 1. From what year onwards are we going to use the data?
  - I.e. from what year onwards is the data complete / accuracte?
  - i.e. from what year onwards is climate change a theme that governments talk about?
- 

In [5]:
# X{array-like, sparse matrix} of shape (n_samples, n_features)
def create_feature_matrix(df, years, feature_vector_builder = build_feature_vector_v1) -> np.array:
    """
    Create a feature matrix for a given DataFrame and list of years.

    Parameters:
    - df (pd.DataFrame): The input DataFrame containing data for different countries and years.
    - years (list): A list of years for which feature vectors should be created.
    - feature_vector_builder (callable): A function used to build feature vectors for each country and year.
                                         Default is build_feature_vector_v1.

    Returns:
    - np.array: A 2D numpy array representing the feature matrix, where each row corresponds to a country-year pair.

    Example:
    ```
    from utils.disaster_data_utils import *
    import numpy as np
    df = build_dataframe()
    df = build_clean_dataframe(df)
    feature_matrix = create_feature_matrix(df, np.arange(2000, 2005))
    ```
    """
    result = []
    for country in df['Country'].unique():
        for year in years:
            row = df[(df['Country'] == country) & (df['Year'] == year)]
            if len (row) != 0:
                last_row = row
                feature_vector = feature_vector_builder(df, country, year)
                result.append(feature_vector)
            
    return np.array(result)


In [6]:
train_years = np.arange(2000, 2021)
X_train = create_feature_matrix(df, train_years)
X_train.shape

(1834, 4)

In [7]:
X_train

array([[ 1.,  3.,  3.,  3.],
       [ 1.,  9.,  9.,  9.],
       [ 1.,  1.,  1.,  1.],
       ...,
       [ 1.,  9.,  9.,  9.],
       [ 1., 12., 12., 12.],
       [ 1.,  4.,  4.,  4.]])

# Train: Linear Regression

In [8]:
from sklearn.linear_model import LinearRegression

model = LinearRegression(fit_intercept=True) 
X_train = X_train
# TODO: create proper y targets based on speech data.
# let's first try to overfit to verify we have implemented everything correctly
## create a y vector with a one if there are more than 10 disasters, 0 otherwise
Y_train = np.array([1 if x > 10 else 0 for x in X_train[:, 0]])
# Y_train = np.random.rand(X_train.shape[0]).reshape(-1, 1)
# TODO: fix Nan Values
model.fit(X_train, Y_train)


In [9]:
# create random y vector
y1 = np.random.rand(X_train.shape[1]).reshape(1, -1)
y1[0] = 0
y2 = np.random.rand(X_train.shape[1]).reshape(1, -1)
y2[0] = 100
print(model.predict(y1))
print(model.predict(y2)) # seems to work as expeted; giving higher output for samples with more disasters


[-0.07970634]
[4.03815589]


# Feature engineer training labels

In [10]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')



# https://www.w3resource.com/python-exercises/nltk/nltk-tokenize-exercise-3.php
# words = word_tokenize('klimaarverandering is erg')

def convert_text_to_keyword_counts(speech_string, keywords):
    keywords = [keyword.lower() for keyword in keywords]
    result = 0
    words = word_tokenize(speech_string)
    words = [word.lower() for word in words]
    for word in words:
        if word in keywords:
            result += 1
    return result
    
    

def convert_list_of_speeches_to_list_of_keyword_counts(speeches, keywords):
    result = []
    for speech in speeches:
        result.append(convert_text_to_keyword_counts(speech, keywords))
    return result


print(convert_list_of_speeches_to_list_of_keyword_counts(['klimaatverandering is erg', 
                                               'Pilkes is geen Pickle. Toch.', 
                                               'Minder CO2! Minder CO2! Fossielle brandstoffen zijn stom.'], 
                                              keywords=['klimaatverandering', "CO2", "fossielle"]))

def convert_list_of_speeches_to_normalized_scores(speeches, keywords):
    '''
        
    
    '''
    keywords = [keyword.lower() for keyword in keywords]
    list_of_counts = convert_list_of_speeches_to_list_of_keyword_counts(speeches, keywords)
    # ensure all counts have a value between 0 and 1
    max_count = max(list_of_counts)
    min_count = min(list_of_counts)
    denom = max_count - min_count 
    if denom == 0:
        denom = 1
    normalized_counts = [(x - min_count) / (denom) for x in list_of_counts]
    return normalized_counts



convert_list_of_speeches_to_normalized_scores(['klimaatverandering is erg', 
                                               'Pilkes is geen Pickle. Toch.', 
                                               'Minder CO2! Minder CO2! Fossiele brandstoffen zijn stom.'], 
                                              keywords=['klimaatverandering', "CO2", "fossiele"])

[1, 0, 3]


[nltk_data] Downloading package punkt to /Users/julius/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[0.3333333333333333, 0.0, 1.0]

['Minder', 'CO2', '!', 'Minder', 'CO2', '!', 'Fossielle', 'brandstoffen', 'zijn', 'stom', '.']
