In [None]:
from utils.disaster_data_utils import *
import numpy as np
# Load the cleaned disaster dataframe (build_clean_dataframe is presumably provided by
# the star import from utils.disaster_data_utils) and preview its first rows.
df = build_clean_dataframe()
df.head()


# Feature Engineering

For each country and each year, we want one feature vector containing good predictors for the content of the speech that country gives that year.


In [None]:
# List the available columns to see which ones can be used to build features.
df.columns

In [None]:
def build_feature_vector_v1(df, country, year) -> np.ndarray:
    '''
    Returns a feature vector for the given country in the given year using the information present in the provided dataframe.

    The vector holds, in this order:
        0. the number of disasters recorded for the country in that year
        1. the total number of deaths over those disasters
        2. the average number of deaths per disaster
        3. the number of deaths in the single deadliest disaster

    Missing 'Total Deaths' values are counted as 0 and a country/year without any
    recorded disaster yields an all-zero vector, so the result never contains NaN.

            Parameters:
                    df (pd.DataFrame): The dataframe containing the disaster data; it must have the
                                       columns 'Country', 'Year' and 'Total Deaths'
                    country (string): The country to build the feature vector for
                    year (int): The year to build the feature vector for

            Returns:
                    vector (np.ndarray): a float feature vector of shape (4,) for the given country in the given year
    '''
    disasters = df[(df['Country'] == country) & (df['Year'] == year)]
    num_disasters = len(disasters)
    if num_disasters == 0:
        # Series.max() of an empty selection is NaN, which would leak into the feature matrix.
        return np.zeros(4, dtype=float)

    # Count unreported death tolls as 0; otherwise max() is NaN when every value is missing.
    deaths = disasters['Total Deaths'].fillna(0)
    num_deaths = deaths.sum()
    num_deaths_per_disaster = num_deaths / num_disasters
    num_deaths_at_biggest_disaster = deaths.max()
    # A fixed float dtype keeps all rows of the stacked feature matrix consistent.
    return np.array(
        [num_disasters, num_deaths, num_deaths_per_disaster, num_deaths_at_biggest_disaster],
        dtype=float,
    )

In [None]:
# Smoke test: build the feature vector for Indonesia in 2005.
build_feature_vector_v1(df, 'Indonesia', 2005)

# Create Training Data

- 1. From what year onwards are we going to use the data?
  - I.e. from what year onwards is the data complete / accurate?
  - I.e. from what year onwards is climate change a theme that governments talk about?
- 

In [None]:
# X{array-like, sparse matrix} of shape (n_samples, n_features)


def create_feature_train_matrix(countries, years, feature_vector_builder=build_feature_vector_v1, df=None) -> np.ndarray:
    """
    Create a feature matrix with one row per (country, year) pair that has disaster data.

    Pairs are visited country by country and, within a country, in the order of `years`.
    Pairs without any record in the dataframe are skipped, so the number of rows can be
    smaller than len(countries) * len(years).

    Parameters:
    - countries (iterable): The countries for which feature vectors should be created.
    - years (iterable): The years for which feature vectors should be created.
    - feature_vector_builder (callable): A function used to build the feature vector for each country and year;
                                         it is called as feature_vector_builder(df, country, year).
                                         Default is build_feature_vector_v1.
    - df (pd.DataFrame, optional): The disaster dataframe to read from; it must have the columns
                                   'Country' and 'Year'. When omitted, build_clean_dataframe() is called.

    Returns:
    - np.ndarray: A 2D numpy array representing the feature matrix, where each row corresponds to a
                  country-year pair. If no pair has any data, the result is an empty 1D array.

    Example:
    ```
    from utils.disaster_data_utils import *
    import numpy as np
    feature_matrix = create_feature_train_matrix(['Indonesia'], np.arange(2000, 2005))
    ```
    """
    if df is None:
        df = build_clean_dataframe()

    result = []
    for country in countries:
        # The country mask does not depend on the year, so compute it once per country.
        country_mask = df['Country'] == country
        if not country_mask.any():
            continue
        for year in years:
            # Only country-year pairs with at least one recorded disaster get a row.
            if (country_mask & (df['Year'] == year)).any():
                result.append(feature_vector_builder(df, country, year))

    return np.array(result)


In [None]:
# Training window: every year from 2000 up to and including 2020.
train_years = np.arange(2000, 2021)
# Every distinct country that occurs in the disaster dataframe.
countries = df['Country'].unique()
X_train = create_feature_train_matrix(countries, train_years)
# Shape is (n_samples, n_features); country-year pairs without data are skipped.
X_train.shape

In [None]:
# Eyeball the feature matrix: each row is one country-year sample.
X_train

# Train: Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

model = LinearRegression(fit_intercept=True)

# TODO: create proper y targets based on speech data.
# TODO: fix NaN values in X_train before fitting.
# Until then, verify the pipeline by overfitting on a synthetic label derived from the
# first feature column: 1 if there are more than 10 disasters, 0 otherwise.
# (Alternative random targets: Y_train = np.random.rand(X_train.shape[0]).reshape(-1, 1))
Y_train = (X_train[:, 0] > 10).astype(int)

model.fit(X_train, Y_train)


In [None]:
# Sanity check: build two input samples that are identical except for the number of
# disasters (feature column 0); the remaining features are random but shared, so any
# difference in the prediction is caused by the disaster count alone.
# The arrays have shape (1, n_features), so the disaster count lives at [0, 0];
# assigning to [0] would overwrite the whole row.
# NOTE: despite their names, y1 and y2 are model inputs, not targets.
y1 = np.random.rand(X_train.shape[1]).reshape(1, -1)
y1[0, 0] = 0
y2 = y1.copy()
y2[0, 0] = 100
print(model.predict(y1))
print(model.predict(y2))  # expected: a higher output for the sample with more disasters


# Feature engineer training labels

In [None]:
# from nltk.tokenize import word_tokenize
# import nltk
# nltk.download('punkt')



# # https://www.w3resource.com/python-exercises/nltk/nltk-tokenize-exercise-3.php
# # words = word_tokenize('klimaarverandering is erg')

# def convert_text_to_keyword_counts(speech_string, keywords):
#     keywords = [keyword.lower() for keyword in keywords]
#     result = 0
#     words = word_tokenize(speech_string)
#     words = [word.lower() for word in words]
#     for word in words:
#         if word in keywords:
#             result += 1
#     return result
    
    

# def convert_list_of_speeches_to_list_of_keyword_counts(speeches, keywords):
#     result = []
#     for speech in speeches:
#         result.append(convert_text_to_keyword_counts(speech, keywords))
#     return result


# print(convert_list_of_speeches_to_list_of_keyword_counts(['klimaatverandering is erg', 
#                                                'Pilkes is geen Pickle. Toch.', 
#                                                'Minder CO2! Minder CO2! Fossielle brandstoffen zijn stom.'], 
#                                               keywords=['klimaatverandering', "CO2", "fossielle"]))

# def convert_list_of_speeches_to_normalized_scores(speeches, keywords):
#     '''
        
    
#     '''
#     keywords = [keyword.lower() for keyword in keywords]
#     list_of_counts = convert_list_of_speeches_to_list_of_keyword_counts(speeches, keywords)
#     # ensure all counts have a value between 0 and 1
#     max_count = max(list_of_counts)
#     min_count = min(list_of_counts)
#     denom = max_count - min_count 
#     if denom == 0:
#         denom = 1
#     normalized_counts = [(x - min_count) / (denom) for x in list_of_counts]
#     return normalized_counts



# convert_list_of_speeches_to_normalized_scores(['klimaatverandering is erg', 
#                                                'Pilkes is geen Pickle. Toch.', 
#                                                'Minder CO2! Minder CO2! Fossiele brandstoffen zijn stom.'], 
#                                               keywords=['klimaatverandering', "CO2", "fossiele"])

In [None]:
# from nltk.stem import PorterStemmer
# from nltk.tokenize import word_tokenize
# from nltk.tokenize import RegexpTokenizer
# import matplotlib.pyplot as plt
# import pandas as pd
# import numpy as np


# def create_target_value_matrix(countries, years, keywords, stemmer=PorterStemmer(), tokenizer=RegexpTokenizer(r'\w+')) -> np.array:
#     df = create_prepoccesed_df(countries=countries, years=years, keywords=keywords, stemmer=stemmer, tokenizer=tokenizer)
#     result = []
#     for country in countries:
#         for year in years:
#             row = df[(df['country'] == country) & (df['year'] == year)]
#             if len(row) != 1:
#                 print("this should not happen")
#             result.append(row["year_score"])
#     return np.array(result)


# def create_prepoccesed_df(countries, years, keywords, stemmer=PorterStemmer(), tokenizer=RegexpTokenizer(r'\w+')):
#     print("prepoccesing df")
#     df = pd.read_pickle("../../data/DF_UNsession_rawtxt_per_country_from1990.pkl")
#     ## filter df for countries and years
#     df = df[df["country"].isin(countries)]
#     df = df[df["year"].isin(years)]
#     df = create_stemmed_df(df, stemmer, tokenizer)
#     # df = add_stemmed_df_year(df, stemmer, tokenizer)
#     df = add_keyword_counts_column(df, keywords, stemmer, tokenizer)
#     df = add_year_score_column(df, stemmer, tokenizer)
#     return df
         
# def create_stemmed_df(df, stemmer=PorterStemmer(), tokenizer=RegexpTokenizer(r'\w+')):
#     df_tokenized_per_country = pd.DataFrame()
#     for row in df.iterrows():
#         country = row[1]["country"]
#         year = row[1]["year"]
#         txt = row[1]["txt"]

#         txt_tokenized = stem_tokenizer(txt, stemmer, tokenizer)
        
#         # Create a new row DataFrame with year and country and the tokenized text
#         new_row = pd.DataFrame({"country":[country], "year": [year], "txt_stemmed": [txt_tokenized]})
        
#         # Concatenate the new row DataFrame to df_tokenized_per_year
#         df_tokenized_per_country = pd.concat([df_tokenized_per_country, new_row], ignore_index=True)
#     return df_tokenized_per_country         
         
# def add_year_score_column(df, stemmer=PorterStemmer(), tokenizer=RegexpTokenizer(r'\w+')):
#     print("adding year score column to df")
#     for year in df["year"].unique():
#         df_year = df.loc[df["year"]==year]
#         max_count_year = df_year["keyword_counts"].max()
#         min_count_year = df_year["keyword_counts"].min()
#         # for each country, add normalized score columm to df for that year
#         df.loc[df["year"]==year, "year_score"] = df_year["keyword_counts"].apply(lambda x: normalize_score_min_max(x, min_count_year, max_count_year))
#     return df
              

# def add_keyword_counts_column(df, keywords, stemmer=PorterStemmer(), tokenizer=RegexpTokenizer(r'\w+')):
#     '''
#     Returns a dataframe with a column containing the stemmed text of the original dataframe.

#             Parameters:
#                     df (pd.DataFrame): The dataframe containing the disaster data
#                     stemmer (PorterStemmer): The stemmer to use
#                     tokenizer (RegexpTokenizer): The tokenizer to use

#             Returns:
#                     df (pd.DataFrame): The dataframe containing the disaster data with an extra column containing the stemmed text
#     '''
#     keywords = [stem_tokenizer(keyword, stemmer=stemmer, tokenizer=tokenizer) for keyword in keywords]

#     print("adding keyword counts column to df by matching keywords in stemmed stemmed text")
#     df['keyword_counts'] = df['txt_stemmed'].apply(lambda x: count_keywords_in_text(keywords, x))
#     return df


# def build_tokenized_text(df, country, year, tokenizer=RegexpTokenizer(r'\w+'), stemmer=PorterStemmer()) -> str:
#     sub_df = df[(df['Country'] == country) & (df['Year'] == year)]
#     df_of_year = df.loc[df["year"]==year]
#     corpus_of_year = ' '.join(sub_df.txt)
#     corpus_tokanized = stem_tokenizer(corpus_of_year, stemmer, tokenizer)
    

# def stem_tokenizer(txt, stemmer, tokenizer):
#     txt = tokenizer.tokenize(txt.lower())
#     txt = [stemmer.stem(word) for word in txt]
#     txt = ' '.join(txt)
#     return txt

# def count_keywords_in_text(keywords, text):
#     count = 0
#     for keyword in keywords:
#         count  += text.count(keyword)
#     return count

# # def stem_counter_multiple_keywords(keywords, corpus, stemmer, tokenizer):
# #     count = 0
# #     for keyword in keywords:
# #         count  += stem_counter(keyword, corpus, stemmer, tokenizer)
# #     return count
        

# # def stem_counter(keyword, corpus, stemmer, tokenizer):

# #     keyword = stem_tokenizer(keyword, stemmer, tokenizer)

# #     corpus = stem_tokenizer(corpus, stemmer, tokenizer)

# #     return corpus.count(keyword)


# def normalize_score_min_max(score, min_count, max_count):
#     if max_count == min_count:
#         denom = 0
#     else :
#         denom = max_count - min_count
#     return (score - min_count) / denom

In [None]:
# pandas is otherwise only imported inside a commented-out cell; import it here so this
# cell also runs on a fresh kernel (harmless if `pd` is already in scope).
import pandas as pd

# Load the per-country UN session texts (presumably the speeches; verify against the data).
# NOTE: unpickling executes arbitrary code, so only load this file from a trusted source.
# NOTE(review): this rebinds `df` and `countries`, which earlier cells use for the disaster
# data (columns 'Country'/'Year'); re-running those cells after this one will not work.
df = pd.read_pickle("../../data/DF_UNsession_rawtxt_per_country_from1990.pkl")
countries = df['country'].unique()
years = df['year'].unique()
print(countries, years)

# test = create_prepoccesed_df(countries, [2009, 2010], keywords=["climate", "CO2"])
# test = create_target_value_matrix(countries[:3], years[20009, 2010], ["climate", "CO2"])

In [None]:
# NOTE(review): in this notebook create_target_value_matrix is only defined inside a
# commented-out cell, so this raises NameError on a fresh kernel unless the function is
# provided elsewhere (e.g. by the star import from utils.disaster_data_utils) - confirm.
T = create_target_value_matrix(countries, [2009, 2010], ["climate", "CO2"])

In [None]:
def create_training_and_target_value_matrices(countries, years, keywords, stemmer=PorterStemmer(), tokenizer=RegexpTokenizer(r'\w+')) -> np.array:
   