In [1]:
import pandas as pd
import datetime

In [2]:
# Load the merged dispute data and reduce both date columns to plausible
# integer years.
#
# Exclusive (lower, upper) year bounds per date column. Values outside the
# bounds are treated as error codes in the date fields and the row is dropped.
YEAR_BOUNDS = {'start': (1945, 1992), 'end': (1945, 2020)}

midi = pd.read_csv('./sources/merged_midi.csv')

for column, (lower, upper) in YEAR_BOUNDS.items():
    # The first four characters of the raw value are taken as the year.
    # Anything that does not parse as a number becomes NaN, fails the range
    # test below and is dropped together with the error codes instead of
    # aborting the whole cell.
    years = pd.to_numeric(midi[column].astype(str).str[:4], errors='coerce')

    # Remove error codes from datefields
    mask = (years > lower) & (years < upper)

    # Take an explicit copy so the column assignment below writes to an
    # independent frame rather than to a slice of the previous one.
    midi = midi[mask].copy()
    midi[column] = years[mask].astype(int)

In [3]:
# Keep just the fields that the rest of the notebook works with
midi = midi.loc[:, ['intervener', 'target', 'start', 'end']]

# Number of years spanned, counting both the start and the end year
midi = midi.assign(duration=midi['end'] - midi['start'] + 1)

# Drop rows whose duration is implausibly large (error-code end dates)
midi = midi.loc[midi['duration'] < 5000]

In [4]:
# Turn the integer years into datetime64 timestamps; with format='%Y' only the
# year is parsed, so each value lands on January 1st of that year.
for year_col in ('start', 'end'):
    midi[year_col] = pd.to_datetime(midi[year_col], format='%Y')

# Summary of the resulting frame (row count and dtypes)
midi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 767 entries, 0 to 779
Data columns (total 5 columns):
intervener    767 non-null int64
target        767 non-null int64
start         767 non-null datetime64[ns]
end           767 non-null datetime64[ns]
duration      767 non-null int64
dtypes: datetime64[ns](2), int64(3)
memory usage: 36.0 KB


In [5]:
# Expand every dispute into one row per calendar year it covers, from its
# start year through its end year (both inclusive). `start` and `end` are
# expected to be timestamps here, as produced by the conversion cell above.
#
# Years are enumerated as plain integers: this avoids building one DataFrame
# per dispute, avoids date-frequency aliases, and avoids comparing a
# datetime64 column with a `datetime.date` object (rejected by recent pandas).

# remove dates after cold war: 1991 is the last year kept
last_year = 1991

yearly_rows = [
    (year, row.target, row.intervener)
    for row in midi.itertuples(index=False)
    # An end year before the start year yields an empty range, i.e. no rows.
    for year in range(row.start.year, min(row.end.year, last_year) + 1)
]

# Built in one go, so the frame has a fresh unique index and `date` already
# holds the integer year.
midi = pd.DataFrame(yearly_rows, columns=['date', 'target', 'intervener'])

# Remove all interventions we dont care about.
# These are the intervener codes that are labelled US, UK, NL, BE, FR, SP, PT
# and SU when the indicator columns are renamed later in the notebook.
nations = [2, 200, 210, 211, 220, 230, 235, 365]
mask = midi['intervener'].isin(nations)
midi = midi[mask]

# One 0/1 indicator column per remaining intervener code; the column names are
# the codes as strings ('2', '200', ...).
dummies = midi['intervener'].astype('str').str.get_dummies()
midi = pd.concat([midi, dummies], axis=1)

In [6]:
# Collapse the frame to a single row per (target, year) pair. Taking the
# column-wise maximum keeps an indicator at 1 if any row of the pair had it set.
target_year_groups = midi.groupby(['target', 'date'])
midi = target_year_groups.max()

In [7]:
# Give the per-intervener indicator columns readable names, add a combined
# indicator and write the result to disk.

# Explicit code -> label mapping (the codes are the dummy column names).
# Renaming by name instead of by position keeps the labels correct regardless
# of column order.
midi_labels = {
    '2': 'US_midi',
    '200': 'UK_midi',
    '210': 'NL_midi',
    '211': 'BE_midi',
    '220': 'FR_midi',
    '230': 'SP_midi',
    '235': 'PT_midi',
    '365': 'SU_midi',
}
midi = midi.rename(columns=midi_labels)

# Keep exactly the labelled indicators, in a fixed order. An intervener that
# never appears in the data gets an all-zero column instead of making the cell
# fail; the helper column 'intervener' is dropped by this step.
midi = midi.reindex(columns=list(midi_labels.values()), fill_value=0)

# Bring the grouping keys ('target', 'date') back as ordinary columns
midi = midi.reset_index()

# COL_MIDI is 1 when at least one of the listed indicators is set for the
# target-year (presumably the colonial powers -- confirm), otherwise 0.
col_list = ['UK_midi', 'NL_midi', 'BE_midi', 'FR_midi', 'SP_midi', 'PT_midi']
midi['COL_MIDI'] = (midi[col_list].sum(axis=1) > 0).astype(int)

# Save to CSV
midi.to_csv('./output/MIDI_prepared.csv')