In [1]:
import pandas as pd
import statsmodels.api as sm
import datetime
import sklearn as sk
import numpy as np
import seaborn as sn

# Load Wimmer dataset to use as base df

In [2]:
# Read the Wimmer dataset that serves as the base frame
wim = pd.read_csv('WimmerMin.csv')
# Cold War window only: years strictly after 1945 and strictly before 1992
mask = (wim.year > 1945) & (wim.year < 1992)
# Keep one row per (year, cowcode); the original note says repeated rows show up
# once the war-level columns are ignored, so the first occurrence is retained.
wim = wim[mask].drop_duplicates(subset=['year', 'cowcode'], keep='first')

### Add US political information (Dummies for presidents and parties)

In [3]:
# Load the US presidents table and parse each term's start date
pres = pd.read_csv('us_presidents.csv')
pres['start'] = pd.to_datetime(pres['start'])

# Keep presidents whose term began inside the Cold War window. The bounds are
# Timestamps (not datetime.date) so the comparison with the datetime64 column
# is well defined on every pandas version.
in_window = (pres['start'] > pd.Timestamp('1945-01-01')) & (pres['start'] < pd.Timestamp('1992-01-01'))
# .copy() gives an independent frame, so the assignments below act on `pres` itself
pres = pres.loc[in_window, ['start', 'president', 'party']].copy()
pres['start'] = pres['start'].dt.year

# Set Truman start to 1946 for data reasons (the base data only keeps years > 1945).
# Assumes the first remaining row is Truman, as the original `pres[:1]` did.
# The write goes through .iloc on `pres`: `pres[:1].start = ...` assigned to a
# temporary slice (SettingWithCopyWarning) and is not guaranteed to reach `pres`.
pres.iloc[0, pres.columns.get_loc('start')] += 1

# One dummy column per president and per party
dummy_pres = pres['president'].str.get_dummies()
dummy_party = pres['party'].str.get_dummies()
pres = pd.concat([pres, dummy_pres, dummy_party], axis=1)

pres_cols = [
    'Dwight D. Eisenhower', 'George H. W. Bush',
    'Gerald Ford', 'Harry S. Truman', 'Jimmy Carter',
    'John F. Kennedy', 'Lyndon B. Johnson', 'Richard Nixon',
    'Ronald Reagan', 'Democratic', 'Republican',
]
pres = pres[['start'] + pres_cols]

# Expand to one row per calendar year and forward-fill the dummies *by year*.
# Filling here, rather than on the merged country-year rows, makes the result
# independent of row order and prevents values leaking from one country's
# last rows into the next country's first rows.
first_year, last_year = int(wim['year'].min()), int(wim['year'].max())
pres_by_year = (
    pres.set_index(pres['start'].rename('year'))
    .reindex(range(first_year, last_year + 1))
)
pres_by_year[pres_cols] = pres_by_year[pres_cols].ffill()
# 'start' is left unfilled, so it is only populated in the years a term begins
pres_by_year = pres_by_year.rename_axis('year').reset_index()

wim = wim.merge(pres_by_year, on='year', how='left')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


### Add GDP and CINC scores, both as target and dummy variables

In [4]:
# Country name -> numeric country code for the colonial powers
col_powers = {
    'United Kingdom': 200,
    'Netherlands': 210,
    'Belgium': 211,
    'France': 220,
    'Spain': 230,
    'Portugal': 235,
}

# Get CINC scores for target nations: Cold War years only, and just the
# identifier columns plus military expenditure and the CINC score.
cinc = pd.read_csv('NMC.csv')
mask = (cinc['year'] < 1992) & (cinc['year'] > 1945)
cinc = cinc.loc[mask, ['ccode', 'year', 'milex', 'cinc']]

def gdp_globals(code, name, frame=None):
    """Extract one country's GDP series with country-prefixed column names.

    Parameters
    ----------
    code : value compared against the 'cowcode' column to select the country.
    name : str prefix for the output columns (e.g. 'US' -> 'US_gdppc').
    frame : optional DataFrame holding 'cowcode', 'gdppc', 'gdppcl' and 'year'
        columns. Defaults to the notebook-level `wim`, which keeps existing
        two-argument calls working unchanged.

    Returns
    -------
    DataFrame with the selected rows' original index moved into a leading
    column (named 'index' for an unnamed index) followed by '<name>_gdppc',
    '<name>_gdppcl' and '<name>_year'.
    """
    source = wim if frame is None else frame
    selected = source.loc[source['cowcode'] == code, ['gdppc', 'gdppcl', 'year']]
    return selected.add_prefix(name + '_').reset_index()

def cinc_globals(code, name, frame=None):
    """Extract one country's military-expenditure and CINC series with prefixed column names.

    Parameters
    ----------
    code : value compared against the 'ccode' column to select the country.
    name : str prefix for the output columns (e.g. 'US' -> 'US_cinc').
    frame : optional DataFrame holding 'ccode', 'milex', 'cinc' and 'year'
        columns. Defaults to the notebook-level `cinc`, which keeps existing
        two-argument calls working unchanged.

    Returns
    -------
    DataFrame with the selected rows' original index moved into a leading
    column (named 'index' for an unnamed index) followed by '<name>_milex',
    '<name>_cinc' and '<name>_year'.
    """
    source = cinc if frame is None else frame
    selected = source.loc[source['ccode'] == code, ['milex', 'cinc', 'year']]
    return selected.add_prefix(name + '_').reset_index()
    
# (country code, column prefix) for every power whose GDP and CINC series are
# attached to each country-year row. The order fixes the output column order.
power_prefixes = [
    (2, 'US'), (365, 'SU'), (200, 'UK'), (210, 'NL'),
    (211, 'BE'), (220, 'FR'), (230, 'SP'), (235, 'PR'),
]

# Add country specific GDP variables, each frame indexed by its own year column.
# The helper 'index' column produced by gdp_globals/cinc_globals is dropped.
gdp_frames = [
    gdp_globals(code, prefix).drop('index', axis=1).set_index(prefix + '_year')
    for code, prefix in power_prefixes
]
# Same for military expenditure and CINC scores
cinc_frames = [
    cinc_globals(code, prefix).drop('index', axis=1).set_index(prefix + '_year')
    for code, prefix in power_prefixes
]

# axis=1 concat aligns rows on the year index. The previous version lined the
# frames up by row position, which silently assumed every country had the same
# years in the same order.
gdp_merged = pd.concat(gdp_frames, axis=1)
cinc_merged = pd.concat(cinc_frames, axis=1)
gpd_cinc_merged = pd.concat([gdp_merged, cinc_merged], axis=1)

# Expose the shared year as a single 'US_year' key column, in the position it
# had before (third), so the merged column layout stays the same.
gpd_cinc_merged.insert(2, 'US_year', gpd_cinc_merged.index)
gpd_cinc_merged = gpd_cinc_merged.reset_index(drop=True)

# Merge new dataframe into Wimmers dataset
wim = wim.merge(gpd_cinc_merged, left_on='year', right_on='US_year', how='left')
# Add cinc and milex variables for individual nations as well.
wim = wim.merge(cinc, left_on=['year', 'cowcode'], right_on=['year', 'ccode'], how='left')
# Save the file
wim.to_csv('WimmerPrepared_2.csv')

# Merge UCDP external support into the BaseDataset

In [5]:
exsup = pd.read_csv('EXSUP_PREPARED.csv')
# reset_index() adds the row number as an 'index' column; it is kept so the
# merged frame carries the same columns as before.
exsup = exsup.reset_index()
merged = wim.merge(exsup, left_on=['cowcode', 'year'], right_on=['locationid1', 'ywp_year'], how='left')
# Country-years without a match in the external-support data get
# external_exists = 0 instead of NaN. The result is assigned back because an
# inplace fillna on a selected column may only modify a temporary object.
merged['external_exists'] = merged['external_exists'].fillna(0)


# Merge the MIDI intervention dataset into the BaseDataset

In [6]:
midi = pd.read_csv('MIDI_prepared.csv')
base_df = merged.merge(midi, left_on=['cowcode', 'year'], right_on=['target', 'date'], how='left')
# Country-years with no MIDI record get 0 in every MIDI column
base_df[midi.columns] = base_df[midi.columns].fillna(0)

# Fill NaN first: these dummies are combined with the MIDI flags afterwards.
# A single assigned fillna replaces eight chained `inplace=True` calls, which
# may act on a temporary column object and leave base_df unchanged.
intervener_cols = [
    'United States', 'Soviet Union', 'United Kingdom', 'France',
    'Netherlands', 'Belgium', 'Spain', 'Portugal',
]
base_df[intervener_cols] = base_df[intervener_cols].fillna(0)


# Edit existing US/SU (and FR/UK/BE) dummies: set to 1 when the matching *_midi flag is 1
def US_checker(x):
    """Return 1 when the row's 'US_midi' equals 1, otherwise its 'United States' value."""
    return 1 if x['US_midi'] == 1 else x['United States']

def SU_checker(x):
    """Return 1 when the row's 'SU_midi' equals 1, otherwise its 'Soviet Union' value."""
    # Guard clause: without the MIDI flag the existing dummy is passed through
    if not x['SU_midi'] == 1:
        return x['Soviet Union']
    return 1
    
def FR_checker(x):
    """Return 1 when the row's 'FR_midi' equals 1, otherwise its 'France' value."""
    midi_flagged = x['FR_midi'] == 1
    if midi_flagged:
        return 1
    return x['France']

def UK_checker(x):
    """Return 1 when the row's 'UK_midi' equals 1, otherwise its 'United Kingdom' value."""
    return 1 if x['UK_midi'] == 1 else x['United Kingdom']

def BE_checker(x):
    """Return 1 when the row's 'BE_midi' equals 1, otherwise its 'Belgium' value."""
    # Guard clause: without the MIDI flag the existing dummy is passed through
    if not x['BE_midi'] == 1:
        return x['Belgium']
    return 1

# Overwrite each intervener dummy with the row-wise result of its checker
for column, checker in (
    ('United States', US_checker),
    ('Soviet Union', SU_checker),
    ('United Kingdom', UK_checker),
    ('France', FR_checker),
    ('Belgium', BE_checker),
):
    base_df[column] = base_df.apply(checker, axis=1)

# # Recode dummies (Recheck later)
# COL_MIDI is 1 when the row sum over the colonial-power dummies is positive, else 0
col_list = ['United Kingdom', 'France', 'Belgium', 'Netherlands', 'Spain', 'Portugal']
base_df['COL_MIDI'] = base_df[col_list].sum(axis=1).gt(0) * 1

# Save to CSV
base_df.to_csv('masterfile.csv')

### The full list of variables that are present in the Dataset

In [7]:
# List every column of the assembled frame, one name per line
for column_name in base_df.columns:
    print(column_name)

yearc
year
cowcode
country
onset
war
warname
warno
wartype
yrbeg
yrend
anarc
anarcl
anoc
anocl
area2001
asia
autoc
autocl
democ
democl
eeurop
ethfrac
gdp
gdppc
gdppcl
implag
imppower
instab
instabl
lamerica
lmtnest
lnpop
lnpopl
milperc
milpercl
nafrme
nbcivil
nbconq
nbinter
nbnatind
nbnonind
nsflag
nsfyear
ocivil
oconq
oil
oilpc
oilpcl
ointer
ointrap
onatind
ononind
pdemnb
pocivil
poconq
pointer
pointrap
poldisc
poldiscl
ponatind
pononind
ponset
pop
relfrac
ssafrica
western
start
Dwight D. Eisenhower
George H. W. Bush
Gerald Ford
Harry S. Truman
Jimmy Carter
John F. Kennedy
Lyndon B. Johnson
Richard Nixon
Ronald Reagan
Democratic
Republican
US_gdppc
US_gdppcl
US_year
SU_gdppc
SU_gdppcl
UK_gdppc
UK_gdppcl
NL_gdppc
NL_gdppcl
BE_gdppc
BE_gdppcl
FR_gdppc
FR_gdppcl
SP_gdppc
SP_gdppcl
PR_gdppc
PR_gdppcl
US_milex
US_cinc
SU_milex
SU_cinc
UK_milex
UK_cinc
NL_milex
NL_cinc
BE_milex
BE_cinc
FR_milex
FR_cinc
SP_milex
SP_cinc
PR_milex
PR_cinc
ccode
milex
cinc
index
locationid1
ywp_year
Belgium
Fra

In [8]:
# Print a summary of the assembled frame (entries, column count, dtypes, memory usage)
base_df.info()

# mask = merged.year > 1975
# merged = merged[mask]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6900 entries, 0 to 6899
Columns: 180 entries, yearc to COL_MIDI
dtypes: float64(132), int64(39), object(9)
memory usage: 9.5+ MB


### Targets political system

In [12]:
# Model 'United States' on the three political-system columns.
# NOTE(review): `run` is defined outside this excerpt — confirm it is defined before this cell on a fresh kernel.
print(run(['democl', 'anocl', 'autocl'], 'United States'))

Optimization terminated successfully.
         Current function value: 0.655723
         Iterations 4
(<class 'statsmodels.iolib.summary.Summary'>
"""
                           Logit Regression Results                           
Dep. Variable:          United States   No. Observations:                  485
Model:                          Logit   Df Residuals:                      482
Method:                           MLE   Df Model:                            2
Date:                Sat, 26 May 2018   Pseudo R-squ.:                0.004157
Time:                        19:04:47   Log-Likelihood:                -318.03
converged:                       True   LL-Null:                       -319.35
                                        LLR p-value:                    0.2651
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
democl        -0.6419      0.195     -3.286      0.001     

#### The US is slightly less likely to intervene in democratic nations; however, this hesitance disappears completely once a Soviet intervention is ongoing

In [13]:
# Same model as the previous cell with the 'Soviet Union' dummy added as a regressor.
# NOTE(review): `run` is defined outside this excerpt — confirm it is defined before this cell on a fresh kernel.
print(run(['democl', 'anocl', 'autocl', 'Soviet Union'], 'United States'))

Optimization terminated successfully.
         Current function value: 0.649583
         Iterations 4
(<class 'statsmodels.iolib.summary.Summary'>
"""
                           Logit Regression Results                           
Dep. Variable:          United States   No. Observations:                  485
Model:                          Logit   Df Residuals:                      481
Method:                           MLE   Df Model:                            3
Date:                Sat, 26 May 2018   Pseudo R-squ.:                 0.01348
Time:                        19:04:47   Log-Likelihood:                -315.05
converged:                       True   LL-Null:                       -319.35
                                        LLR p-value:                   0.03493
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
democl          -0.7006      0.198     -3.545      0.00