In [272]:
import pandas as pd
import statsmodels.api as sm
import datetime
import sklearn as sk


# Load Wimmer dataset to use as base df

In [39]:
wim = pd.read_csv('WimmerMin.csv')
# Remove all non-coldwar entries
mask = wim.year > 1945 
wim = wim[mask]
mask = wim.year < 1992
wim = wim[mask]
# Wimmers war data, when ignored, will cause duplicate rows to appear, remove them
wim.drop_duplicates(subset=['year', 'cowcode'], keep='first', inplace=True)


### Add US political information (Dummies for presidents and parties)

In [40]:
# Convert start to datetime
pres = pd.read_csv('us_presidents.csv')
pres['start'] = pd.to_datetime(pres['start'])
# Filter out non relevant presidents
mask = pres['start'] > datetime.date(1945, 1, 1)
pres = pres[mask]
mask = pres['start'] < datetime.date(1992, 1, 1)
pres = pres[mask]
pres['start'] = pres.start.dt.year
# Set turman start to 1946 for data reasons
pres = pres[['start', 'president', 'party']]
pres[:1].start = pres[:1].start +1
dummy_pres = pres['president'].str.get_dummies()
dummy_party = pres['party'].str.get_dummies()
pres = pd.concat([pres, dummy_pres], axis=1)
pres = pd.concat([pres, dummy_party], axis=1)
pres = pres[['start', 'Dwight D. Eisenhower', 'George H. W. Bush', 
'Gerald Ford', 'Harry S. Truman', 'Jimmy Carter',
'John F. Kennedy', 'Lyndon B. Johnson', 'Richard Nixon',
'Ronald Reagan', 'Democratic', 'Republican']]
    
wim = wim.merge(pres, left_on='year', right_on='start', how='left')
wim[[
    'Dwight D. Eisenhower', 'George H. W. Bush', 
    'Gerald Ford', 'Harry S. Truman', 'Jimmy Carter',
    'John F. Kennedy', 'Lyndon B. Johnson', 'Richard Nixon',
    'Ronald Reagan', 'Democratic', 'Republican']] = wim[
    [
        'Dwight D. Eisenhower', 'George H. W. Bush', 
        'Gerald Ford', 'Harry S. Truman', 'Jimmy Carter',
        'John F. Kennedy', 'Lyndon B. Johnson', 'Richard Nixon',
        'Ronald Reagan', 'Democratic', 'Republican'
    ]].ffill()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


### Add US / SU GDP and CINC scores

In [41]:
# Add GDP related variables
# Split of both countries
us_only = wim['cowcode'] == 2
su_only = wim['cowcode'] == 365

rus_gdp = wim[su_only]
rus_gdp = rus_gdp[['gdppc', 'gdppcl', 'year']]
rus_gdp.columns = ['SU_gdppc', 'SU_gdppcl', 'year']

us_gdp = wim[su_only]
us_gdp = us_gdp[['gdppc', 'gdppcl', 'year']]

us_gdp
us_gdp.columns = ['US_gdppc', 'US_gdppcl', 'year']

gdp_merged = pd.concat([rus_gdp, us_gdp], axis=1)
gdp_merged 


# Get CINC scores
cinc = pd.read_csv('NMC.csv')
mask = cinc['year'] < 1992
cinc = cinc[mask]
mask = cinc['year'] > 1945
cinc = cinc[mask]
cinc = cinc[['ccode', 'year', 'milex', 'cinc']]

# Get MIl_Ex as sperate variable, 
# (as the Soviet Union had a large army despite a lagging economy)
# Again, split of both countries
us_only = cinc['ccode'] == 2
su_only = cinc['ccode'] == 365

rus_cinc = cinc[su_only]
rus_cinc = rus_cinc[['milex', 'cinc', 'year']]
rus_cinc.columns = ['SU_milex', 'SU_cinc', 'year']

us_cinc = cinc[us_only]
us_cinc = us_cinc[['milex', 'cinc', 'year']]
us_cinc.columns = ['US_milex', 'US_cinc', 'year']

rus_cinc.reset_index(inplace=True)
us_cinc.reset_index(inplace=True)  

cinc_merged = pd.concat([rus_cinc, us_cinc], axis=1)
cinc_merged = cinc_merged[['SU_milex', 'SU_cinc', 'US_milex', 'US_cinc', 'year']]
cinc_merged.reset_index(inplace=True)
gdp_merged.reset_index(inplace=True)


gpd_cinc_merged = pd.concat([gdp_merged, cinc_merged], axis=1)

# Remove duplicate columns. 
c = pd.Index(['SU_gdppc', 'SU_gdppcl', 'year', 'US_gdppc', 'US_gdppcl',
       'year', 'SU_milex', 'SU_cinc', 'US_milex', 'US_cinc', 'year',
       'year'])

gpd_cinc_merged = gpd_cinc_merged.loc[:, ~gpd_cinc_merged.columns.duplicated()]

#Merge new dataframe into Wimmers dataset
wim = wim.merge(gpd_cinc_merged, left_on='year', right_on='year', how='left')

# Add cinc and milex variables for indivual nations as well. 
wim = wim.merge(cinc, left_on=['year', 'cowcode'], right_on=['year', 'ccode'], how='left')

#Save the file
wim.to_csv('WimmerPrepared.csv')


# Prepare external support DF

## WARNING - This block contains a very slow groupby operation, which will take +/- 15 minutes

In [30]:
exsup = pd.read_csv('exsup.csv')
# Remove all non-coldwar entries
mask = exsup['ywp_year'] < 1992
exsup = exsup[mask]
#remove non US / SU interventions
dummies = exsup['external_name'].str.get_dummies(sep=';')
dummies = dummies[['United States', 'Soviet Union']]
exsup = pd.concat([exsup, dummies], axis=1)
# Drop duplicates
exsup.drop_duplicates(inplace=True)

# Add support specific dummies
old = ['external_type__X', 'external_type__L', 'external_type__Y',
         'external_type__W', 'external_type__M', 'external_type__T',
         'external_type__$', 'external_type__I', 'external_type__O', 
         'external_type__U']
new = [
    'troops', 'joint_ops', 'terr', 'weapons', 'logistics',
    'training', 'eco', 'intel', 'other', 'unknown'
]

us_support_cols = ['US_' + x for x in old]
su_support_cols = ['SU_' + x for x in old]

for col in us_support_cols:
    exsup[col] = 0
for col in su_support_cols:
    exsup[col] = 0
   

# # Build dict to rename columns
# coldict = {}
# index = 0
# for col in old:    
#     coldict[col] = new[index]
#     index +=1
# # Rename columns
# exsup.rename(coldict, inplace=True)


for index, row in exsup.iterrows():
    if row['Soviet Union'] == 1:
        nation = 'SU_'
        for col in old:
            exsup[index: index + 1][nation + col] = int(row[col])
    if row['United States'] == 1:
        nation = 'US_'
        for col in old:
            exsup[index: index + 1][nation + col] = int(row[col])
            

# Save to csv
exsup.to_csv('export.csv')
exsup_backup = exsup
# Merge dataframe per year-country combination
exsup = exsup.groupby(['locationid1', 'ywp_year']).max()
exsup.to_csv('exsup_merged.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## Merge UCDP external support into the BaseDataset

In [42]:
# Reset index so 'locationid1' becomes available again. 
exsup.reset_index(inplace=True)
# wim.reset_index(inplace=True)

In [51]:
merged = wim.merge(exsup, left_on=['cowcode', 'year'], right_on=['locationid1', 'ywp_year'], how='left')
# External exists is a value that we dont want Null values for, ffill them. 
merged['external_exists'].fillna(value=0, inplace=True)

# # Add dummies for US and RU interventions
# intervention_dummies = merged['external_name'].str.get_dummies()
# merged = pd.concat([merged, intervention_dummies], axis=1)

mask = merged.year > 1975
merged = merged[mask]
merged.to_csv('masterfile.csv')

### The full list of variables that are present in the Dataset

In [52]:
for x in merged.columns:
    print(x)

yearc
year
cowcode
country
onset
war
warname
warno
wartype
yrbeg
yrend
anarc
anarcl
anoc
anocl
area2001
asia
autoc
autocl
democ
democl
eeurop
ethfrac
gdp
gdppc
gdppcl
implag
imppower
instab
instabl
lamerica
lmtnest
lnpop
lnpopl
milperc
milpercl
nafrme
nbcivil
nbconq
nbinter
nbnatind
nbnonind
nsflag
nsfyear
ocivil
oconq
oil
oilpc
oilpcl
ointer
ointrap
onatind
ononind
pdemnb
pocivil
poconq
pointer
pointrap
poldisc
poldiscl
ponatind
pononind
ponset
pop
relfrac
ssafrica
western
start
Dwight D. Eisenhower
George H. W. Bush
Gerald Ford
Harry S. Truman
Jimmy Carter
John F. Kennedy
Lyndon B. Johnson
Richard Nixon
Ronald Reagan
Democratic
Republican
index_x
SU_gdppc
SU_gdppcl
US_gdppc
US_gdppcl
SU_milex
SU_cinc
US_milex
US_cinc
ccode
milex
cinc
index_y
locationid1
ywp_year
SU_external_type__$
SU_external_type__I
SU_external_type__L
SU_external_type__M
SU_external_type__O
SU_external_type__T
SU_external_type__U
SU_external_type__W
SU_external_type__X
SU_external_type__Y
Soviet Union
US_external_

In [273]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2400 entries, 30 to 6899
Columns: 138 entries, yearc to ywp_name
dtypes: float64(97), int64(32), object(9)
memory usage: 2.6+ MB


In [60]:
def run(x, y):
    """
    A simple function that runs Logit regressions
    """
    logit = sm.Logit(merged[y], merged[x], missing='drop')
    results = logit.fit(missing='drop')
    return results.summary()
    
    
    

# Logit Regression Analysis

## Analysis 1: Relative economic and military power

In [386]:
print(run([
 'US_cinc',
 'US_milex',
 'SU_cinc',
 'SU_milex',
 'milex',
 'cinc',
 'Soviet Union',

],   
    
 'United States'))

Optimization terminated successfully.
         Current function value: 0.607859
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:          United States   No. Observations:                  463
Model:                          Logit   Df Residuals:                      456
Method:                           MLE   Df Model:                            6
Date:                Sat, 26 May 2018   Pseudo R-squ.:                 0.07704
Time:                        01:08:54   Log-Likelihood:                -281.44
converged:                       True   LL-Null:                       -304.93
                                        LLR p-value:                 1.884e-08
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
US_cinc        -25.1905     13.934     -1.808      0.071     -52.500       2.119
US_milex      6.602e-09

In [269]:
print(run([
 'US_cinc',
 'US_milex',
 'SU_cinc',
 'SU_milex',
 'milex',
 'cinc',
 'United States',
],   
    
 'Soviet Union'))

Optimization terminated successfully.
         Current function value: 0.475490
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:           Soviet Union   No. Observations:                  463
Model:                          Logit   Df Residuals:                      456
Method:                           MLE   Df Model:                            6
Date:                Sat, 26 May 2018   Pseudo R-squ.:                 0.05205
Time:                        00:31:24   Log-Likelihood:                -220.15
converged:                       True   LL-Null:                       -232.24
                                        LLR p-value:                 0.0004847
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
US_cinc         -12.3021     18.127     -0.679      0.497     -47.831      23.227
US_milex      -3.401

## Analysis 2: Political variables

### Targets political system

In [170]:
print(run([
 'democl',
 'anocl',
 'autocl', 
],   
    
 'United States'))

Optimization terminated successfully.
         Current function value: 0.657189
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:          United States   No. Observations:                  463
Model:                          Logit   Df Residuals:                      460
Method:                           MLE   Df Model:                            2
Date:                Fri, 25 May 2018   Pseudo R-squ.:                0.002140
Time:                        23:58:38   Log-Likelihood:                -304.28
converged:                       True   LL-Null:                       -304.93
                                        LLR p-value:                    0.5207
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
democl        -0.6131      0.199     -3.084      0.002      -1.003      -0.223
anocl         -0.5108      0.

#### The US is slightly less likely to intervene in democratic nations, however this hesitance completely once a Soviet Intervention is ongoing

In [378]:
print(run([
 'democl',
 'anocl',
 'autocl', 
 'Soviet Union'
],   
    
 'United States'))

Optimization terminated successfully.
         Current function value: 0.651079
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:          United States   No. Observations:                  463
Model:                          Logit   Df Residuals:                      459
Method:                           MLE   Df Model:                            3
Date:                Sat, 26 May 2018   Pseudo R-squ.:                 0.01142
Time:                        01:07:55   Log-Likelihood:                -301.45
converged:                       True   LL-Null:                       -304.93
                                        LLR p-value:                   0.07310
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
democl          -0.6733      0.201     -3.345      0.001      -1.068      -0.279
anocl           -0.6063

In [375]:
print(run([
 'democl',
 'anocl',
 'autocl', 
],   
    
 'Soviet Union'))

Optimization terminated successfully.
         Current function value: 0.497193
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:           Soviet Union   No. Observations:                  463
Model:                          Logit   Df Residuals:                      460
Method:                           MLE   Df Model:                            2
Date:                Sat, 26 May 2018   Pseudo R-squ.:                0.008782
Time:                        01:07:33   Log-Likelihood:                -230.20
converged:                       True   LL-Null:                       -232.24
                                        LLR p-value:                    0.1301
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
democl        -2.2073      0.318     -6.949      0.000      -2.830      -1.585
anocl         -1.6650      0.

#### Soviet Union seemed to be less inclined to attack democratic nations

In [227]:
print(run([
 'democl',
 'anocl',
 'autocl', 
 'United States'
],   
    
 'Soviet Union'))

Optimization terminated successfully.
         Current function value: 0.494888
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:           Soviet Union   No. Observations:                  463
Model:                          Logit   Df Residuals:                      459
Method:                           MLE   Df Model:                            3
Date:                Sat, 26 May 2018   Pseudo R-squ.:                 0.01338
Time:                        00:11:13   Log-Likelihood:                -229.13
converged:                       True   LL-Null:                       -232.24
                                        LLR p-value:                    0.1017
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
democl           -2.3363      0.332     -7.036      0.000      -2.987      -1.686
anocl            -1.

In [270]:
print(run([
 'democl',
 'anocl',
 'autocl', 
 'gdppc',
 'milex',
    'United States'
],   
    
 'Soviet Union'))

Optimization terminated successfully.
         Current function value: 0.489357
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:           Soviet Union   No. Observations:                  460
Model:                          Logit   Df Residuals:                      454
Method:                           MLE   Df Model:                            5
Date:                Sat, 26 May 2018   Pseudo R-squ.:                 0.02790
Time:                        00:31:48   Log-Likelihood:                -225.10
converged:                       True   LL-Null:                       -231.56
                                        LLR p-value:                   0.02414
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
democl           -1.5726      0.421     -3.732      0.000      -2.398      -0.747
anocl            -1.

### This might have something to do with the fact that democratic nations are often wealthier / stronger. So I added control variables for both economic power and military expenditure.

What is interesting is the fact that the hesitance for attacking democratic nations virtually disappears in both nations. Yet the general political effects remain higher in the Soviet Union. In constract, the United States react much stronger to the presence of Soviet Support

In [271]:
print(run([
 'democl',
 'anocl',
 'autocl', 
 'gdppc',
 'milex',
 'Soviet Union'
],   
    
 'United States'))

Optimization terminated successfully.
         Current function value: 0.615962
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:          United States   No. Observations:                  460
Model:                          Logit   Df Residuals:                      454
Method:                           MLE   Df Model:                            5
Date:                Sat, 26 May 2018   Pseudo R-squ.:                 0.06655
Time:                        00:31:53   Log-Likelihood:                -283.34
converged:                       True   LL-Null:                       -303.54
                                        LLR p-value:                 1.240e-07
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
democl          -0.7519      0.331     -2.273      0.023      -1.400      -0.104
anocl           -0.7470

## Type of war being fought

In [329]:
print(run([
    'pointrap',
    'pointer',
    'pocivil',
    'ointrap',
    'ointer',
    'ocivil',
    'Soviet Union',    
],   
    
 'United States'))

Optimization terminated successfully.
         Current function value: 0.673809
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:          United States   No. Observations:                  463
Model:                          Logit   Df Residuals:                      456
Method:                           MLE   Df Model:                            6
Date:                Sat, 26 May 2018   Pseudo R-squ.:                -0.02310
Time:                        00:52:28   Log-Likelihood:                -311.97
converged:                       True   LL-Null:                       -304.93
                                        LLR p-value:                     1.000
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
pointrap        -0.6997      0.299     -2.341      0.019      -1.286      -0.114
pointer         -1.2303

In [328]:
print(run([
    #'pointrap',
    'pointer',
    'pocivil',
    #'ointrap',
    'ointer',
    'ocivil',
    'United States'
    
],   
    
 'Soviet Union'))

#Singular matrix for excluded variables

Optimization terminated successfully.
         Current function value: 0.621412
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:           Soviet Union   No. Observations:                  463
Model:                          Logit   Df Residuals:                      458
Method:                           MLE   Df Model:                            4
Date:                Sat, 26 May 2018   Pseudo R-squ.:                 -0.2389
Time:                        00:52:19   Log-Likelihood:                -287.71
converged:                       True   LL-Null:                       -232.24
                                        LLR p-value:                     1.000
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
pointer          -1.6935      0.625     -2.709      0.007      -2.919      -0.468
pocivil          -0.

## New Nation

#### Note: Most colonial wars are not yet present in the Database

In [368]:
print(run([
    'nsflag',
    'ssafrica',
    'asia',
    'ponatind',  
],   
    
 'Soviet Union'))

Optimization terminated successfully.
         Current function value: 0.502419
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:           Soviet Union   No. Observations:                  463
Model:                          Logit   Df Residuals:                      459
Method:                           MLE   Df Model:                            3
Date:                Sat, 26 May 2018   Pseudo R-squ.:               -0.001636
Time:                        01:01:32   Log-Likelihood:                -232.62
converged:                       True   LL-Null:                       -232.24
                                        LLR p-value:                     1.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
nsflag        -0.0153      0.002     -7.067      0.000      -0.020      -0.011
ssafrica      -0.8226      0.

In [369]:
print(run([
    'nsflag',
    'ssafrica',
    'asia',
    'ponatind',  
],   
    
 'United States'))

Optimization terminated successfully.
         Current function value: 0.631766
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:          United States   No. Observations:                  463
Model:                          Logit   Df Residuals:                      459
Method:                           MLE   Df Model:                            3
Date:                Sat, 26 May 2018   Pseudo R-squ.:                 0.04074
Time:                        01:01:41   Log-Likelihood:                -292.51
converged:                       True   LL-Null:                       -304.93
                                        LLR p-value:                 1.663e-05
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
nsflag        -0.0050      0.001     -4.175      0.000      -0.007      -0.003
ssafrica      -0.9445      0.