In [26]:
import pandas as pd
import statsmodels.api as sm
import datetime
import sklearn as sk
%ls

Add_ints.ipynb            hm.csv              saved.csv
Adjjkd_ints.ipynb         late_20_merged.csv  target_df.csv
better_exsup.csv          LICENSE             [0m[01;34mThesis-Cold_War_Interventions[0m/
BuildBase.ipynb           maildit.csv         Untitled.ipynb
check.csv                 mancheck.csv        us_presidents.csv
export.csv                merged.csv          WimmerMin.csv
exsup.csv                 merged_midi.csv     WimmerPrepared.csv
exsup_merged.csv          NMC.csv             zwei.csv
GenerateMasterFile.ipynb  notebook.tex


# Load Wimmer dataset to use as base df

In [39]:
wim = pd.read_csv('WimmerMin.csv')
# Remove all non-coldwar entries
mask = wim.year > 1945 
wim = wim[mask]
mask = wim.year < 1992
wim = wim[mask]
# Wimmers war data, when ignored, will cause duplicate rows to appear, remove them
wim.drop_duplicates(subset=['year', 'cowcode'], keep='first', inplace=True)


### Add US political information (Dummies for presidents and parties)

In [40]:
# Convert start to datetime
pres = pd.read_csv('us_presidents.csv')
pres['start'] = pd.to_datetime(pres['start'])
# Filter out non relevant presidents
mask = pres['start'] > datetime.date(1945, 1, 1)
pres = pres[mask]
mask = pres['start'] < datetime.date(1992, 1, 1)
pres = pres[mask]
pres['start'] = pres.start.dt.year
# Set turman start to 1946 for data reasons
pres = pres[['start', 'president', 'party']]
pres[:1].start = pres[:1].start +1
dummy_pres = pres['president'].str.get_dummies()
dummy_party = pres['party'].str.get_dummies()
pres = pd.concat([pres, dummy_pres], axis=1)
pres = pd.concat([pres, dummy_party], axis=1)
pres = pres[['start', 'Dwight D. Eisenhower', 'George H. W. Bush', 
'Gerald Ford', 'Harry S. Truman', 'Jimmy Carter',
'John F. Kennedy', 'Lyndon B. Johnson', 'Richard Nixon',
'Ronald Reagan', 'Democratic', 'Republican']]
    
wim = wim.merge(pres, left_on='year', right_on='start', how='left')
wim[[
    'Dwight D. Eisenhower', 'George H. W. Bush', 
    'Gerald Ford', 'Harry S. Truman', 'Jimmy Carter',
    'John F. Kennedy', 'Lyndon B. Johnson', 'Richard Nixon',
    'Ronald Reagan', 'Democratic', 'Republican']] = wim[
    [
        'Dwight D. Eisenhower', 'George H. W. Bush', 
        'Gerald Ford', 'Harry S. Truman', 'Jimmy Carter',
        'John F. Kennedy', 'Lyndon B. Johnson', 'Richard Nixon',
        'Ronald Reagan', 'Democratic', 'Republican'
    ]].ffill()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


### Add US / SU GDP and CINC scores

In [41]:
# Add GDP related variables
# Split of both countries
us_only = wim['cowcode'] == 2
su_only = wim['cowcode'] == 365

rus_gdp = wim[su_only]
rus_gdp = rus_gdp[['gdppc', 'gdppcl', 'year']]
rus_gdp.columns = ['SU_gdppc', 'SU_gdppcl', 'year']

us_gdp = wim[su_only]
us_gdp = us_gdp[['gdppc', 'gdppcl', 'year']]

us_gdp
us_gdp.columns = ['US_gdppc', 'US_gdppcl', 'year']

gdp_merged = pd.concat([rus_gdp, us_gdp], axis=1)
gdp_merged 


# Get CINC scores
cinc = pd.read_csv('NMC.csv')
mask = cinc['year'] < 1992
cinc = cinc[mask]
mask = cinc['year'] > 1945
cinc = cinc[mask]
cinc = cinc[['ccode', 'year', 'milex', 'cinc']]

# Get MIl_Ex as sperate variable, 
# (as the Soviet Union had a large army despite a lagging economy)
# Again, split of both countries
us_only = cinc['ccode'] == 2
su_only = cinc['ccode'] == 365

rus_cinc = cinc[su_only]
rus_cinc = rus_cinc[['milex', 'cinc', 'year']]
rus_cinc.columns = ['SU_milex', 'SU_cinc', 'year']

us_cinc = cinc[us_only]
us_cinc = us_cinc[['milex', 'cinc', 'year']]
us_cinc.columns = ['US_milex', 'US_cinc', 'year']

rus_cinc.reset_index(inplace=True)
us_cinc.reset_index(inplace=True)  

cinc_merged = pd.concat([rus_cinc, us_cinc], axis=1)
cinc_merged = cinc_merged[['SU_milex', 'SU_cinc', 'US_milex', 'US_cinc', 'year']]
cinc_merged.reset_index(inplace=True)
gdp_merged.reset_index(inplace=True)


gpd_cinc_merged = pd.concat([gdp_merged, cinc_merged], axis=1)

# Remove duplicate columns. 
c = pd.Index(['SU_gdppc', 'SU_gdppcl', 'year', 'US_gdppc', 'US_gdppcl',
       'year', 'SU_milex', 'SU_cinc', 'US_milex', 'US_cinc', 'year',
       'year'])

gpd_cinc_merged = gpd_cinc_merged.loc[:, ~gpd_cinc_merged.columns.duplicated()]

#Merge new dataframe into Wimmers dataset
wim = wim.merge(gpd_cinc_merged, left_on='year', right_on='year', how='left')

# Add cinc and milex variables for indivual nations as well. 
wim = wim.merge(cinc, left_on=['year', 'cowcode'], right_on=['year', 'ccode'], how='left')

#Save the file
wim.to_csv('WimmerPrepared.csv')


# Prepare external support DF

## WARNING - This block contains a very slow groupby operation, which will take +/- 15 minutes

In [30]:
exsup = pd.read_csv('exsup.csv')
# Remove all non-coldwar entries
mask = exsup['ywp_year'] < 1992
exsup = exsup[mask]
#remove non US / SU interventions
dummies = exsup['external_name'].str.get_dummies(sep=';')
dummies = dummies[['United States', 'Soviet Union']]
exsup = pd.concat([exsup, dummies], axis=1)
# Drop duplicates
exsup.drop_duplicates(inplace=True)

# Add support specific dummies
old = ['external_type__X', 'external_type__L', 'external_type__Y',
         'external_type__W', 'external_type__M', 'external_type__T',
         'external_type__$', 'external_type__I', 'external_type__O', 
         'external_type__U']
new = [
    'troops', 'joint_ops', 'terr', 'weapons', 'logistics',
    'training', 'eco', 'intel', 'other', 'unknown'
]

us_support_cols = ['US_' + x for x in old]
su_support_cols = ['SU_' + x for x in old]

for col in us_support_cols:
    exsup[col] = 0
for col in su_support_cols:
    exsup[col] = 0
   

# # Build dict to rename columns
# coldict = {}
# index = 0
# for col in old:    
#     coldict[col] = new[index]
#     index +=1
# # Rename columns
# exsup.rename(coldict, inplace=True)


for index, row in exsup.iterrows():
    if row['Soviet Union'] == 1:
        nation = 'SU_'
        for col in old:
            exsup[index: index + 1][nation + col] = int(row[col])
    if row['United States'] == 1:
        nation = 'US_'
        for col in old:
            exsup[index: index + 1][nation + col] = int(row[col])
            

# Save to csv
exsup.to_csv('export.csv')
exsup_backup = exsup
# Merge dataframe per year-country combination
exsup = exsup.groupby(['locationid1', 'ywp_year']).max()
exsup.to_csv('exsup_merged.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## Merge UCDP external support into the BaseDataset

In [42]:
# Reset index so 'locationid1' becomes available again. 
exsup.reset_index(inplace=True)
# wim.reset_index(inplace=True)

In [36]:
wim.columns

Index(['level_0', 'yearc', 'year', 'cowcode', 'country', 'onset', 'war',
       'warname', 'warno', 'wartype', 'yrbeg', 'yrend', 'anarc', 'anarcl',
       'anoc', 'anocl', 'area2001', 'asia', 'autoc', 'autocl', 'democ',
       'democl', 'eeurop', 'ethfrac', 'gdp', 'gdppc', 'gdppcl', 'implag',
       'imppower', 'instab', 'instabl', 'lamerica', 'lmtnest', 'lnpop',
       'lnpopl', 'milperc', 'milpercl', 'nafrme', 'nbcivil', 'nbconq',
       'nbinter', 'nbnatind', 'nbnonind', 'nsflag', 'nsfyear', 'ocivil',
       'oconq', 'oil', 'oilpc', 'oilpcl', 'ointer', 'ointrap', 'onatind',
       'ononind', 'pdemnb', 'pocivil', 'poconq', 'pointer', 'pointrap',
       'poldisc', 'poldiscl', 'ponatind', 'pononind', 'ponset', 'pop',
       'relfrac', 'ssafrica', 'western', 'start', 'Dwight D. Eisenhower',
       'George H. W. Bush', 'Gerald Ford', 'Harry S. Truman', 'Jimmy Carter',
       'John F. Kennedy', 'Lyndon B. Johnson', 'Richard Nixon',
       'Ronald Reagan', 'Democratic', 'Republican', 'inde

In [35]:
exsup.columns

Index(['locationid1', 'ywp_year', 'SU_external_type__$', 'SU_external_type__I',
       'SU_external_type__L', 'SU_external_type__M', 'SU_external_type__O',
       'SU_external_type__T', 'SU_external_type__U', 'SU_external_type__W',
       'SU_external_type__X', 'SU_external_type__Y', 'Soviet Union',
       'US_external_type__$', 'US_external_type__I', 'US_external_type__L',
       'US_external_type__M', 'US_external_type__O', 'US_external_type__T',
       'US_external_type__U', 'US_external_type__W', 'US_external_type__X',
       'US_external_type__Y', 'United States', 'actorID', 'bc_id', 'bc_name',
       'bwd_id', 'bwd_name', 'conflictID', 'country1', 'external_alleged',
       'external_exists', 'external_type__$', 'external_type__I',
       'external_type__L', 'external_type__M', 'external_type__O',
       'external_type__T', 'external_type__U', 'external_type__W',
       'external_type__X', 'external_type__Y', 'locationid2', 'ywp_id',
       'ywp_name'],
      dtype='object')

In [51]:
merged = wim.merge(exsup, left_on=['cowcode', 'year'], right_on=['locationid1', 'ywp_year'], how='left')
# External exists is a value that we dont want Null values for, ffill them. 
merged['external_exists'].fillna(value=0, inplace=True)

# # Add dummies for US and RU interventions
# intervention_dummies = merged['external_name'].str.get_dummies()
# merged = pd.concat([merged, intervention_dummies], axis=1)

mask = merged.year > 1975
merged = merged[mask]
merged.to_csv('masterfile.csv')

In [49]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2400 entries, 30 to 6899
Columns: 138 entries, yearc to ywp_name
dtypes: float64(97), int64(32), object(9)
memory usage: 2.5+ MB


In [52]:
for x in merged.columns:
    print(x)

yearc
year
cowcode
country
onset
war
warname
warno
wartype
yrbeg
yrend
anarc
anarcl
anoc
anocl
area2001
asia
autoc
autocl
democ
democl
eeurop
ethfrac
gdp
gdppc
gdppcl
implag
imppower
instab
instabl
lamerica
lmtnest
lnpop
lnpopl
milperc
milpercl
nafrme
nbcivil
nbconq
nbinter
nbnatind
nbnonind
nsflag
nsfyear
ocivil
oconq
oil
oilpc
oilpcl
ointer
ointrap
onatind
ononind
pdemnb
pocivil
poconq
pointer
pointrap
poldisc
poldiscl
ponatind
pononind
ponset
pop
relfrac
ssafrica
western
start
Dwight D. Eisenhower
George H. W. Bush
Gerald Ford
Harry S. Truman
Jimmy Carter
John F. Kennedy
Lyndon B. Johnson
Richard Nixon
Ronald Reagan
Democratic
Republican
index_x
SU_gdppc
SU_gdppcl
US_gdppc
US_gdppcl
SU_milex
SU_cinc
US_milex
US_cinc
ccode
milex
cinc
index_y
locationid1
ywp_year
SU_external_type__$
SU_external_type__I
SU_external_type__L
SU_external_type__M
SU_external_type__O
SU_external_type__T
SU_external_type__U
SU_external_type__W
SU_external_type__X
SU_external_type__Y
Soviet Union
US_external_

In [54]:
logit = sm.Logit(merged['United States'], merged[['Soviet Union', 'cinc']], missing='drop')
#logit = sm.Logit(late20['Soviet Union'], late20[['onatind', 'ssafrica']], missing='drop')

#'anarc', 'ointrap', 'instab'

In [55]:
results = logit.fit(missing='drop')

Optimization terminated successfully.
         Current function value: 0.632447
         Iterations 7


In [56]:
print(results.summary())

                           Logit Regression Results                           
Dep. Variable:          United States   No. Observations:                  463
Model:                          Logit   Df Residuals:                      461
Method:                           MLE   Df Model:                            1
Date:                Fri, 25 May 2018   Pseudo R-squ.:                 0.03971
Time:                        23:12:09   Log-Likelihood:                -292.82
converged:                       True   LL-Null:                       -304.93
                                        LLR p-value:                 8.610e-07
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Soviet Union     0.2197      0.220      0.996      0.319      -0.212       0.652
cinc           -76.8792     15.845     -4.852      0.000    -107.935     -45.823


In [None]:
# MATCHES - US
[['Soviet Union', 'poldisc']]

In [None]:
# Matches - SU

In [None]:
for col in wim.columns:
    print(col)