# Propensity score testing
This notebook tests propensity score matching to estimate the influence of conflict on IPC.

In [1]:
import pandas as pd
import statsmodels.api as sm
from causalinference import CausalModel

In [2]:
# Import the dataset and drop every row whose IPC value is missing.
# Rows are removed (not imputed); the result is assigned back rather than
# mutating the frame with inplace=True.
dataset = pd.read_csv("merged_df_for_causality_v2.csv").dropna(subset=['ipc'])

# Experiment note: I tried to make every non-zero value 1 in the total conflict
# column, making this a binary variable. Sadly this did not help.
# If you uncomment the code below and use "dataset_conflict_binary" instead of
# "dataset" you will see this.
# NOTE(review): the previous version of this snippet aliased `dataset` without
# copying it and used `.loc[mask] = 1` with no column label, which overwrites
# *every* column of the matching rows. The snippet below works on a copy and
# only touches the conflict column.

# dataset_conflict_binary = dataset.copy()
# conflict_mask = dataset_conflict_binary['total_conflict#'] >= 1
# dataset_conflict_binary.loc[conflict_mask, 'total_conflict#'] = 1
# print(dataset_conflict_binary['total_conflict#'])

60     0.0
61     0.0
66     0.0
67     0.0
72     0.0
      ... 
299    0.0
306    0.0
307    0.0
314    0.0
315    0.0
Name: total_conflict#, Length: 78, dtype: float64


### Transform categorical data into dummy variables and isolate confounders

In [3]:
# One-hot encode the categorical columns. drop_first=True removes one level per
# category to get rid of the dummy variable trap. dtype=float keeps the dummy
# columns numeric: pandas >= 2.0 emits bool dummies by default, and mixing bool
# with float columns makes `.values` return an object array.
dataset_conflict_binary = pd.get_dummies(data=dataset, drop_first=True, dtype=float)

# Outcome (IPC) and conflict indicator as 1-D arrays.
# NOTE: `X` holds the conflict (treatment) column here, not the covariates; it is
# passed as the second argument to CausalModel later in the notebook.
Y = dataset_conflict_binary['ipc'].values
X = dataset_conflict_binary['total_conflict#'].values

# Every remaining column is used as a confounder.
# NOTE(review): the shape printed further down is (78, 95), i.e. more covariate
# columns than rows — presumably inflated by the one-hot encoding; confirm.
confounders = dataset_conflict_binary.drop(columns=["ipc", "total_conflict#"]).values

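**Open question:** How should we fix the problem of having too few control units? The matching step at the end of this notebook fails with `Too few control units: N_c < K+1`. With only 78 rows there can be at most 78 control units, while the confounder matrix has K = 95 columns, so the set of confounders has to be reduced (or more observations added) before the model can be built.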

In [4]:
# List the distinct values found in the conflict column.
pd.unique(dataset['total_conflict#'])

array([0., 1.])

In [5]:
# Show the shape of the treatment, outcome and confounder arrays first,
# then a blank separator line, then the arrays themselves.
inspected_arrays = (X, Y, confounders)

for current in inspected_arrays:
    print(current.shape)

print("")

for current in inspected_arrays:
    print(current)

(78,)
(78,)
(78, 95)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 0. 1. 0. 1. 0. 1. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]
[1. 1. 1. 1. 2. 1. 1. 1. 2. 2. 2. 2. 2. 2. 2. 3. 1. 1. 1. 1. 2. 2. 1. 1.
 2. 3. 1. 2. 1. 1. 2. 2. 2. 2. 1. 1. 1. 1. 1. 1. 4. 1. 3. 1. 3. 1. 3. 3.
 4. 4. 4. 4. 4. 3. 3. 2. 4. 2. 4. 3. 4. 3. 3. 3. 4. 3. 4. 3. 3. 3. 4. 3.
 4. 3. 3. 3. 3. 3.]
[[127.        29.85742    8.633903 ...   0.         0.         0.      ]
 [143.        29.63566    9.268476 ...   0.         0.         0.      ]
 [127.        29.85742    8.633903 ...   0.         0.         0.      ]
 ...
 [143.        29.63566    9.268476 ...   0.         1.         0.      ]
 [127.        29.85742    8.633903 ...   0.         0.         1.      ]
 [143.        29.63566    9.268476 ...   0.         0.         1.      ]]


### Propensity score matching

In [6]:
# Build the causal model from the outcome (Y), the conflict indicator (X) and
# the confounder matrix. CausalModel is already imported in the import cell at
# the top of the notebook, so it is not imported again here.
# NOTE(review): the saved output of this cell is
# "ValueError: Too few control units: N_c < K+1". The confounder matrix printed
# earlier has 95 columns for 78 rows, so the number of covariates presumably has
# to be reduced before this cell can run — confirm against the library docs.
model = CausalModel(Y, X, confounders)

# bias_adj specifies whether bias adjustments should be attempted
model.est_via_matching(bias_adj=True)
print(model.estimates)


ValueError: Too few control units: N_c < K+1