In [14]:
# Importing useful libraries

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

In [15]:
# Reading and loading the data

all_var_df = pd.read_csv('../data/cleaned_data.csv')
causal_var_df = pd.read_csv('../data/causal_inference_data.csv')
hold_out_df = pd.read_csv('../data/hold_out_data.csv')
causal_hold_out = pd.read_csv('../data/causal_hold_out_data.csv')

In [16]:
# Creating the feature and target variables from the above read data

X_all = all_var_df.drop(["diagnosis"], axis=1)
y_all = all_var_df["diagnosis"]
X_causal = causal_var_df.drop(["diagnosis"], axis=1)
y_causal = causal_var_df["diagnosis"]
X_hold = hold_out_df.drop(["diagnosis"], axis=1)
y_hold = hold_out_df["diagnosis"]
X_causal_hold = causal_hold_out.drop(["diagnosis"], axis=1)
y_causal_hold = causal_hold_out["diagnosis"]

In [17]:
# Train test split for the feature extracted dataframe

X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(
    X_all, y_all,test_size=0.2, random_state=10
)

In [18]:
# Train test split for the causal inference dataframe

X_train_causal, X_test_causal, y_train_causal, y_test_causal = train_test_split(
    X_causal, y_causal,test_size=0.2, random_state=10
)

In [19]:
# Training the feature extracted model

classifier_all = RandomForestClassifier(n_estimators=100)
classifier_all.fit(X_train_all, y_train_all)

RandomForestClassifier()

In [20]:
# Training the causal inferred model

classifier_causal= RandomForestClassifier(n_estimators=100)
classifier_causal.fit(X_train_causal, y_train_causal)

RandomForestClassifier()

In [21]:
# Predicting using the feature extracted model

y_pred_all = classifier_all.predict(X_test_all)

In [22]:
# Predicting using the causal inferred model

y_pred_causal = classifier_causal.predict(X_test_causal)

In [23]:
# Finding out the accuracy of the feature extracted model

print ('Accuracy Score of the random forest regressor for the whole variable is :',accuracy_score(y_test_all, y_pred_all))

Accuracy Score of the random forest regressor for the whole variable is : 0.9298245614035088


In [24]:
# Finding out the accuracy of the causal inferred model

print ('Accuracy Score of the random forest regressor for the causal inference variable is:',accuracy_score(y_test_causal, y_pred_causal))

Accuracy Score of the random forest regressor for the causal inference variable is: 0.9726027397260274


### From the above results we can see that the causal inference has better accuracy. From this we can see that even after feature selection we can get better accuracy by using causal inference and decreasing the features

# Now lets see how the models are accurate to the hold_out data

In [25]:
# Predicing the hold out data by using the above trained two models

y_pred_all_hold = classifier_all.predict(X_hold)
y_pred_causal_hold = classifier_causal.predict(X_causal_hold)

In [26]:
# Finding out the accuracy of the above models by using the hold out data

print ('Accuracy Score of the random forest regressor for the causal inference variable is:',accuracy_score(y_causal_hold, y_pred_causal_hold))
print ('Accuracy Score of the random forest regressor for the whole variable is:',accuracy_score(y_hold, y_pred_all_hold))

Accuracy Score of the random forest regressor for the causal inference variable is: 0.9380530973451328
Accuracy Score of the random forest regressor for the whole variable is: 0.9823008849557522


### From the above metrices we can infer that the causal inference model accuracy for the hold_out data set is slightly less than the whole feature selected variable model accuracy