### Causal inference model


* Content

    * Structure from Domain Knowledge
    * Visualise the learned Structure Model
        * Plotting the Structure Model
        * Remove edges below threshold
        * Retrieve the largest subgraph
        * Targeted subgraph
    * Features Extracted From the Graph
    * Jaccard Similarity (stability)
        * Half of the Data
        * Using Full Data



In [88]:
# importing important modules
from causalnex.structure import StructureModel
from causalnex.structure.notears import from_pandas
from causalnex.plots import plot_structure, NODE_STYLE, EDGE_STYLE
from IPython.display import Image
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
import mlflow
import logging,os
from sklearn.model_selection import train_test_split
from causalnex.network import BayesianNetwork
from causalnex.evaluation import classification_report,roc_auc
from causalnex.inference import InferenceEngine
import pandas as pd
import numpy as np

In [77]:
# Read the merged dataset from the relative ../data directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer='../data/merged_data.csv')

#### Structure from Domain Knowledge

In [11]:
# Creating an empty Structural Model.
# It carries no edges yet; `sm` is rebound further down to the structure
# learned from the training data.
sm = StructureModel()

In [12]:
# Inspect the edge view of the freshly created model (expected to be empty).
sm.edges

OutEdgeView([])

#### Visualise the learned Structure Model

In [93]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the split repeatable.
# NOTE(review): in notebook order this split runs before the `fulfilled` column is
# added to `data`, so the frames created here do not contain that column.
train, test = train_test_split(data, train_size=0.75, test_size=0.25, random_state=10)

In [73]:
# Draw a repeatable 20% sample of the training rows.
# NOTE(review): `sample_20` is not referenced by any later cell visible here —
# possibly meant as a smaller input for structure learning; confirm or remove.
sample_20 = train.sample(frac=0.2, random_state=42)

In [74]:
# Per-node styling handed to plot_structure through `node_attributes`.
# NOTE(review): "fontsize" sits at the top level next to the node name instead of
# inside a node's style dict, and "diagnosis" may not be a column of this
# dataset — confirm both are intended.
node_attributes = dict(
    diagnosis=dict(
        shape="star",
        style="filled",
        width=0.6,
        penwidth="1",
        color="#4a90e2d9",
        orientation=25,
    ),
    fontsize=0.1,
)

# Graph-level styling handed to plot_structure through `graph_attributes`.
graph_attributes = dict(
    scale="1",
    size=5,
    label="Breast Cancer Causality model",
    fontcolor="#FFFFFFD9",
    fontname="Helvetica",
    fontsize=25,   # font size of the graph title
    dpi=200,       # resolution
    labeljust="l", # left-justify the title
    labelloc="t",  # place the title at the top
)

def plot_sm_graph(sm):
    """Render a structure model as an inline PNG image.

    The graph is drawn with the WEAK node and edge style presets and a fixed
    scale/size, independent of the module-level attribute dictionaries.

    Args:
        sm: The structure model to draw; forwarded unchanged to ``plot_structure``.

    Returns:
        An ``IPython.display.Image`` wrapping the PNG produced by ``draw``.
    """
    layout = {"scale": "2.0", "size": 3.5}
    rendered = plot_structure(
        sm,
        graph_attributes=layout,
        all_node_attributes=NODE_STYLE.WEAK,
        all_edge_attributes=EDGE_STYLE.WEAK,
    )
    png = rendered.draw(format="png")
    return Image(png)

In [None]:
# Learn a structure model from the training frame with NOTEARS (`from_pandas`)
# and rebind `sm` to it, replacing the empty model created earlier.
sm = from_pandas(train)

# Last expression of the cell: show the learned graph inline.
plot_sm_graph(sm)

#### Plotting the Structure Model

In [None]:
# Plotting the Structure Model
# plot_structure draws a graph, so it has to receive the learned structure
# model `sm`; the training DataFrame has no nodes or edges to render.
viz = plot_structure(
    sm,
    prog="circo",
    graph_attributes=graph_attributes,
    node_attributes=node_attributes,
    all_edge_attributes=EDGE_STYLE.WEAK,
)

Image(viz.draw(format='png'))

#### Remove edges below threshold

In [None]:
# Prune weak edges in place: only edges with weight at or above 0.8 remain in `sm`.
sm.remove_edges_below_threshold(0.8)

# Plotting the Structure Model
# The pruned structure model `sm` is what must be drawn; passing the training
# DataFrame would never show the effect of the threshold.
viz = plot_structure(
    sm,
    prog="circo",
    graph_attributes=graph_attributes,
    node_attributes=node_attributes,
    all_edge_attributes=EDGE_STYLE.WEAK,
)

Image(viz.draw(format='png'))

id                       0
order_id                 0
driver_id                0
lat                      0
lng                      0
Trip ID                  0
duration                 0
Trip_Origin_lat          0
Trip_Origin_long         0
Trip_Destination_lat     0
Trip_Destination_long    0
distance                 0
pilot_distance           0
dtype: int64

#### Retrieve the largest subgraph

In [70]:
## retrieve the largest subgraph easily by calling the StructureModel function get_largest_subgraph().
sm = sm.get_largest_subgraph()

# Draw the reduced structure model `sm` (not the training DataFrame) so the
# figure reflects the subgraph that was just extracted.
viz = plot_structure(
    sm,
    prog="circo",
    graph_attributes=graph_attributes,
    node_attributes=node_attributes,
    all_edge_attributes=EDGE_STYLE.WEAK,
)
Image(viz.draw(format='png'))

#### Features Extracted From the Graph

In [None]:
# Collect the direct parents of 'driver_id': every edge (start, end) of the
# structure model whose `end` is 'driver_id' contributes its `start`.
# The edges are read from `sm` because `target`, which this cell used before,
# is not defined by any visible cell and raised a NameError on a fresh run.
edges = list(sm.edges())
feature_extraction = [start for start, end in edges if end == 'driver_id']
feature_extraction

#### Modeling

#### Random Forest

In [82]:
# Add the flag column with False in every row; matching rows are set to True in the following cell.
data['fulfilled'] = False

In [83]:
# A row counts as fulfilled when its order id equals its trip id and the
# driver's action is 'accepted'; all other rows keep their current flag.
is_same_trip = data['order_id'] == data['Trip ID']
is_accepted = data['driver_action'] == 'accepted'
data['fulfilled'] = np.where(is_same_trip & is_accepted, True, data['fulfilled'])

In [84]:
# Translate the boolean flag into "No"/"Yes" labels.
# The keys are the booleans the column actually holds. Integer keys 0/1 relied
# on bool/int coercion during lookup, which pandas does not guarantee and which
# can leave every row as NaN. `fulfill_map[0]` / `fulfill_map[1]` still resolve
# because False == 0 and True == 1 for dict lookups.
fulfill_map = {False: "No", True: "Yes"}
data["fulfilled"] = data["fulfilled"].map(fulfill_map)

In [85]:
# Decode integer action codes into labels.
# `driver_action` is compared against the string 'accepted' earlier in this
# notebook, so it holds text rather than 0/1 codes. `.map` would turn every
# value without a matching key into NaN and wipe the column; `.replace`
# substitutes matching codes and leaves all other values untouched.
driver_action_map = {0: "accept", 1: "reject"}
data["driver_action"] = data["driver_action"].replace(driver_action_map)

In [89]:
# Send DEBUG-and-above log records to a file, appending across runs.
# The target directory is created first: basicConfig opens the file right away
# and raises FileNotFoundError when ../logs does not exist yet.
LOG_FILE = '../logs/log.log'
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
logging.basicConfig(filename=LOG_FILE, filemode='a', encoding='utf-8', level=logging.DEBUG)

# Select (or create) the MLflow experiment used by the logging calls below;
# kept as the last expression so the experiment is displayed.
mlflow.set_experiment("Random Forest Regressor")

2022/10/24 00:14:02 INFO mlflow.tracking.fluent: Experiment with name 'Random Forest Regressor' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///C:/Users/hp/casual-inference/notebooks/mlruns/1', experiment_id='1', lifecycle_stage='active', name='Random Forest Regressor', tags={}>

In [90]:
# Record the model name as an MLflow parameter.
mlflow.log_param('Model', 'Random Forest Regressor')

In [94]:
# `train`/`test` are first created before `fulfilled` is derived on `data`, so
# on a top-to-bottom run they lack that column and the drop below raises a
# KeyError. Re-split here with the same sizes and seed so both frames carry
# the columns this cell needs.
train, test = train_test_split(data, train_size=0.75, test_size=0.25, random_state=10)

# The estimator is a regressor scored with mean squared error, so the
# "No"/"Yes" labels are encoded as 0/1. Any other value becomes NaN instead of
# being silently counted as one of the classes.
target_encoding = {"No": 0, "Yes": 1}

x_train = train.drop(columns=['fulfilled', 'driver_action'])
y_train = train['fulfilled'].map(target_encoding)
x_test = test.drop(columns=['fulfilled', 'driver_action'])
y_test = test['fulfilled'].map(target_encoding)

In [96]:
# Hyper-parameters for the forest, kept as named values so they can be logged.
n_estimators = 35
max_depth = 20
random_state = 5

# Two-step pipeline: min-max scaling of the features, then the random forest.
pipeline = Pipeline(
    steps=[
        ('preprocessor', MinMaxScaler()),
        (
            'model',
            RandomForestRegressor(
                n_estimators=n_estimators,
                max_depth=max_depth,
                random_state=random_state,
            ),
        ),
    ]
)

# Log each hyper-parameter to MLflow under a readable key.
mlflow.log_param('Number of estimators', n_estimators)
mlflow.log_param('Max depth', max_depth)
mlflow.log_param('Random state', random_state)

In [None]:
# Fit the scaling + random-forest pipeline on the training split and keep the fitted pipeline.
random_forest_model = pipeline.fit(x_train, y_train)

In [None]:
# Predict on the held-out split, then compute the pipeline's built-in score
# and the mean squared error of the predictions.
y_pred = random_forest_model.predict(X=x_test)
Score = random_forest_model.score(X=x_test, y=y_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

# Text snapshot of the dataset's column index.
report = str(data.columns)

In [None]:
# Show the pipeline score and the mean squared error on the test split.
print("Score",Score,"\n","Mean Squared Error",mse)