In [None]:
# Mount Google Drive inside the Colab runtime so files under /content/gdrive are accessible.
from google.colab import drive
drive.mount('/content/gdrive')
# CSV locations on the mounted drive. `data` is the file loaded into `df` by the
# next cell; `driver_locations` is not read by any cell shown in this notebook section.
driver_locations = "/content/gdrive/MyDrive/week_8_data/driver_locations_during_request.csv"
data = "/content/gdrive/MyDrive/week_8_data/data_cleaning.csv"

In [None]:
import pandas as pd
# Read the CSV whose path is stored in `data` into the working DataFrame `df`.
df= pd.read_csv(data)

# **Split Data into Training and Hold-out Set**
First, split your dataset into a training set and a hold-out set. This allows us to train our causal model on one part and evaluate interventions and machine learning models on the other.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Separate the prediction target (trip_distance_km) from the remaining columns.
target_column = 'trip_distance_km'
y = df[target_column]
X = df.drop(columns=[target_column])

# Reserve 20% of the rows as the hold-out set; the fixed seed keeps the split reproducible.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# **Create a Causal Graph using all Training Data**
Use a causal inference library like pgmpy to create a causal graph from your training data. This graph represents the causal relationships among variables.

In [None]:
!pip install pgmpy

In [None]:
from pgmpy.estimators import PC
from pgmpy.base.DAG import DAG

# Learn the causal structure with the PC algorithm, using training rows only.
# The target column is joined back onto the features so that 'trip_distance_km'
# becomes a node of the learned graph; X_train alone does not contain it, and a
# graph without that node cannot be asked for the target's direct causes.
train_data = X_train.join(y_train)
model = PC(data=train_data)
causal_graph = model.estimate(return_type='dag')


# **Create New Causal Graphs with Increasing Fractions of the Data and Compare**
Generate causal graphs using increasing fractions of the training data and compare each one with the reference graph learned from the full training set (treated here as the ground truth) using the Jaccard similarity index.

In [None]:
from pgmpy.estimators import PC
from pgmpy.base.DAG import DAG
from sklearn.utils import shuffle

def calculate_jaccard_similarity(graph1, graph2):
    """Return the Jaccard similarity index of the edge sets of two graphs.

    Args:
        graph1: Object exposing an ``edges()`` method that yields hashable edges.
        graph2: Object exposing an ``edges()`` method that yields hashable edges.

    Returns:
        ``|E1 & E2| / |E1 | E2|`` as a float, where edges are compared exactly as
        returned by ``edges()``. Returns 0.0 when neither graph has any edge.
    """
    first_edges, second_edges = set(graph1.edges()), set(graph2.edges())
    combined = first_edges | second_edges

    # Guard against dividing by zero when both edge sets are empty.
    if not combined:
        return 0.0

    shared = first_edges & second_edges
    return len(shared) / len(combined)

# numpy and matplotlib are used below but are not imported by any earlier cell,
# so import them here to keep the cell runnable on a fresh kernel.
import matplotlib.pyplot as plt
import numpy as np

# Experiment settings
jaccard_scores = []
fractions = []
num_iterations = 10  # evenly spaced fractions: 0.1, 0.2, ..., 1.0

# Use exactly the variables that appear in the reference graph so the edge sets
# being compared are defined over the same nodes.
reference_columns = list(causal_graph.nodes())
training_frame = X_train.join(y_train)[reference_columns]

# Shuffle once with a fixed seed: results are reproducible and every larger
# fraction is a superset of the smaller ones.
shuffled_data = shuffle(training_frame, random_state=42)

for i in range(1, num_iterations + 1):
    # Take the first `fraction_size` share of the shuffled training rows
    fraction_size = i / num_iterations
    split_index = int(len(shuffled_data) * fraction_size)
    fraction_data = shuffled_data.iloc[:split_index]

    # Learn a causal graph from this subset with the PC algorithm
    model_fraction = PC(data=fraction_data)
    causal_graph_fraction = model_fraction.estimate(return_type='dag')

    # Compare against the graph learned from the full training data
    jaccard_score = calculate_jaccard_similarity(causal_graph, causal_graph_fraction)
    fractions.append(fraction_size)
    jaccard_scores.append(jaccard_score)

# Plot similarity against the actual data fraction (not the iteration index)
plt.figure(figsize=(8, 5))
plt.plot(np.asarray(fractions), jaccard_scores, marker='o')
plt.xlabel('Fraction of Data Used')
plt.ylabel('Jaccard Similarity Index')
plt.title('Comparison of Causal Graphs with Increasing Data Fractions')
plt.grid(True)
plt.show()


 # **Select Variables Pointing Directly to the Target Variable**
After identifying a stable causal graph (you can choose based on Jaccard similarity stability or other criteria), select variables that directly influence the target variable for intervention analysis.

In [None]:
from pgmpy.inference import Inference

# Collect the parents of the target node, i.e. every variable with an edge into it.
target_node = 'trip_distance_km'
direct_causes = [parent for parent in causal_graph.predecessors(target_node)]

# Show the parent set; `direct_causes` is reused later as a feature list.
print("Variables directly pointing to 'trip_distance_km':", direct_causes)


# **Answer Intervention Questions using Do-Operations**
Use do-operations (interventions) based on the causal graph to answer specific questions about interventions.

In [None]:
# Interventions based on the causal graph.
# Do-queries need a parameterised Bayesian network: the graph learned by PC only
# carries structure, and the base `Inference` class does not provide
# `query(do=...)`. pgmpy's `CausalInference` is the class that does.
from pgmpy.inference import CausalInference
try:  # newer pgmpy releases name the discrete network class DiscreteBayesianNetwork
    from pgmpy.models import DiscreteBayesianNetwork as _BayesianNetwork
except ImportError:
    from pgmpy.models import BayesianNetwork as _BayesianNetwork

# Rebuild the learned structure as a Bayesian network (keeping isolated nodes)
# and estimate its conditional distributions from the training rows.
bayes_net = _BayesianNetwork(list(causal_graph.edges()))
bayes_net.add_nodes_from(causal_graph.nodes())
training_rows = X_train.join(y_train)
bayes_net.fit(training_rows[[col for col in training_rows.columns if col in bayes_net.nodes()]])
infer = CausalInference(bayes_net)


def run_do_query(target, do):
    """Return the distribution of `target` under the intervention `do`.

    Args:
        target: Name of the variable whose post-intervention distribution is wanted.
        do: Mapping of intervened variable name -> forced state. Each state must
            be a value that occurs for that variable in the training data.

    Returns:
        The factor returned by `CausalInference.query`, or None (after printing
        the reason) when the target or an intervened variable is not a node of
        the causal graph.
    """
    missing = [name for name in (target, *do) if name not in bayes_net.nodes()]
    if missing:
        print(f"Skipping do-query on '{target}': not in the causal graph: {missing}")
        return None
    return infer.query(variables=[target], do=do)


# specific interventions and queries as per our questions
result_i = run_do_query('unfulfilled_requests', {'drivers_movement': 'recommended'})
result_ii = run_do_query('unfulfilled_requests', {'location_accuracy': '20%'})
result_iii = run_do_query('completed_orders', {'time_requirements': 'changed'})
result_iv = run_do_query('completed_orders', {'drivers_increase': '10%'})

# Print or use the results as needed
print("Effect on number of unfulfilled requests with recommended driver movement:", result_i)
print("Effect on number of unfulfilled requests with 20% location accuracy:", result_ii)
print("Fraction of orders completed with changed time requirements:", result_iii)
print("Fraction of orders completed with 10% increase in drivers:", result_iv)


# **Train ML Models and Evaluate Overfitting**
Finally, train machine learning models using all variables and only the variables selected by the causal graph. Evaluate each model's performance and overfitting using a hold-out set.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Train models using all variables and using only the causally selected variables.
# The target 'trip_distance_km' is a distance (presumably continuous - confirm
# against the data), so regression estimators and a regression metric are used;
# classifiers with accuracy_score are not applicable to a continuous target.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from xgboost import XGBRegressor


def train_and_evaluate(X_train, X_holdout, y_train, y_holdout, selected_features=None):
    """Fit one model on all features and one on the causal features, then score both.

    Args:
        X_train: Training feature frame.
        X_holdout: Hold-out feature frame with the same columns as `X_train`.
        y_train: Training target values.
        y_holdout: Hold-out target values.
        selected_features: Column names for the reduced model. Defaults to the
            notebook-level `direct_causes` list when omitted.

    Returns:
        Tuple `(score_all, score_selected)` of hold-out R^2 scores for the
        all-features model and the selected-features model.

    Raises:
        ValueError: If the list of selected features is empty.

    Note:
        Train and hold-out R^2 are printed for each model so the gap between
        them (overfitting) is visible. The two models use different estimator
        families, as in the original cell, so score differences reflect both
        the feature set and the estimator.
    """
    if selected_features is None:
        selected_features = direct_causes
    selected_features = list(selected_features)
    if not selected_features:
        raise ValueError("No selected features: the causal graph has no direct causes of the target.")

    # Model using all variables
    model_all = RandomForestRegressor(random_state=42)
    model_all.fit(X_train, y_train)
    train_r2_all = r2_score(y_train, model_all.predict(X_train))
    r2_all = r2_score(y_holdout, model_all.predict(X_holdout))

    # Model using only the variables selected from the causal graph
    X_train_selected = X_train[selected_features]
    X_holdout_selected = X_holdout[selected_features]

    model_selected = XGBRegressor(random_state=42)
    model_selected.fit(X_train_selected, y_train)
    train_r2_selected = r2_score(y_train, model_selected.predict(X_train_selected))
    r2_selected = r2_score(y_holdout, model_selected.predict(X_holdout_selected))

    # A large train/hold-out gap indicates overfitting.
    print(f"All variables      - train R^2: {train_r2_all:.4f}, hold-out R^2: {r2_all:.4f}")
    print(f"Selected variables - train R^2: {train_r2_selected:.4f}, hold-out R^2: {r2_selected:.4f}")

    return r2_all, r2_selected

# Example usage. The variable names are kept from the original cell; the values
# they hold are hold-out R^2 scores.
accuracy_all, accuracy_selected = train_and_evaluate(X_train, X_holdout, y_train, y_holdout)
print(f"Hold-out R^2 of model using all variables: {accuracy_all:.4f}")
print(f"Hold-out R^2 of model using selected variables: {accuracy_selected:.4f}")
