In [5]:
from PC import PC
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np

# Seed NumPy's global random number generator so repeated runs draw the same values.
np.random.seed(42)

ModuleNotFoundError: No module named 'numba'

In [None]:
def getEdges(dictGraph):
    """Flatten an adjacency mapping into a list of directed edges.

    Parameters
    ----------
    dictGraph : dict
        Maps each source node to an iterable of the nodes it points to.

    Returns
    -------
    list of tuple
        One ``(source, target)`` pair per adjacency, following the iteration
        order of the mapping and of each adjacency collection.
    """
    return [
        (source, target)
        for source, targets in dictGraph.items()
        for target in targets
    ]

: 

In [None]:
# Load the training CSV and prepare it in a single chain.
data = (
    pd.read_csv('data/train.csv')
    # Drop the columns that are not used in the rest of the analysis.
    .drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked'])
    # Encode Sex as 1 for 'male' and 0 for any other value.
    .assign(Sex=lambda frame: frame['Sex'].map(lambda sex: 1 if sex == 'male' else 0))
    # Discard every row that still contains a missing value.
    .dropna()
)

: 

In [None]:
# Starting point: a complete graph in which every column is adjacent to all
# the other columns.
originalGraph = {
    column: set(data.columns) - {column}
    for column in data.columns
}

# Draw the fully connected graph.
G = nx.DiGraph(getEdges(originalGraph))
nx.draw(G, with_labels=True, font_weight='bold')
plt.show()

: 

In [None]:
def discretize(data, column, bins):
    """Replace a column of ``data``, in place, with the index of its bin.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify. It is mutated; nothing is returned.
    column : str
        Name of the column to discretize.
    bins : int or sequence
        Forwarded unchanged to ``pandas.cut`` as its ``bins`` argument.
    """
    # labels=False makes pandas.cut return bin indices instead of intervals.
    bin_codes = pd.cut(data[column], bins, labels=False)
    data[column] = bin_codes

# Discretize Age first and Fare second, each into 5 bins.
for column in ('Age', 'Fare'):
    discretize(data, column, 5)

: 

In [None]:
# Sample 80% of the rows for training (fixed seed); every row whose index
# label was not sampled goes to the test set.
data_train = data.sample(frac=0.8, random_state=42)
data_test = data.loc[~data.index.isin(data_train.index)]

: 

In [None]:
# Configure the PC causal-discovery run.
# NOTE(review): 0.25 is presumably the significance threshold of the
# independence tests, and exogeneous/endogeneous presumably constrain the
# allowed edge directions — confirm against the PC module.
pc = PC(
    0.25,
    exogeneous=['Age', 'Sex'],
    endogeneous=['Survived'],
    directional=True,
    maxSeparatingDepth=2,
)

# Run discovery on the training split only; it returns the learned graph and
# the separating sets that are reported afterwards.
graph, separatingSets = pc.causalDiscovery(data_train)

: 

In [None]:
# Report how many entries the PC run returned in separatingSets, then list them.
# NOTE(review): each entry is indexed as [0] (the separated nodes) and [1]
# (what separates them) — presumably the structure returned by
# PC.causalDiscovery; confirm against the PC module.
print('Number of disconnections:', len(separatingSets))
for separatingSet in separatingSets:
    print(separatingSet[0], 'are separated by', separatingSet[1])

: 

In [None]:
# Remove 'PassengerId' from every adjacency set; discard() is a no-op when the
# label is absent.
# NOTE(review): PassengerId is already dropped from `data` when it is loaded,
# so this looks like a leftover safeguard — confirm it is still needed.
for adjacents in graph.values():
    adjacents.discard('PassengerId')

: 

In [None]:
def plotGraph(graph, target='Survived'):
    """Draw a directed graph, highlighting one node and the edges that end at it.

    Parameters
    ----------
    graph : dict
        Adjacency mapping handed to ``getEdges``: each key is a source node
        and its value is an iterable of the nodes it points to. Only nodes
        that take part in at least one edge are drawn, because the graph is
        built from the edge list alone.
    target : hashable, optional
        Node to highlight. It, and every edge ending at it, is drawn in green;
        all other nodes are sky blue and all other edges black. Defaults to
        ``'Survived'``, the value that was previously hard-coded.
    """
    G = nx.DiGraph(getEdges(graph))

    # Node positions come from Graphviz's `dot` program.
    pos = nx.nx_agraph.graphviz_layout(G, prog='dot')
    node_color = ['green' if node == target else 'skyblue' for node in G.nodes()]
    edge_color = ['green' if edge[1] == target else 'black' for edge in G.edges()]

    nx.draw(G, pos, node_color=node_color, edge_color=edge_color,
            with_labels=True, font_weight='bold')
    plt.show()


plotGraph(graph)

: 

In [None]:
# Every key of the discovered graph except the target is used as a predictor.
X_features = [*graph]
X_features.remove('Survived')

# Split each partition into predictor columns and the 'Survived' label.
X_train, y_train = data_train[X_features], data_train['Survived']
X_test, y_test = data_test[X_features], data_test['Survived']

: 

In [None]:
from pgmpy.models import BayesianNetwork

# Build a Bayesian network with the structure found by the PC run.
BN = BayesianNetwork()

# Register every key first so that nodes without outgoing edges are included.
BN.add_nodes_from(graph.keys())

# Add one directed edge per (source, adjacent) pair of the discovered graph.
for source, targets in graph.items():
    for target in targets:
        BN.add_edge(source, target)

: 

In [None]:
# Fit the network on the training split.
# NOTE(review): presumably this estimates the conditional probability tables
# with pgmpy's default estimator — confirm.
BN.fit(data_train)

: 

In [None]:
# Predict from the test features and keep the 'Survived' column of the result.
y_pred_BN = BN.predict(X_test)['Survived']

: 

In [None]:
from sklearn.metrics import f1_score

# F1 score of the Bayesian-network predictions against the held-out labels.
f1_BN = f1_score(y_test, y_pred_BN)

: 

In [None]:
# Baseline network: one edge from every feature into 'Survived'.
# NOTE(review): a naive Bayes structure conventionally makes the class the
# parent of each feature ('Survived' -> feature); these edges point the other
# way — confirm this is the intended baseline.
NB = BayesianNetwork([(feature, 'Survived') for feature in X_features])

: 

In [None]:
# Draw the baseline structure: each feature has a single edge into 'Survived'.
plotGraph({feature: {'Survived'} for feature in X_features})

: 

In [None]:
# Fit the baseline network on the same training split used for BN.
NB.fit(data_train)

: 

In [None]:
# Keep only the predicted 'Survived' column, matching how the Bayesian-network
# predictions (y_pred_BN) are extracted, so both F1 scores are computed from
# the same kind of object rather than from a whole prediction DataFrame.
y_pred_NB = NB.predict(X_test)['Survived']

: 

In [None]:
# F1 score of the baseline predictions, then both scores printed together.
f1_NB = f1_score(y_test, y_pred_NB)
print(
    "F1 Score:\nBayesian Network ->", f1_BN,
    "\nNaive Bayes ->", f1_NB,
)

: 

In [None]:
import seaborn as sns

# Kendall rank correlation between every pair of columns in `data`.
corr_matrix = data.corr(method='kendall')

# Annotated heatmap of the correlation matrix with the 'coolwarm' colour map.
sns.heatmap(data=corr_matrix, cmap='coolwarm', annot=True)
plt.show()

: 