## Model Development - Use Case: Titanic

### Import RAI library

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn import preprocessing

import torch
import torch.nn as nn
import torch.nn.functional as F 
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader 

In [None]:
#from responsibleML import base_rai_model as sklearnmodel
#from responsibleML import rai_models
#from responsibleML import ProblemType
#from responsibleML import ModelFramework
#from responsibleML import pytorch_model as pytorchmodel

In [None]:
# NOTE(review): the cells below use `rai_models`, `sklearnmodel`,
# `pytorchmodel` and `ProblemType`, which this import does not bind by name —
# confirm the intended import path for them (see the commented-out imports above).
from aigovernance.responsibleML import responsibleML


In [None]:
#Create a Model List
models = rai_models()

### Data Pre Processing

In [None]:
titanic_df = pd.read_csv('../data/titanic.csv')
titanic_df.head(3)

In [None]:
# Fill missing values: the column mean for 'Age', the placeholder category 'N'
# for 'Embarked'.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on a column selection, which pandas warns about and
# which does not update the frame under copy-on-write.
titanic_df['Age'] = titanic_df['Age'].fillna(titanic_df['Age'].mean())
titanic_df['Embarked'] = titanic_df['Embarked'].fillna('N')

In [None]:
def encode_features(dataDF, features=('Sex', 'Embarked')):
    """Label-encode categorical columns of ``dataDF`` in place.

    Each listed column is replaced by the integer codes produced by a fresh
    ``LabelEncoder`` fitted on that same column.

    Args:
        dataDF: DataFrame holding the columns to encode. It is modified in
            place.
        features: Names of the columns to encode. Defaults to ``'Sex'`` and
            ``'Embarked'``.

    Returns:
        The same ``dataDF`` object, with the listed columns encoded.
    """
    for feature in features:
        # One encoder per column; fit_transform fits and encodes in one call.
        dataDF[feature] = preprocessing.LabelEncoder().fit_transform(dataDF[feature])

    return dataDF

titanic_df = encode_features(titanic_df)
titanic_df.head()

In [None]:
# Drop the unnecessary field (Name)
titanic_df.drop(['Name'], axis = 1, inplace = True)

In [None]:
y_titanic_df = titanic_df['target']
X_titanic_df = titanic_df.drop('target', axis = 1)

### Model 1:  Decision Tree

#### 1.1 Decision Tree - RAI Model

In [None]:
# Create a responsible model
dt_model = sklearnmodel("decision_tree", ProblemType.BINARY)

#### 1.2 Decision Tree RAI Model:  Calculate Class Imbalance

In [None]:
# Calculate Class Imbalance
dt_model.calculate_class_balance(y_titanic_df)

#### 1.3 Decision Tree RAI Model:  Start Emissions Tracker

In [None]:
dt_model.start_emissions_tracker()

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


# Hold out 20% of the rows for evaluation (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    X_titanic_df, y_titanic_df, test_size=0.2, random_state=11
)

# Fit the decision tree and predict on the held-out rows.
dt_clf = DecisionTreeClassifier(random_state=11)
dt_clf.fit(X_train, y_train)
dt_pred = dt_clf.predict(X_test)

# Score once, then report and hand the value to the RAI model.
dt_accuracy = accuracy_score(y_test, dt_pred)
print("== DecisionTree Accuracy : {0:.4f}".format(dt_accuracy))
dt_model.set_model_accuracy(dt_accuracy)

#### 1.4 Decision Tree RAI Model:  Stop Emissions Tracker

In [None]:
# Stop the CO2 tracker
dt_model.stop_emissions_tracker()

#### 1.5 Decision Tree RAI Model:  Calculate Interpretability

In [None]:
# calculate Interpretability
dt_model.calculate_interpretability("treebased", dt_clf, X_titanic_df)

#### 1.6 Decision Tree RAI Model:  Add model to RAI Models

In [None]:
models.add_model(dt_model)

In [None]:
# Report the three indices exposed by the decision-tree RAI model.
print("Class Balance Index : " + str(dt_model.get_class_balance_index()))
print("Emissions Index : " + str(dt_model.get_emissions_index()))
print("Interpretability Index : " + str(dt_model.get_interpretability_index()))

In [None]:
dt_model.get_model_info()

### Model 2:  Random Forest

#### Random Forest - Data Prep

In [None]:
# Rows with target == 1 and Age > 20 are left out of the random-forest data set.
drop_mask = (titanic_df['target'] == 1) & (titanic_df['Age'] > 20)
indexes = titanic_df.loc[drop_mask].index
rfmodel_df = titanic_df.drop(index=indexes)

# Separate the label column from the feature columns.
y_rf_titanic_df = rfmodel_df['target']
x_rf_titanic_df = rfmodel_df.drop(columns='target')

#### 2.1 Random Forest - RAI Model

In [None]:
rf_model = sklearnmodel("random_forest", ProblemType.BINARY)

#### 2.2 Random Forest RAI Model:  Calculate Class Imbalance

In [None]:
rf_model.calculate_class_balance(y_rf_titanic_df)

In [None]:
print("Class Balance Index : " + str(rf_model.get_class_balance_index()))
rf_model.get_class_balance()

#### 2.3 Random Forest RAI Model:  Start Emissions Tracker

In [None]:
rf_model.start_emissions_tracker()

In [None]:
#2. RandomForest 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Hold out 20% of the reduced data set for evaluation (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    x_rf_titanic_df, y_rf_titanic_df, test_size=0.2, random_state=11
)

# Fit the random forest and predict on the held-out rows.
rf_clf = RandomForestClassifier(random_state=11)
rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)

# Score once, then report and hand the value to the RAI model.
rf_accuracy = accuracy_score(y_test, rf_pred)
print("== RandomForest Accuracy : {0:.4f}".format(rf_accuracy))
rf_model.set_model_accuracy(rf_accuracy)

#### 2.4 Random Forest RAI Model:  Stop Emissions Tracker

In [None]:
rf_model.stop_emissions_tracker()

#### 2.5 Random Forest RAI Model:  Calculate Interpretability

In [None]:
# calculate Interpretability
rf_model.calculate_interpretability("treebased", rf_clf, x_rf_titanic_df)

#### 2.6 Random Forest RAI Model:  Add model to RAI Models

In [None]:
models.add_model(rf_model)

In [None]:
# Report the three indices exposed by the random-forest RAI model.
print("Class Balance Index : " + str(rf_model.get_class_balance_index()))
print("Emissions Index : " + str(rf_model.get_emissions_index()))
print("Interpretability Index : " + str(rf_model.get_interpretability_index()))

In [None]:
rf_model.get_model_info()

### Model 3:  Logistic Regression

In [None]:
# Drop the columns not used by the logistic-regression model (PassengerId, Age)

lrmodel_df = titanic_df.drop(['PassengerId', 'Age'], axis = 1)



In [None]:
y_lr_titanic_df = lrmodel_df['target']
x_lr_titanic_df = lrmodel_df.drop('target', axis = 1)

#### 3.1 Logistic Regression - RAI Model

In [None]:
lr_model = sklearnmodel("logistic_regression", ProblemType.BINARY)

#### 3.2 Logistic Regression RAI Model:  Calculate Class Imbalance

In [None]:
lr_model.calculate_class_balance(y_lr_titanic_df)

#### 3.3 Logistic Regression RAI Model:  Start Emissions tracker

In [None]:
lr_model.start_emissions_tracker()

In [None]:
#3. LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Hold out 20% of the rows for evaluation (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    x_lr_titanic_df, y_lr_titanic_df, test_size=0.2, random_state=11
)

# Fit logistic regression with default settings and predict on the held-out rows.
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
lr_pred = lr_clf.predict(X_test)

# Score once, then report and hand the value to the RAI model.
lr_accuracy = accuracy_score(y_test, lr_pred)
print("== LogisticRegression Accuracy : {0:.4f}".format(lr_accuracy))
lr_model.set_model_accuracy(lr_accuracy)

#### 3.4 Logistic Regression RAI Model:  Stop Emissions tracker

In [None]:
lr_model.stop_emissions_tracker()

#### 3.5 Logistic Regression RAI Model:  Calculate Interpretability

In [None]:
# calculate Interpretability
lr_model.calculate_interpretability("linear", lr_clf, x_lr_titanic_df)

#### 3.6 Logistic Regression RAI Model:  Add Model to RAI model list

In [None]:
models.add_model(lr_model)

In [None]:
lr_model.get_model_info()

### Model 4:  PyTorch Model

#### 4.1 PyTorch Model - RAI Model

In [None]:
py_model = pytorchmodel("Pytorch", ProblemType.BINARY)

In [None]:
# Split the frame into a label vector and a feature matrix (NumPy arrays).
df_cleaned = titanic_df.drop(columns=['target'])
feature_names = df_cleaned.columns.tolist()
labels = titanic_df["target"].to_numpy()
features = df_cleaned.to_numpy()

# Wrap the arrays as tensors; the features are cast to float32.
feature_tensor = torch.from_numpy(features).float()
label_tensor = torch.from_numpy(labels)

# Pair features with labels and serve them in shuffled mini-batches.
batch_size = 32
train_dataset = torch.utils.data.TensorDataset(feature_tensor, label_tensor)
train_dataloader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True
)

#### 4.2 PyTorch RAI Model:  Calculate Class Imbalance

In [None]:
py_model.calculate_class_balance(y_titanic_df)

#### 4.3 PyTorch RAI Model:  Start Emissions tracker

In [None]:
py_model.start_emissions_tracker()

In [None]:
class TitanicModel(nn.Module):
    """Feed-forward classifier with two sigmoid hidden layers and a softmax output.

    Layer widths are ``in_features -> 7 -> 5 -> 2``.

    Args:
        in_features: Number of input values per sample. Defaults to 7, the
            width hard-coded in the original notebook.

    Note:
        ``forward`` returns softmax probabilities, not raw logits. Losses that
        expect logits, such as ``nn.CrossEntropyLoss``, apply log-softmax
        again on top of this output.
    """

    def __init__(self, in_features=7):
        super().__init__()
        self.linear1 = nn.Linear(in_features, 7)
        self.sigmoid1 = nn.Sigmoid()
        self.linear2 = nn.Linear(7, 5)
        self.sigmoid2 = nn.Sigmoid()
        self.linear3 = nn.Linear(5, 2)
        # dim=1 normalises across the two class scores of each sample.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return per-sample class probabilities for a 2-D batch ``x``."""
        lin1_out = self.linear1(x)
        sigmoid_out1 = self.sigmoid1(lin1_out)
        sigmoid_out2 = self.sigmoid2(self.linear2(sigmoid_out1))
        return self.softmax(self.linear3(sigmoid_out2))

In [None]:
def train(trainloader, model, optimizer, epochs=100):
    """Train ``model`` on the batches yielded by ``trainloader`` and return it.

    Every 10th step of each epoch prints the current loss and the number of
    correctly classified samples in that batch.

    Args:
        trainloader: Iterable of ``(features, label)`` batches that also
            supports ``len()`` (used for the progress message).
        model: Module called on each feature batch; its output is passed to
            ``nn.CrossEntropyLoss`` together with the labels.
        optimizer: Optimizer stepped once per batch.
        epochs: Number of passes over ``trainloader``. Defaults to 100.

    Returns:
        The same ``model`` object after training.
    """
    # NOTE(review): CrossEntropyLoss expects raw logits. The TitanicModel in
    # this notebook returns softmax probabilities, so softmax is effectively
    # applied twice — confirm whether that is intended.
    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        # Iterate the loader that was passed in, not a module-level global.
        for i, (features, label) in enumerate(trainloader):
            # Forward pass
            y_pred = model(features)
            loss = loss_fn(y_pred, label)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if (i + 1) % 10 == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                      .format(epoch + 1, epochs, i + 1, len(trainloader), loss.item()))
                # Reduce the per-class scores to a predicted class index before
                # comparing; comparing the raw (batch, classes) output with the
                # (batch,) labels does not broadcast.
                train_acc = torch.sum(y_pred.argmax(dim=1) == label)
                print(train_acc)

    return model

In [None]:
vanilla_model = TitanicModel()
optimizer = torch.optim.Adam(vanilla_model.parameters(), lr=0.001)
trained_vanilla_model = train(train_dataloader, vanilla_model, optimizer)


In [None]:
X_titanic_df.values

#### 4.4 PyTorch RAI Model:  Stop Emissions tracker

In [None]:
py_model.stop_emissions_tracker()

#### 4.5 PyTorch RAI Model:  Calculate Interpretability

In [None]:
# calculate Interpretability
py_model.calculate_interpretability(feature_tensor, trained_vanilla_model, target_class=1)

#### 4.6 PyTorch RAI Model:  Add Model

In [None]:
models.add_model(py_model)

In [None]:
py_model.get_model_info()

### Calculate Responsible Metrics

#### List all models

In [None]:
# List all Models 
model_json = models.list_models()
model_json

In [None]:
# Load the model list into a DataFrame.
# pandas deprecates passing a literal JSON string to read_json, so a string
# result is wrapped in a file-like object first; any other value is passed
# through unchanged.
from io import StringIO

df_models = pd.read_json(StringIO(model_json) if isinstance(model_json, str) else model_json)

In [None]:
# Preview the first rows
df_models.head(5)

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:

#sns.lineplot(x='model name', y='interpretability', data=df_models)
# x and y are passed by keyword: current seaborn releases accept only `data`
# positionally.
sns.scatterplot(x='model name', y='emissions', data=df_models)

In [None]:
df_models.to_csv("temp.csv")