In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import metrics 
import sys
import os
import time
import concurrent.futures
from functools import partial
from subprocess import check_output

# Work from the parent of the notebook's directory: the local imports and the
# relative `./well_data/...` / `./models/...` paths used in this notebook are
# resolved against the current working directory.
# The starting directory is remembered on the first run so that re-running this
# cell does not climb one more level every time (a bare `os.chdir('..')` is not
# idempotent).
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

from models.model3 import model as model3
from models.model4 import model as model4
from models.model5 import model as model5

from model_utils.evaluator import gen_eval, print_eval

# Path to a source-data CSV, relative to the working directory set above.
nc_src = './well_data/src_data.csv'
# Fold number; interpolated into the `./well_data/k{k_fold}.csv` path read below.
k_fold = 1

In [2]:
def gen_ia_predictions(test_src, stain_color, model, k_fold):
    """Run the iArsenic Node wrapper and return the predictions it produced.

    Invokes ``./models/model_utils/iarsenic-wrapper.js`` with ``node``. The
    text the wrapper writes to stdout is treated as the path of a CSV file,
    which is loaded and must contain a ``Prediction`` column.

    Parameters
    ----------
    test_src : str
        Forwarded to the wrapper as its first argument (callers pass the path
        of a well-data CSV).
    stain_color : str
        Forwarded to the wrapper unchanged (e.g. ``'Red'``).
    model : str
        Forwarded to the wrapper unchanged (e.g. ``'model3'``).
    k_fold : int or str
        Converted with ``str()`` and forwarded as the last argument.

    Returns
    -------
    pandas.Series
        The ``Prediction`` column, with ``'highlyPolluted'`` collapsed into
        ``'polluted'``.

    Raises
    ------
    subprocess.CalledProcessError
        If the node process exits with a non-zero status.
    """
    cmd_arr = [
        'node',
        './models/model_utils/iarsenic-wrapper.js',
        test_src,
        stain_color,
        model,
        str(k_fold),
    ]

    # sys.stdout may have no usable encoding when it has been replaced or
    # redirected, so fall back to UTF-8 instead of passing None to decode().
    encoding = getattr(sys.stdout, 'encoding', None) or 'utf-8'

    # strip() instead of deleting '\n' only: a '\r\n' terminator would otherwise
    # leave a stray '\r' at the end of the path.
    csv_path = check_output(cmd_arr).decode(encoding).strip()
    df = pd.read_csv(csv_path)

    # Assign the result back to the column. Calling replace(inplace=True) on
    # `df['Prediction']` acts on a temporary selection and is a no-op under
    # pandas Copy-on-Write.
    df['Prediction'] = df['Prediction'].replace('highlyPolluted', 'polluted')

    return df['Prediction']

In [3]:
def gen_labels(df, threshold=10):
    """Derive ground-truth labels from the measured arsenic values.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``Arsenic`` column.
    threshold : float, optional
        Cut-off, in the same units as the ``Arsenic`` column. Values strictly
        greater than it are labelled ``'polluted'``. Defaults to 10.

    Returns
    -------
    numpy.ndarray
        One label per row: ``'polluted'`` where ``Arsenic > threshold``,
        otherwise ``'safe'``. NaN compares as not-greater, so missing
        measurements are labelled ``'safe'``.
    """
    exceeds = df['Arsenic'] > threshold
    return np.where(exceeds, 'polluted', 'safe')

In [4]:
# Time the three model runs; each call shells out to node via
# gen_ia_predictions and they are executed one after another.
start = time.time()

# Build the fold's CSV path once and pass `k_fold` itself as the fold argument.
# It was previously hard-coded to '1', which would silently disagree with the
# path as soon as `k_fold` is changed.
fold_src = f'./well_data/k{k_fold}.csv'
m3 = gen_ia_predictions(fold_src, 'Red', 'model3', k_fold)
m4 = gen_ia_predictions(fold_src, 'Red', 'model4', k_fold)
m5 = gen_ia_predictions(fold_src, 'Red', 'model5', k_fold)

end = time.time()
print(f'time taken: {end - start} seconds')

time taken: 49068.50016236305 seconds


In [5]:
# Load the fold's well data once, then derive one independent frame per model
# by adding that model's predictions as a `Prediction` column. `assign`
# returns a new frame, so `df` itself is left without the column.
df = pd.read_csv(f'./well_data/k{k_fold}.csv')

m3_df = df.assign(Prediction=m3)
m4_df = df.assign(Prediction=m4)
m5_df = df.assign(Prediction=m5)

In [6]:
# Add the ground-truth `Label` column to every per-model frame.
for model_df in (m3_df, m4_df, m5_df):
    model_df['Label'] = gen_labels(model_df)

# Sanity checks: column summary of one frame, then the distinct prediction
# values each model produced.
m5_df.info()
for model_df in (m3_df, m4_df, m5_df):
    print(model_df['Prediction'].unique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173736 entries, 0 to 173735
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Division    173736 non-null  object 
 1   District    173736 non-null  object 
 2   Upazila     173736 non-null  object 
 3   Union       173736 non-null  object 
 4   Mouza       173736 non-null  object 
 5   Depth       173736 non-null  float64
 6   Arsenic     173736 non-null  float64
 7   Prediction  173736 non-null  object 
 8   Label       173736 non-null  object 
dtypes: float64(2), object(7)
memory usage: 11.9+ MB
['safe' 'polluted']
['safe' 'polluted']
['safe' 'polluted'
 'We do not have enough data to make an estimate for your well']


In [7]:
# model5 can answer with a "not enough data" message instead of a class.
# Evaluate it twice, counting those rows first as 'safe' and then as
# 'polluted', to bound the effect of the undecided wells on the metrics.
print('--------m5--------')
ned = 'We do not have enough data to make an estimate for your well'
print(m5_df['Prediction'].value_counts())

# Build each variant with `assign` so the replaced column is actually stored.
# Calling replace(inplace=True) on `frame['Prediction']` acts on a temporary
# selection and is a no-op under pandas Copy-on-Write, which would leave the
# placeholder text in both frames.
m5_df_polluted = m5_df.assign(
    Prediction=m5_df['Prediction'].replace(ned, 'polluted')
)
m5_df_safe = m5_df.assign(
    Prediction=m5_df['Prediction'].replace(ned, 'safe')
)

m5_safe_eval = gen_eval(m5_df_safe)
m5_polluted_eval = gen_eval(m5_df_polluted)

print('--------safe---------')
print_eval(m5_safe_eval)
print('--------polluted---------')
print_eval(m5_polluted_eval)

--------m5--------
safe                                                            96126
polluted                                                        77581
We do not have enough data to make an estimate for your well       29
Name: Prediction, dtype: int64
--------safe---------
accuracy: 0.8359925404061335
precision: 0.8220956161946869
specificity: 0.8551199286201648

sensitivity: 0.8127715971505397
f1_score: 0.8174070181734293
--------polluted---------
accuracy: 0.8359292259520191
precision: 0.8219043937636903
specificity: 0.8549099879284102

sensitivity: 0.8128862892023805
f1_score: 0.8173704678980787
