In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import metrics 
import sys
import os
import time
import concurrent.futures
from functools import partial
from subprocess import check_output

# Step up one directory before the project imports below, presumably so that
# they and the relative './...' paths used later resolve from there.
# NOTE(review): this is not idempotent - re-running this cell moves the working
# directory up one more level each time.
os.chdir('..')

from models.model5 import model as model5

from model_utils.evaluator import gen_eval, print_eval

In [7]:
# Invoke the Node.js wrapper script on the k1 wells with 'model5' among its
# arguments. Whatever the wrapper prints to stdout is handed to pd.read_csv
# below, i.e. it is used as the location of the predictions CSV.
# NOTE(review): the meaning of the 'Red' and '1' arguments is defined by
# iarsenic-wrapper.js - confirm there.
cmd_arr = [
    'node', './models/model_utils/iarsenic-wrapper.js',
    './well_data/k1.csv', 'Red', 'model5', '1',
]

# Decode the captured bytes and remove newlines so only the bare text remains.
raw_output = check_output(cmd_arr)
stdout = raw_output.decode(sys.stdout.encoding).replace('\n', '')

df = pd.read_csv(stdout)

In [8]:
# Collapse the wrapper's prediction strings into two classes, 'safe' and
# 'polluted'. The replacements run in this order so that rows originally
# predicted 'polluted' become 'safe' before 'highlyPolluted' is renamed to
# 'polluted'; the "not enough data" message is also counted as 'safe'.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['Prediction']: the in-place form acts on an
# intermediate Series and does not reliably update df (it has no effect under
# pandas Copy-on-Write).
df['Prediction'] = (
    df['Prediction']
    .replace('polluted', 'safe')
    .replace('highlyPolluted', 'polluted')
    .replace('We do not have enough data to make an estimate for your well', 'safe')
)

In [15]:
# Reload the k1 wells and attach, by row index, the model predictions held in
# df together with a binary ground-truth label (Arsenic > 50 counts as
# 'polluted', everything else as 'safe').
# NOTE(review): assumes df has the same row order as k1.csv - confirm.
test_df = pd.read_csv('./well_data/k1.csv').assign(
    Prediction=df['Prediction'],
    Label=lambda wells: np.where(wells['Arsenic'] > 50, 'polluted', 'safe'),
)

test_df.info()
print(test_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173736 entries, 0 to 173735
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Division    173736 non-null  object 
 1   District    173736 non-null  object 
 2   Upazila     173736 non-null  object 
 3   Union       173736 non-null  object 
 4   Mouza       173736 non-null  object 
 5   Depth       173736 non-null  float64
 6   Arsenic     173736 non-null  float64
 7   Prediction  173736 non-null  object 
 8   Label       173736 non-null  object 
dtypes: float64(2), object(7)
memory usage: 11.9+ MB
     Division     District      Upazila                 Union        Mouza  \
0      Sylhet  Maulvibazar    Kamalganj  Kamalganj Paurashava   Gopalnagar   
1      Khulna    Jhenaidah  Kotchandpur             Sabdalpur    Sabdalpur   
2  Chittagong     Noakhali     Chatkhil         Ramnarayanpur    Madhabpur   
3       Dhaka      Tangail      Delduar                Elasin  

In [16]:
# Evaluate the 'Prediction' column of test_df and print the resulting metrics.
eval_m5 = gen_eval(test_df)
print_eval(eval_m5)

accuracy: 0.8533234332550537
precision: 0.7371934604904632
specificity: 0.9091244572514349

sensitivity: 0.7000754716981132
f1_score: 0.7181551733672509


In [36]:
import pandas as pd
import sys
import os
import geopandas as gpd
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler 

from model_utils.utils import cat_int_enc, gen_labels, conv_cat_num, conv_cat_str, load_k_train, stratify, append_test_train, gen_centroids, split_test_train
from model_utils.evaluator import gen_eval, print_eval 

def gen_predictions(train_df, test_df):
    """Train an MLP classifier on train_df and predict a class for each test_df row.

    Both frames are copied, combined with append_test_train, given 'lon'/'lat'
    columns from gen_centroids (using the mouza geojson), stripped of the five
    place-name columns, passed through cat_int_enc, min-max scaled and split
    apart again. The classifier is fitted on whatever columns remain once the
    target and depth-related columns are dropped.

    Args:
        train_df: training wells; expected to carry the five place-name
            columns, 'Depth', 'Arsenic' and 'Label'.
        test_df: wells to predict for, with the same columns.

    Returns:
        The 'Prediction' column of the processed test frame after
        conv_cat_str has been applied to it (presumably turning the numeric
        classes back into label strings - confirm in model_utils.utils).
    """
    train = train_df.copy()
    test = test_df.copy()
    gdf = gpd.read_file('./geodata/mou/mou-c005-s010-vw-pr.geojson')

    # Process test and train as one frame so encoding and scaling are shared.
    tt_df = append_test_train(test, train)
    tt_df['lon'], tt_df['lat'] = gen_centroids(tt_df, gdf)
    tt_df = tt_df.drop(
        columns=[
            'Division',
            'District',
            'Union',
            'Upazila',
            'Mouza'
        ]
    )
    tt_df.info()
    cat_int_enc(tt_df)
    # NOTE(review): the scaler is fitted on test and train rows together and is
    # applied to every remaining column, 'Label' included.
    tt_df = pd.DataFrame(MinMaxScaler().fit_transform(tt_df), columns=tt_df.columns)

    test, train = split_test_train(tt_df)

    conv_cat_num(train, 'Label')
    conv_cat_num(test, 'Label')

    test.info()

    stratify(test)
    stratify(train)

    # Drop the same non-feature columns from both frames so the classifier is
    # fitted and queried on identical inputs. 'Prediction' only exists when the
    # caller's frames already carry one, hence errors='ignore'.
    non_features = ['Arsenic', 'Label', 'Prediction', 'Strata', 'Depth']
    train_X = train.drop(columns=non_features, errors='ignore')
    train_y = train['Label']
    test_X = test.drop(columns=non_features, errors='ignore')

    clf = MLPClassifier(
        solver='adam',
        alpha=0.0001,
        hidden_layer_sizes=(250, 50),
        # Has no effect with solver='adam'; scikit-learn only uses it for 'sgd'.
        learning_rate='adaptive',
        random_state=99,
        max_iter=100,
        verbose=True,
        # Has no effect unless early_stopping=True is also passed.
        validation_fraction=0.4,
        activation='relu'
    )
    clf.fit(train_X, train_y)
    test['Prediction'] = clf.predict(test_X)

    conv_cat_str(test, 'Prediction')
    return test['Prediction']

def m9_highlypoll(
  test_src='./well_data/k1.csv',
  k_fold=1,
):
    """Label, predict and evaluate one test file, printing the metrics.

    Args:
        test_src: path of the CSV holding the wells to predict for.
        k_fold: value passed to load_k_train to obtain the training frame.

    Returns:
        The test DataFrame with 'Label' and 'Prediction' columns added.
    """
    train_df = load_k_train(k_fold)
    test_df = pd.read_csv(test_src)

    # Same binary ground truth for both frames: Arsenic > 50 is 'polluted'.
    for wells in (train_df, test_df):
        wells['Label'] = np.where(wells['Arsenic'] > 50, 'polluted', 'safe')

    predicted = gen_predictions(train_df, test_df)
    test_df['Prediction'] = predicted

    evaluation = gen_eval(test_df)
    print_eval(evaluation)

    return test_df

# Run the experiment with its default arguments; as the cell's last
# expression, the returned DataFrame is also displayed.
m9_highlypoll()


  gdf['lon'] = gdf.centroid.x

  gdf['lat'] = gdf.centroid.y


<class 'pandas.core.frame.DataFrame'>
Int64Index: 868678 entries, 0 to 173734
Data columns (total 6 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   Depth    868678 non-null  float64
 1   Arsenic  868678 non-null  float64
 2   Label    868678 non-null  object 
 3   tid      868678 non-null  int64  
 4   lon      868678 non-null  float64
 5   lat      868678 non-null  float64
dtypes: float64(4), int64(1), object(1)
memory usage: 46.4+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 173736 entries, 0 to 173735
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   Depth    173736 non-null  float64
 1   Arsenic  173736 non-null  float64
 2   Label    173736 non-null  float64
 3   lon      173736 non-null  float64
 4   lat      173736 non-null  float64
dtypes: float64(5)
memory usage: 8.0 MB
Iteration 1, loss = 0.58024012
Iteration 2, loss = 0.58004368
Iteration 3, loss = 0.58002803

Unnamed: 0,Division,District,Upazila,Union,Mouza,Depth,Arsenic,Label,Prediction
0,Sylhet,Maulvibazar,Kamalganj,Kamalganj Paurashava,Gopalnagar,54.9,10.0,safe,polluted
1,Khulna,Jhenaidah,Kotchandpur,Sabdalpur,Sabdalpur,27.4,10.0,safe,polluted
2,Chittagong,Noakhali,Chatkhil,Ramnarayanpur,Madhabpur,13.7,60.0,polluted,polluted
3,Dhaka,Tangail,Delduar,Elasin,Musuria,22.9,0.0,safe,polluted
4,Dhaka,Tangail,Delduar,Elasin,Momin Nagar,24.4,10.0,safe,polluted
...,...,...,...,...,...,...,...,...,...
173731,Khulna,Jessore,Keshabpur,Keshabpur,Brahmakati,54.9,200.0,polluted,polluted
173732,Khulna,Chuadanga,Jiban Nagar,Uthali,Madhabkhali,21.3,25.0,safe,polluted
173733,Dhaka,Faridpur,Boalmari,Shekhar,Bayra Bamangati,59.4,60.0,polluted,polluted
173734,Sylhet,Sunamganj,Jamalganj,Sachna Bazar,Sujatpur,137.2,25.0,safe,polluted
