In [98]:
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.metrics import roc_auc_score, roc_curve, average_precision_score
import shapely
import random
import rasterio
import json
import os
import math

In [32]:
# Train/test splits; the functions below read their 'source' and
# 'species_vector_str' columns.
train = pd.read_csv('zones/train.csv')
test = pd.read_csv('zones/test.csv')

# Species metadata; 'species_order' is the list used further down to locate a
# species' position inside each row's species vector string.
# Opened via a context manager so the file handle is closed deterministically.
with open('bioclip/species_vector_mapping.json', 'r') as mapping_file:
    vector_mapping = json.load(mapping_file)

In [62]:
# District name -> zone name. Written as zone -> [districts] groups and
# flattened, so each zone label is spelled exactly once. Group and district
# order are kept so the resulting dict has the same key order as before.
district_to_zone = {
    district: zone
    for zone, districts in [
        ('Arid Western Plain and Hyper Arid Partial irrigated',
         ['Jaisalmer', 'Jodhpur', 'Phalodi', 'Balotra', 'Bikaner', 'Barmer']),
        ('Irrigated North Western Plain',
         ['Hanumangarh']),
        ('Transitional plain zone of Inland drainage',
         ['Jhunjhunu', 'Nagaur', 'Sikar', 'Churu']),
        ('Transitional plain zone of Luni Basin',
         ['Jalore', 'Pali']),
        ('Semi arid eastern plain',
         ['Jaipur', 'Ajmer', 'Dausa', 'Tonk']),
        # 'Sawai' is a deliberate extra key: district names are messed up in
        # parts of the data.
        ('Flood prone eastern plain',
         ['Kotputli-Behror', 'Sawai Madhopur', 'Bharatpur', 'Karauli', 'Alwar', 'Sawai', 'Deeg']),
        ('Sub humid southern plain and alluvial hill',
         ['Chittorgarh', 'Bhilwara', 'Udaipur']),
        ('Southern humid plain',
         ['Pratapgarh']),
        ('South eastern humid plain',
         ['Bundi', 'Kota']),
    ]
    for district in districts
}

In [24]:
def get_summary(data: pd.DataFrame, zone_map=None):
    """Tabulate how many rows of ``data`` fall in each zone.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a ``source`` column of district names.
    zone_map : dict, optional
        District name -> zone name. Defaults to the notebook-level
        ``district_to_zone``.

    Returns
    -------
    pd.DataFrame
        Columns ``Zone``, ``Count``, ``Proportion``. Row 0 is ``'Total'``
        (proportion 1.0); the remaining rows are zones, indexed in order of
        first appearance in ``data``. The frame is returned sorted by
        ``Proportion`` descending.

    Raises
    ------
    KeyError
        If a ``source`` value is not a key of the mapping.
    """
    if zone_map is None:
        zone_map = district_to_zone

    # Single pass over the column; dict insertion order records the order in
    # which zones are first seen, which determines the row index labels.
    counts = {}
    for district in data['source']:
        zone = zone_map[district]  # unmapped districts fail loudly (KeyError)
        counts[zone] = counts.get(zone, 0) + 1

    total = sum(counts.values())

    # Build all records first and construct the frame once, rather than
    # concatenating a one-row frame per zone.
    records = [{'Zone': 'Total', 'Count': total, 'Proportion': 1.0}]
    records.extend(
        {'Zone': zone, 'Count': count, 'Proportion': count / total}
        for zone, count in counts.items()
    )

    summary = pd.DataFrame(records, columns=['Zone', 'Count', 'Proportion'])
    return summary.sort_values('Proportion', ascending=False)

In [25]:
# Per-zone row counts and proportions for `train` (read from zones/train.csv).
get_summary(train)

Unnamed: 0,Zone,Count,Proportion
0,Total,11700,1.0
2,Transitional plain zone of Inland drainage,5359,0.458034
1,Semi arid eastern plain,2014,0.172137
3,Transitional plain zone of Luni Basin,1214,0.103761
4,Flood prone eastern plain,961,0.082137
6,Sub humid southern plain and alluvial hill,745,0.063675
5,South eastern humid plain,505,0.043162
7,Southern humid plain,428,0.036581
8,Arid Western Plain and Hyper Arid Partial irri...,403,0.034444
9,Irrigated North Western Plain,71,0.006068


In [26]:
# Per-zone row counts and proportions for `test` (read from zones/test.csv).
get_summary(test)

Unnamed: 0,Zone,Count,Proportion
0,Total,1465,1.0
3,Transitional plain zone of Inland drainage,668,0.455973
4,Semi arid eastern plain,250,0.170648
1,Transitional plain zone of Luni Basin,192,0.131058
2,Flood prone eastern plain,104,0.07099
8,Sub humid southern plain and alluvial hill,90,0.061433
5,South eastern humid plain,61,0.041638
6,Arid Western Plain and Hyper Arid Partial irri...,53,0.036177
7,Southern humid plain,40,0.027304
9,Irrigated North Western Plain,7,0.004778


In [54]:
def get_summary_by_species(data: pd.DataFrame, species: str, zone_map=None, species_order=None):
    """Tabulate, per zone, the rows of ``data`` in which ``species`` is flagged.

    A row counts when the character at offset ``2 * position`` of its
    ``species_vector_str`` is ``"1"``, where ``position`` is the index of
    ``species`` in ``species_order``.
    NOTE(review): the factor 2 presumably reflects single-character flags
    joined by a one-character separator -- confirm against the file that
    produces ``species_vector_str``.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain ``source`` (district name) and ``species_vector_str``.
    species : str
        Species name exactly as it appears in ``species_order``.
    zone_map : dict, optional
        District name -> zone name. Defaults to the notebook-level
        ``district_to_zone``.
    species_order : list, optional
        Ordered species names. Defaults to ``vector_mapping['species_order']``.

    Returns
    -------
    pd.DataFrame
        Columns ``Zone``, ``Count``, ``Proportion``. Row 0 is ``'Total'``
        (proportion 1.0); zone rows are indexed in order of first appearance
        among the matching rows. Sorted by ``Proportion`` descending.

    Raises
    ------
    ValueError
        If ``species`` is not in ``species_order``.
    KeyError
        If a matching row's ``source`` is not a key of the mapping.
    """
    if zone_map is None:
        zone_map = district_to_zone
    if species_order is None:
        species_order = vector_mapping['species_order']

    flag_offset = 2 * species_order.index(species)

    # Single pass; dict insertion order records first-appearance order of zones.
    counts = {}
    for district, vector in zip(data['source'], data['species_vector_str']):
        if vector[flag_offset] != "1":
            continue
        zone = zone_map[district]  # unmapped districts fail loudly (KeyError)
        counts[zone] = counts.get(zone, 0) + 1

    total = sum(counts.values())

    # A zone row only exists when total >= 1, so the division is always safe.
    records = [{'Zone': 'Total', 'Count': total, 'Proportion': 1.0}]
    records.extend(
        {'Zone': zone, 'Count': count, 'Proportion': count / total}
        for zone, count in counts.items()
    )

    summary = pd.DataFrame(records, columns=['Zone', 'Count', 'Proportion'])
    return summary.sort_values('Proportion', ascending=False)

In [59]:
# Species names as passed to get_summary_by_species (looked up verbatim in
# vector_mapping['species_order'], so the casing here is significant).
all_species = ['Azadirachta indica', 'Vachellia nilotica', 'Ailanthus excelsa', 'Prosopis cineraria', 'prosopis juliflora']

# Create the output directory up front so a fresh checkout can run this cell.
counts_dir = 'zones/results/counts'
os.makedirs(counts_dir, exist_ok=True)

get_summary(test).to_csv(f'{counts_dir}/all_test.csv', index=False)
get_summary(train).to_csv(f'{counts_dir}/all_train.csv', index=False)

for species in all_species:
    # Computed outside the f-string: reusing the outer quote character inside
    # an f-string replacement field only parses on Python 3.12+.
    species_slug = species.lower().replace(' ', '_')
    get_summary_by_species(test, species).to_csv(f'{counts_dir}/{species_slug}_test.csv', index=False)
    get_summary_by_species(train, species).to_csv(f'{counts_dir}/{species_slug}_train.csv', index=False)

In [112]:
def get_stats(guesses: pd.DataFrame, zone_map=None):
    """Score a predictions table overall and per zone.

    Parameters
    ----------
    guesses : pd.DataFrame
        Must contain ``pixel_id``, ``true_label``, ``probability`` and
        ``predicted``. Rows whose ``pixel_id`` is missing are skipped. The
        district is taken as the part of ``pixel_id`` before the first ``'_'``.
    zone_map : dict, optional
        District name -> zone name. Defaults to the notebook-level
        ``district_to_zone``.

    Returns
    -------
    pd.DataFrame
        Columns ``Zone``, ``Accuracy``, ``AUC``, ``AP``, ``Count``,
        ``Positive Samples``. Row 0 is ``'Total'``; zone rows are indexed in
        order of first appearance. Sorted by ``Count`` descending.
        ``Count`` is an integer column.

    Raises
    ------
    KeyError
        If a district prefix is not a key of the mapping.

    Notes
    -----
    When a group has no positives or only positives, ROC AUC and average
    precision are undefined; ``0.0`` is written as a placeholder in that case,
    so a 0.0 in those columns must be read together with ``Positive Samples``.
    """
    if zone_map is None:
        zone_map = district_to_zone

    # zone -> (true labels, probabilities, hard predictions); 'Total' is
    # created first so it always gets index label 0.
    per_zone = {'Total': ([], [], [])}
    rows = zip(guesses['pixel_id'], guesses['true_label'],
               guesses['probability'], guesses['predicted'])
    for pixel_id, true_label, probability, predicted in rows:
        if pd.isna(pixel_id):
            continue

        zone = zone_map[pixel_id.partition('_')[0]]
        for key in (zone, 'Total'):
            labels, probabilities, predictions = per_zone.setdefault(key, ([], [], []))
            labels.append(true_label)
            probabilities.append(probability)
            predictions.append(predicted)

    records = []
    for zone, (labels, probabilities, predictions) in per_zone.items():
        count = len(labels)
        positives = sum(labels)

        if positives == 0 or positives == count:
            # Single-class group: ranking metrics are undefined (see Notes).
            roc_auc = 0.0
            auprc = 0.0
        else:
            roc_auc = roc_auc_score(labels, probabilities)
            auprc = average_precision_score(labels, probabilities)

        if count:
            accuracy = np.sum(np.array(labels) == np.array(predictions)) / count
        else:
            # Only reachable for 'Total' on an empty table; avoids a 0/0 warning.
            accuracy = float('nan')

        records.append({
            'Zone': zone,
            'Accuracy': accuracy,
            'AUC': roc_auc,
            'AP': auprc,
            'Count': count,
            'Positive Samples': positives,
        })

    # Built in one go: concatenating onto an empty frame made the count
    # columns float and relies on deprecated concat behaviour.
    summary = pd.DataFrame(
        records,
        columns=['Zone', 'Accuracy', 'AUC', 'AP', 'Count', 'Positive Samples'],
    )
    return summary.sort_values('Count', ascending=False)

In [88]:
# Spot check of get_stats on a single predictions file.
# NOTE(review): this is an absolute, machine-specific path, whereas the batch
# cell below reads from '../../NUTMGS/DATA/...'; confirm whether both point at
# the same tree and switch to the relative form if so.
predictions_csv = '/Users/Hugo/Coding_Projects/Sidd Lab/NUTMGS/DATA/fig4/human_1000/Ailanthus_excelsa/species_Ailanthus_excelsa/Ailanthus_excelsa_predictions.csv'
guesses = pd.read_csv(predictions_csv)
get_stats(guesses)

Unnamed: 0,Zone,Accuracy,AUC,AP,Count,Positive Samples
0,Total,0.545392,0.735574,0.201223,1465.0,140.0
3,Transitional plain zone of Inland drainage,0.315868,0.585107,0.221778,668.0,121.0
4,Semi arid eastern plain,0.584,0.555322,0.092158,250.0,12.0
1,Transitional plain zone of Luni Basin,0.838542,0.365027,0.020467,192.0,4.0
2,Flood prone eastern plain,0.903846,0.0,0.0,104.0,0.0
8,Sub humid southern plain and alluvial hill,0.677778,0.923372,0.547619,90.0,3.0
5,South eastern humid plain,0.852459,0.0,0.0,61.0,0.0
6,Arid Western Plain and Hyper Arid Partial irri...,0.584906,0.0,0.0,53.0,0.0
7,Southern humid plain,0.975,0.0,0.0,40.0,0.0
9,Irrigated North Western Plain,0.571429,0.0,0.0,7.0,0.0


In [114]:
# Species name -> directory name tried first under each experiment directory.
s2s = {
    'Azadirachta indica': 'Azadirachta_Indica', 
    'Vachellia nilotica': 'Vachellia_nilotica', 
    'Ailanthus excelsa': 'Ailanthus_Excelsa', 
    'Prosopis cineraria': 'Prosopis_cineraria', 
    'prosopis juliflora': 'Prosopis_Juliflora'
}

# Experiment directories (relative to ../../NUTMGS/DATA) to score.
to_analyze = (
    ['table5/s2']
    + ['fig4/human_1000', 'fig4/human_11700', 'fig4/machine_1000', 'fig4/machine_11700']
    + [f'fig3/{i}' for i in range(100, 3400, 100)]
)

# The loop variable is deliberately not called `dir`: at notebook top level
# that would replace the builtin for every later cell.
for experiment_dir in to_analyze:
    for species in all_species:
        # Computed outside the f-strings: reusing the outer quote character
        # inside a replacement field only parses on Python 3.12+.
        species_slug = species.replace(' ', '_')

        # Directory naming is inconsistent, so try the s2s spelling first and
        # fall back to the plain underscore-joined species name.
        candidates = [
            f'../../NUTMGS/DATA/{experiment_dir}/{s2s[species]}/species_{species_slug}/{species_slug}_predictions.csv',
            f'../../NUTMGS/DATA/{experiment_dir}/{species_slug}/species_{species_slug}/{species_slug}_predictions.csv',
        ]
        predictions_path = next((path for path in candidates if os.path.exists(path)), None)
        if predictions_path is None:
            # Best effort: this experiment/species pair is skipped when
            # neither predictions file exists.
            continue

        guesses = pd.read_csv(predictions_path)
        output_dir = f'zones/results/accuracy/{experiment_dir}/'
        os.makedirs(output_dir, exist_ok=True)
        get_stats(guesses).to_csv(f'{output_dir}{species_slug.lower()}.csv', index=False)