In [None]:
import sqlite3
import pandas as pd
import os
import math
import numpy as np
"""This code is used to create sampled datasets of the classification 2 and 3 in order to evaluate
the false positive rate and the false negative rate. We have to label the success of the classification
by hand before computation."""

# Number of rows drawn for each evaluation sample.
size_sample=100
# Create a SQL connection to our SQLite database
# (the path is relative, so it is resolved against the current working directory).
con = sqlite3.connect("DATABASES/project.db")

# NOTE(review): this cursor is not used by any cell visible in this notebook - confirm whether it is still needed.
cur = con.cursor()

# Classification 2 and 3 sampling

In [None]:

# Read both classification tables in full and promote their `id` column to
# the index (set_index removes the column from the frame and keeps "id" as
# the index name).
class_2_df = pd.read_sql_query(
    "SELECT * from classification_2_matching_ads", con
).set_index("id")
class_3_df = pd.read_sql_query(
    "SELECT * from classification_3_matching_ads", con
).set_index("id")


In [None]:
# One NaN per sampled row: placeholder for the labels that are filled in by hand.
nan_ar = np.full(size_sample, np.nan)

# Row filters on `ids_matching`: rows equal to '-1' or '-2' versus all other rows.
# NOTE(review): presumably '-1' / '-2' mark ads without a match - confirm.
unmatched_rows = "ids_matching == '-1' | ids_matching == '-2'"
matched_rows = "ids_matching != '-1' & ids_matching != '-2'"

# False negatives: sampled among the rows flagged '-1' / '-2'.
c2_mod_fn = class_2_df.query(unmatched_rows).sample(size_sample).assign(verification=nan_ar)
c3_mod_fn = class_3_df.query(unmatched_rows).sample(size_sample).assign(verification=nan_ar)

# False positives: sampled among all the remaining rows.
c2_mod_fp = class_2_df.query(matched_rows).sample(size_sample).assign(verification=nan_ar)
c3_mod_fp = class_3_df.query(matched_rows).sample(size_sample).assign(verification=nan_ar)

In [None]:
# Persist every sample under its own table name.
# Each table is written independently: with if_exists="fail" pandas raises
# ValueError when the table is already there, and that must neither prevent
# the remaining tables from being written nor overwrite hand-made labels.
# Any other error (locked database, bad schema, ...) is left to propagate
# instead of being reported as "already exists".
for table_name, sample_df in (
    ("classification_2_eval_fn", c2_mod_fn),
    ("classification_3_eval_fn", c3_mod_fn),
    ("classification_2_eval_fp", c2_mod_fp),
    ("classification_3_eval_fp", c3_mod_fp),
):
    try:
        sample_df.to_sql(table_name, con, if_exists="fail")
    except ValueError:
        print(f"Sampling already exists: {table_name}")

# Balanced sampling through species

In [None]:
from collections import Counter

# Weight the sampling by the logarithm of the inverse frequency of each assignation.

def balanced_sample(df, size_sample, random_state=None):
    """Draw a sample that favours rows assigned to rare classes.

    Every value of ``df["ids_matching"]`` is a ';'-separated list of integer
    class ids. The relative frequency of each id is computed over all the ids
    of all the rows; the weight of a row is ``log(1 / s)`` where ``s`` is the
    sum of the relative frequencies of the ids listed in that row. Rows are
    then drawn (without replacement) with a probability proportional to
    their weight.

    Degenerate weights are handled instead of crashing in ``sample``:
    negative weights (``s > 1``, only possible when an id is repeated inside
    one row) are clipped to 0, and when every weight is 0 (e.g. a single
    class in the whole frame) the draw falls back to a uniform one.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column ``ids_matching``. It is not modified.
    size_sample : int
        Number of rows to draw.
    random_state : optional
        Forwarded to ``DataFrame.sample``; pass a seed to get a reproducible
        draw. The default (None) keeps the draw non-deterministic.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with two extra columns: ``weights`` (the sampling
        weight of the row) and ``verification`` (all NaN, to be filled in by
        hand).

    Raises
    ------
    ValueError
        If an id cannot be parsed as an integer, or (from
        ``DataFrame.sample``) if ``size_sample`` exceeds the number of rows
        that can be drawn.
    """
    # Parse each row once; the parsed ids feed both the counts and the weights.
    ids_per_row = df["ids_matching"].apply(
        lambda cell: [int(token) for token in cell.split(";")]
    )
    freq_class = Counter(class_id for ids in ids_per_row for class_id in ids)
    total_ids = sum(freq_class.values())

    # Logarithm of the inverse of the summed relative frequencies of the row's ids.
    weights = (
        ids_per_row.apply(
            lambda ids: math.log(
                1 / sum(freq_class[class_id] / total_ids for class_id in ids)
            )
        )
        .astype(float)
        .clip(lower=0)  # sample() rejects negative weights
    )

    weighted = df.assign(weights=weights)
    if weights.sum() > 0:
        sampled = weighted.sample(
            size_sample, weights="weights", random_state=random_state
        )
    else:
        # All weights are zero: sample() cannot normalise them, draw uniformly.
        sampled = weighted.sample(size_sample, random_state=random_state)

    # Empty column for the manual labelling.
    return sampled.assign(verification=np.nan)
# Draw one balanced sample per classification (classification 2 first, then 3).
balanced_s_class2, balanced_s_class3 = (
    balanced_sample(frame, size_sample) for frame in (class_2_df, class_3_df)
)

In [None]:
# Write the balanced samples into the SQL database.
# Each table is written independently: with if_exists="fail" pandas raises
# ValueError when the table is already there, and that must neither prevent
# the other table from being written nor overwrite hand-made labels.
# Any other error is left to propagate instead of being reported as
# "already exists".
for table_name, sample_df in (
    ("classification_2_eval", balanced_s_class2),
    ("classification_3_eval", balanced_s_class3),
):
    try:
        sample_df.to_sql(table_name, con, if_exists="fail")
    except ValueError:
        print(f"Sampling already exists: {table_name}")

# Rates computation

In [None]:
# Reload the hand-labelled samples, in the order FN/FP for classification 2
# and then FN/FP for classification 3.
eval_queries = (
    "SELECT * from classification_2_eval_fn",
    "SELECT * from classification_2_eval_fp",
    "SELECT * from classification_3_eval_fn",
    "SELECT * from classification_3_eval_fp",
)
filled_2_fn, filled_2_fp, filled_3_fn, filled_3_fp = (
    pd.read_sql_query(query, con) for query in eval_queries
)

# Each rate is the sum of the `verification` column divided by 2*size_sample;
# rows still unlabelled (NaN) are skipped by sum().
# NOTE(review): every table is sampled with size_sample rows, so the factor 2
# presumably relates the count to the FN and FP samples taken together - confirm.
print(f"""
Classification 2 | FP rate = {filled_2_fp["verification"].sum()/(2*size_sample)}, FN rate = {filled_2_fn["verification"].sum()/(2*size_sample)}\n
Classification 3 | FP rate = {filled_3_fp["verification"].sum()/(2*size_sample)}, FN rate = {filled_3_fn["verification"].sum()/(2*size_sample)}\n
""")

In [None]:
# Be sure to close the connection
# (run this cell last: every cell that reads from or writes to the database uses `con`).
con.close()