In [None]:
import sqlite3
import pandas as pd
import math
import numpy as np
"""
Ce script crée des sous-échantillons des classifications 2 et 3 avec une pondération
suivant la règle log(1/sum(freq(assignation))), ce qui permet d'obtenir les assignations d'espèces rares
en quantité plus raisonnable tout en rendant le tirage des assignations très fréquentes proportionnel
à leur fréquence d'apparition. Cela permet d'évaluer de manière plus complète les cas particuliers de la classification
avec un petit échantillon labellisé à la main. Le script calcule les statistiques d'évaluation une fois que
les tables ont été remplies à la main avec le score de classement de chaque annonce.
"""

# Number of rows drawn into each evaluation sample
size_sample=100
# Create a SQL connection to our SQLite database
# (the path is relative, so it is resolved against the current working directory)
con = sqlite3.connect("DATABASES/project.db")

# NOTE(review): `cur` is not used by any of the visible cells — confirm before removing
cur = con.cursor()

# Load dataframes

In [None]:

# Read both classification result tables and key each frame by its `id` column
# (set_index moves `id` out of the columns and into the index in one step)
class_2_df = pd.read_sql_query(
    "SELECT * from classification_2_matching_ads", con
).set_index("id")
class_3_df = pd.read_sql_query(
    "SELECT * from classification_3_matching_ads", con
).set_index("id")


# Balanced sampling through species

In [None]:
from collections import Counter

# Weight the sampling by the logarithm of the inverse frequency of each assignation.

def balanced_sample(df, size_sample, random_state=None):
    """Draw a sample in which rows made of rare assignations are favoured.

    Every ``ids_matching`` cell is parsed as a ";"-separated list of integer
    ids. The relative frequency of each id is measured over all rows, and a row
    is weighted by ``log(1 / sum(freq(id) for id in row))``. Rows are then drawn
    without replacement according to those weights.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``ids_matching`` column of ";"-separated integers.
        It is not modified; the function works on a copy.
    size_sample : int
        Number of rows to draw.
    random_state : int, numpy random generator or None, optional
        Forwarded to ``DataFrame.sample``. Pass a fixed value to make the draw
        reproducible; the default (None) keeps the draw unseeded.

    Returns
    -------
    pandas.DataFrame
        The sampled rows plus two columns: ``weights`` (the sampling weight of
        each row) and ``verification`` (all NaN, meant to be filled in later).

    Raises
    ------
    ValueError
        If an id cannot be parsed as an integer, or propagated from
        ``DataFrame.sample`` (e.g. ``size_sample`` larger than the number of
        rows that can be drawn).
    """
    df = df.copy()

    # Parse each cell once; the parsed ids feed both the counts and the weights.
    ids_per_row = [
        [int(token) for token in cell.split(";")] for cell in df["ids_matching"]
    ]
    freq_class = Counter(class_id for ids in ids_per_row for class_id in ids)
    total = sum(freq_class.values())

    # log(1 / sum(count_i / total)) == log(total / sum(count_i)). Working on the
    # integer counts avoids float round-off that could push a weight that should
    # be exactly 0 slightly below 0, which DataFrame.sample rejects.
    df["weights"] = pd.Series(
        [
            math.log(total / sum(freq_class[class_id] for class_id in ids))
            for ids in ids_per_row
        ],
        index=df.index,
        dtype=float,
    )

    # When every row carries the whole frequency mass (e.g. a single class),
    # all weights are log(1) = 0 and a weighted draw is impossible: fall back
    # to a uniform draw instead of failing.
    all_zero = bool((df["weights"] == 0).all())
    df = df.sample(
        size_sample,
        weights=None if all_zero else "weights",
        random_state=random_state,
    )

    # Empty column to be filled in by hand once the sample is stored.
    df["verification"] = np.nan

    return df
# Draw one evaluation sample per classification table.
# NOTE: the draw is random, so re-running this cell produces different samples.
balanced_s_class2=balanced_sample(class_2_df, size_sample)
balanced_s_class3=balanced_sample(class_3_df, size_sample)

In [None]:
# Write the samples into the SQL database.
# if_exists="fail" makes pandas raise ValueError when the table is already
# there, so an existing evaluation table is never overwritten by a new random
# draw. Each table is handled on its own so that one pre-existing table does
# not prevent the other one from being created, and only that expected
# ValueError is caught: any other failure propagates instead of being reported
# as "already exists".
try:
    balanced_s_class2.to_sql("classification_2_eval", con, if_exists="fail")
except ValueError:
    print("Sampling already exists: classification_2_eval")

try:
    balanced_s_class3.to_sql("classification_3_eval", con, if_exists="fail")
except ValueError:
    print("Sampling already exists: classification_3_eval")

# Rates computation

In [None]:
# Reload the stored evaluation samples from the database.
# NOTE(review): the rates below assume the `verification` column has been
# filled in by hand in these tables beforehand — confirm before running.
filled_2 = pd.read_sql_query("SELECT * from classification_2_eval", con)
filled_3 = pd.read_sql_query("SELECT * from classification_3_eval", con)

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


def print_stat(df) :
    """Print sensitivity, specificity, precision and error rates of a labelled sample.

    Rows whose ``ids_matching`` is ``'-1'`` or ``'-2'`` are counted as negative
    predictions, every other row as a positive prediction (presumably -1/-2
    mean "no match" — confirm against the classifier). ``verification`` is
    summed per group, so it is assumed to hold 1 for a correct row and 0 for an
    incorrect one.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``ids_matching`` column (strings) and a fully filled
        ``verification`` column convertible to int.

    Returns
    -------
    None
        The rates are printed, formatted with two decimals. A rate whose
        denominator is zero is printed as ``nan``.

    Raises
    ------
    ValueError
        If ``verification`` still contains missing values.
    """
    # Fail early with an explicit message instead of an obscure integer-cast
    # error when some rows have not been labelled yet.
    if df["verification"].isna().any():
        raise ValueError(
            "The 'verification' column contains unlabelled rows; "
            "fill it in before computing the rates"
        )

    verified = df["verification"].astype(int)
    is_negative = df["ids_matching"].isin(["-1", "-2"])
    is_positive = ~is_negative

    tp = int(verified[is_positive].sum())
    tn = int(verified[is_negative].sum())
    fp = int(is_positive.sum()) - tp
    fn = int(is_negative.sum()) - tn

    sensitivity = _safe_ratio(tp, tp + fn)
    specificity = _safe_ratio(tn, tn + fp)
    precision = _safe_ratio(tp, tp + fp)
    fpr = _safe_ratio(fp, fp + tn)
    fnr = _safe_ratio(fn, fn + tp)

    print(f"""
    \tSensibilité\t\t | {sensitivity:.2f}\n
    (Taux vrai positif)\n
    \tSpécificité\t\t | {specificity:.2f}\n
    (Taux vrai négatif)\n
    \tPrécision\t\t | {precision:.2f}\n
    \tTaux faux positif\t | {fpr:.2f}\n
    \tTaux faux négatif\t | {fnr:.2f}\n
    """)

# Print the evaluation rates of each hand-labelled sample, one classification at a time
print("Classification 2 :\n")
print_stat(filled_2)
print("Classification 3 :\n")
print_stat(filled_3)

In [None]:
# Be sure to close the connection
# (any cell that uses `con` afterwards needs the connection to be re-created first)
con.close()