In [None]:
import sys
sys.path.append("../")

In [None]:
# Length of the fixed-width prefix found at the start of a log line, measured
# on a sample prefix (timestamp, level, PID, "---" and the bracketed "main"
# field, plus the trailing space).
header_length = len("2022-10-05 11:56:15.580  INFO 8982 --- [           main] ")
header_length

In [None]:
# Width of the logger-name column that follows the prefix, measured on a
# sample value (the name plus its two trailing padding spaces).
event_length = len("f.i.sirene4.repertoire.BatchApplication  ")
event_length

In [None]:
first_line = "2022-10-05 11:56:15.580  INFO 8982 --- [           main] f.i.sirene4.repertoire.BatchApplication  : Starting BatchApplication v2.3.3 using Java 11.0.16.1 on qfbatrelst01.ad.insee.intra with PID 8982 (/opt/insee/sirene4/qf3/lib/repertoire-batch-2.3.3.jar started by www-data in /opt/insee/sirene4/qf3/tmp)"

In [None]:
# Drop the fixed-width prefix from the sample line, then slice out the
# logger-name column; rstrip() removes the padding at the end of that column.
line_without_timestamp = first_line[header_length:]
event_type = line_without_timestamp[:event_length]
event_type.rstrip()

In [None]:
# The message is what remains after the logger-name column and the two
# characters that follow it (": " in the sample line).
description = line_without_timestamp[event_length + 2:]
description

In [None]:
import pandas as pd

def extract_log_info(f):
    """Split each log line of ``f`` into an event type and a description.

    The first ``header_length`` characters of every line are discarded. Lines
    with nothing left after that are skipped. The next ``event_length``
    characters (right-stripped) become the event type, and everything after
    two more characters becomes the description.

    Args:
        f: Iterable of log lines (e.g. an open text file).

    Returns:
        pd.DataFrame: One row per kept line, with the columns ``event_type``
        and ``description``.
    """
    rows = []
    for line in f:
        payload = line[header_length:]
        if payload:
            rows.append(
                (payload[:event_length].rstrip(), payload[event_length + 2:])
            )
    return pd.DataFrame(rows, columns=['event_type', 'description'])

with open("../data/api_log.log") as f:
    df = extract_log_info(f)

In [None]:
df

In [None]:
# `event_type` values used in the next cells to select three kinds of log
# lines. They are compared verbatim with the column produced by
# extract_log_info, so they must match the logger names exactly as they are
# printed in the log.
identifier = "s.i.AbstractBatchCodificationServiceImpl"
log_info = "r.b.j.c.s.i.BatchCodificationServiceImpl"
raw_input = "stractLiasse1ToLiasseVarInteretProcessor"

In [None]:
df_ids = df[df.event_type == identifier]
df_ids.head()

In [None]:
df_info = df[df.event_type == log_info]
df_info.head()

In [None]:
df_ids.shape

In [None]:
df_info.shape

In [None]:
df_raw_input = df[df.event_type == raw_input]
df_raw_input.head()

In [None]:
df_raw_input = df_raw_input[df_raw_input.description.str.startswith("LiasseVarInteretCodification")]
df_raw_input.head()

In [None]:
df_raw_input.shape

In [None]:
test_str = df_raw_input.iloc[0, 1]
test_str

In [None]:
import re

regex = re.compile(r'norme=([^,]*),')
matches = regex.search(test_str)
matches.group(1)

In [None]:
regex = re.compile(r'liasseType=([^,]*),')
matches = regex.search(test_str)
matches.group(1)

In [None]:
regex

In [None]:
# Names of the `key=value` fields to pull out of a raw-input description.
raw_fields = [
    "norme",
    "siren",
    "nic",
    "liasseType",
    "categorieJuridique",
    "domas",
    "ssdom",
    "domaineAssoc",
    "ssDomaineAssoc",
    "libelleActivitePrincipaleEtablissement",
    "sedentarite",
    "natureActivites",
    "surface",
    "lieuExercice",
    "presenceSalaries"
]
# One compiled pattern per field: the value is everything after `field=` up to
# the next "," or "]". The suffix is a raw string so that `\]` reaches the
# regex engine as written instead of being an invalid string escape.
raw_regexes = [
    re.compile(re.escape(field) + r'=([^,]*)[,\]]') for field in raw_fields
]
raw_regexes

In [None]:
def parse_raw_input(raw_input, fields, regexes):
    """Extract named fields from one log description.

    Args:
        raw_input (str): Text to search.
        fields (list[str]): Field names, used as keys of the result.
        regexes (list[re.Pattern]): Compiled patterns, paired with ``fields``
            by position; the value of a field is capture group 1.

    Returns:
        dict: ``{field: value}``. A field whose pattern does not match is
        mapped to ``None`` instead of raising, so one incomplete log line does
        not abort the parsing of a whole file.
    """
    raw_input_dict = {}
    for field, regex in zip(fields, regexes):
        matches = regex.search(raw_input)
        # `search` returns None when the field is absent from the text.
        raw_input_dict[field] = matches.group(1) if matches is not None else None
    return raw_input_dict

In [None]:
parse_raw_input(test_str, raw_fields, raw_regexes)

In [None]:
dict_series = [parse_raw_input(raw_input, raw_fields, raw_regexes) for raw_input in df_raw_input.description]

In [None]:
pd.DataFrame(list(dict_series))

In [None]:
test_str = df_info.iloc[0, 1]
test_str

In [None]:
# Fields whose value runs up to the next comma.
comma_terminated_fields = [
    "libelleActivite",
    "natureActivites",
    "liasseType",
    "evenementType",
    "surface",
    "libelleNettoye",
    "predictions",
    "bilan",
]
# "fasttextVersion" is handled separately: its value is closed by "]" rather
# than by a comma.
info_fields = comma_terminated_fields + ["fasttextVersion"]

info_regexes = [
    re.compile(name + '=([^,]*),') for name in comma_terminated_fields
]
info_regexes += [re.compile(r'fasttextVersion=([^,]*)\]')]

info_regexes

In [None]:
parse_raw_input(test_str, info_fields, info_regexes)

In [None]:
# Compiled once. The lazy `.*?` and the `[^;\]]*` value classes keep every
# group inside its own prediction, so the first two predictions are returned
# even when the text holds more than two.
_PRED_PAIR_REGEX = re.compile(
    r'proposé = ([^;]*) ;.*?associée = ([^;\]]*)\]'
    r'.*?proposé = ([^;]*) ;.*?associée = ([^;\]]*)\]'
)


def extract_first_pred(predictions):
    """Return the first two (code, probability) pairs of a predictions text.

    The text is expected to contain at least two fragments of the form
    ``proposé = <code> ; ... associée = <probability>]``.

    Args:
        predictions (str): Predictions text taken from a log description.

    Returns:
        tuple: ``(first_code, second_code, first_proba, second_proba)`` with
        the codes as ``str`` and the probabilities as ``float``.

    Raises:
        ValueError: If fewer than two predictions are found, or if a
            probability is not a valid float.
    """
    matches = _PRED_PAIR_REGEX.search(predictions)
    if matches is None:
        raise ValueError(
            "could not find two predictions in: {!r}".format(predictions)
        )
    first_code, first_proba, second_code, second_proba = matches.groups()
    return (first_code, second_code, float(first_proba), float(second_proba))

extract_first_pred(parse_raw_input(test_str, info_fields, info_regexes)["predictions"])

In [None]:
dict_series = [parse_raw_input(info_input, info_fields, info_regexes) for info_input in df_info.description]

In [None]:
df = pd.DataFrame(list(dict_series))

In [None]:
# Each parsed prediction is a tuple
# (first_code, second_code, first_proba, second_proba); every column must read
# its own position of that tuple.
predictions = [
    extract_first_pred(raw_predictions) for raw_predictions in df["predictions"]
]
prediction_columns = ("first_pred", "second_pred", "first_proba", "second_proba")
for position, column in enumerate(prediction_columns):
    df[column] = [prediction[position] for prediction in predictions]

df

In [None]:
import pandas as pd

df = pd.read_csv("comparison.csv")

In [None]:
# Drop the rows whose raw label mentions "oeuvre". `na=False` makes a missing
# label (empty labels presumably come back as NaN from the CSV) count as "no
# match"; without it the mask contains NaN and cannot be negated with `~`.
df = df[~df.RAW.str.contains("oeuvre", na=False)]

In [None]:
df.RAW.iloc[7]

In [None]:
df.RAW.replace(
    to_replace=r"(\s|^)([a-z]{1})(?:\s|$)", value=r'\1', regex=True
).iloc[7]

In [None]:
df.RAW.replace(
    to_replace=r"\b([a-z]{1})\b", value=' ', regex=True
).iloc[7]

In [1]:
import sys
import pandas as pd
import unidecode
import re
import string
import numpy as np
from nltk.corpus import stopwords as ntlk_stopwords
from nltk.stem.snowball import SnowballStemmer
from enum import Enum

sys.path.append("../")

# Number of leading characters dropped from every log line before the
# event-type column is read (see extract_log_info).
# NOTE(review): presumably the width of the timestamp/level/PID/thread prefix
# of the parsed log file — confirm against the log itself.
HEADER_LEN = 58
# Width of the event-type column; extract_log_info also skips the two
# characters that follow it before reading the description.
EVENT_LEN = 41


class EventType(Enum):
    """
    Event types (logger names as they appear in the log) that are selected
    from the parsed log. The values are compared verbatim with the
    ``event_type`` column returned by ``extract_log_info``.
    """
    # Selects the lines stored in `df_ids`.
    ID = "s.i.AbstractBatchCodificationServiceImpl"
    # Selects the lines whose description is parsed with INFO_FIELDS /
    # INFO_REGEXES.
    INFO = "r.b.j.c.s.i.BatchCodificationServiceImpl"
    # Selects the raw-input lines (further restricted to descriptions starting
    # with "LiasseVarInteretCodification").
    RAW_INPUT = "stractLiasse1ToLiasseVarInteretProcessor"


def extract_log_info(f):
    """Split each log line of ``f`` into an event type and a description.

    The first ``HEADER_LEN`` characters of every line are discarded. Lines
    with nothing left after that are skipped. The next ``EVENT_LEN``
    characters (right-stripped) form the event type, and everything after two
    more characters forms the description.

    Args:
        f: Iterable of log lines (e.g. an open text file).

    Returns:
        pd.DataFrame: One row per kept line, with the columns ``event_type``
        and ``description``.
    """
    rows = []
    for line in f:
        payload = line[HEADER_LEN:]
        if payload:
            rows.append(
                (payload[:EVENT_LEN].rstrip(), payload[EVENT_LEN + 2:])
            )
    return pd.DataFrame(rows, columns=['event_type', 'description'])


def parse_raw_input(raw_input, fields, regexes):
    """Extract named fields from one log description.

    Args:
        raw_input (str): Text to search.
        fields (list[str]): Field names, used as keys of the result.
        regexes (list[re.Pattern]): Compiled patterns, paired with ``fields``
            by position. Group 1 holds the value; a pattern may expose an
            optional (quoted) group 1 and a fallback group 2.

    Returns:
        dict: ``{field: value}``. When group 1 matched, the value is group 1
        with surrounding double quotes stripped; otherwise it is group 2 when
        the pattern has one. A field that cannot be found is mapped to
        ``None`` instead of raising, so one incomplete log line does not abort
        the parsing of a whole file.
    """
    raw_input_dict = {}
    for field, regex in zip(fields, regexes):
        matches = regex.search(raw_input)
        if matches is None:
            # Field absent from this description.
            raw_input_dict[field] = None
        elif matches.group(1) is not None:
            raw_input_dict[field] = matches.group(1).strip('"')
        elif regex.groups >= 2:
            # Optional group 1 did not take part in the match: use the
            # fallback group.
            raw_input_dict[field] = matches.group(2)
        else:
            raw_input_dict[field] = None
    return raw_input_dict


# Compiled once. The lazy `.*?` and the `[^;\]]*` value classes keep every
# group inside its own prediction, so the first two predictions are returned
# even when the text holds more than two.
_PRED_PAIR_REGEX = re.compile(
    r'proposé = ([^;]*) ;.*?associée = ([^;\]]*)\]'
    r'.*?proposé = ([^;]*) ;.*?associée = ([^;\]]*)\]'
)


def extract_first_pred(predictions):
    """Return the first two (code, probability) pairs of a predictions text.

    The text is expected to contain at least two fragments of the form
    ``proposé = <code> ; ... associée = <probability>]``.

    Args:
        predictions (str): Predictions text taken from a log description.

    Returns:
        tuple: ``(first_code, second_code, first_proba, second_proba)`` with
        the codes as ``str`` and the probabilities as ``float``.

    Raises:
        ValueError: If fewer than two predictions are found, or if a
            probability is not a valid float.
    """
    matches = _PRED_PAIR_REGEX.search(predictions)
    if matches is None:
        raise ValueError(
            "could not find two predictions in: {!r}".format(predictions)
        )
    first_code, first_proba, second_code, second_proba = matches.groups()
    return (first_code, second_code, float(first_proba), float(second_proba))


def clean_lib(df, text_feature):
    """
    Normalise, stem and de-duplicate the words of one text column.

    The column is rewritten on ``df`` itself at every step, and ``df`` is also
    returned. Steps, in order: transliteration with ``unidecode``,
    lower-casing, regex clean-up (``e-`` prefix, punctuation, filler words),
    removal of one-character words, removal of digits, whitespace
    normalisation, replacement of empty labels by the string ``"NaN"``,
    stop-word removal, stemming and removal of repeated words.

    The function reads two module-level names that are not parameters:
    ``stemmer`` (its ``stem(word)`` method is called) and ``stopwords`` (used
    with ``in``). Both must be defined before the call.

    Args:
        df (pd.DataFrame): DataFrame holding the column. Every value of the
            column is passed to ``unidecode.unidecode`` and ``str.lower``.
        text_feature (str): Name of the text column to clean.

    Returns:
        df (pd.DataFrame): The same DataFrame object, with ``text_feature``
        cleaned.
    """
    # A long commented-out regex of "meaningless" labels (LibVideSens: "idem",
    # "voir ci dessus", "non precise", ...) and the matching commented-out
    # replace() call used to sit here. Per the original note, that filtering
    # must be done upstream of the model, on the Java side, so it is not
    # applied by this function.

    # Filler words removed from the labels.
    words_to_remove = r"\bcode\b|\bcadre\b|\bape\b|\bape[a-z]{1}\b|\bnaf\b|\binchangee\b|\binchnagee\b|\bkbis\b|\bk bis\b|\binchangees\b|\bnp\b|\binchange\b|\bnc\b|\bidem\b|\bxx\b|\bxxx\b|\baa\b|\baaa\b|\bidem cadre precedent\b|\bidem case\b|\binchanges\b|\bmo\b|\biem\b|\bci dessus\b|\bet\b"

    def drop_one_char_words(label):
        # Keep only the words of at least two characters.
        return ' '.join([word for word in label.split() if len(word) > 1])

    # Harmonise the encoding (mostly strips accents), then lower-case.
    for normalise in (unidecode.unidecode, str.lower):
        df[text_feature] = df[text_feature].map(normalise)

    regex_steps = (
        # Remove the hyphen of words such as "e-commerce".
        # NOTE(review): the pattern is not anchored, so every "e-" is affected.
        (r"e-", "e"),
        # Glue a standalone "e" to the following word ("e commerce").
        (r"\be\s", " e"),
        # Replace punctuation with a space.
        (r"[^\w\s]+", " "),
        # Remove the filler words (per the original note, this must also be
        # done in the Java preprocessing).
        (words_to_remove, ""),
    )
    for pattern, replacement in regex_steps:
        df[text_feature] = df[text_feature].replace(
            to_replace=pattern, value=replacement, regex=True
        )

    # Remove one-character words.
    df[text_feature] = df[text_feature].apply(drop_one_char_words)

    # Replace digits with a space (the character class also matches a
    # literal "+").
    df[text_feature] = df[text_feature].replace(
        to_replace=r"[\d+]", value=" ", regex=True
    )

    # Remove one-character words again: removing digits can leave new ones.
    df[text_feature] = df[text_feature].apply(drop_one_char_words)

    # Collapse repeated whitespace and strip the labels.
    df[text_feature] = df[text_feature].replace(r"\s\s+", " ", regex=True)
    df[text_feature] = df[text_feature].str.strip()

    # Empty labels become NaN, which is then replaced by the string "NaN"
    # (rows are kept rather than dropped).
    df[text_feature] = df[text_feature].replace(r"^\s*$", np.nan, regex=True)
    df[text_feature] = df[text_feature].fillna(value="NaN")

    # For every label: tokenise, drop the stop words, stem the remaining
    # words, then keep only the first occurrence of each stem.
    cleaned_labels = []
    for label in df[text_feature].to_list():
        stems = [
            stemmer.stem(word)
            for word in label.split()
            if word not in stopwords
        ]
        cleaned_labels.append(" ".join(dict.fromkeys(stems)))

    df[text_feature] = cleaned_labels

    return df


# Names of the `key=value` fields read from a raw-input description.
RAW_INPUT_FIELDS = [
    "norme",
    "siren",
    "nic",
    "liasseType",
    "categorieJuridique",
    "domas",
    "ssdom",
    "domaineAssoc",
    "ssDomaineAssoc",
    "libelleActivitePrincipaleEtablissement",
    "sedentarite",
    "natureActivites",
    "surface",
    "lieuExercice",
    "presenceSalaries"
]
# One pattern per raw-input field: the value runs up to the next "," or "]".
# The suffix is a raw string so that `\]` reaches the regex engine as written
# instead of being an invalid string escape.
RAW_INPUT_REGEXES = [
    re.compile(re.escape(field) + r'=([^,]*)[,\]]')
    for field in RAW_INPUT_FIELDS
]
# Fields of an INFO description whose value runs up to the next comma.
INFO_FIELDS = [
    "natureActivites",
    "liasseType",
    "evenementType",
    "surface",
    "libelleNettoye",
    "predictions",
    "bilan"
]
INFO_REGEXES = [
    re.compile(re.escape(field) + r'=([^,]*),')
    for field in INFO_FIELDS
]
# "fasttextVersion" is closed by "]" rather than by a comma.
INFO_FIELDS.append("fasttextVersion")
INFO_REGEXES.append(re.compile(r'fasttextVersion=([^,]*)\]'))
# "libelleActivite" may be wrapped in double quotes (and may then contain
# commas): group 1 captures the quoted form, group 2 the unquoted one.
INFO_FIELDS.append("libelleActivite")
INFO_REGEXES.append(re.compile(r'libelleActivite=(\"[^\"]*\")?([^,]*),'))


if __name__ == "__main__":
    # Parse the raw log into (event_type, description) rows.
    with open("../data/api_log.log") as f:
        df = extract_log_info(f)

    # One sub-frame per kind of log line; raw inputs are further restricted to
    # the descriptions starting with "LiasseVarInteretCodification".
    df_ids = df[df.event_type == EventType.ID.value]
    df_info = df[df.event_type == EventType.INFO.value]
    df_raw_input = df[df.event_type == EventType.RAW_INPUT.value]
    is_liasse_line = df_raw_input.description.str.startswith(
        "LiasseVarInteretCodification"
    )
    df_raw_input = df_raw_input[is_liasse_line]

    # Parse the INFO descriptions, skipping those containing an empty quoted
    # value (`""`).
    info_records = [
        parse_raw_input(info_input, INFO_FIELDS, INFO_REGEXES)
        for info_input in df_info.description
        if '""' not in info_input
    ]
    df = pd.DataFrame(info_records)

    # Split the logged predictions into one column per tuple position.
    predictions = [
        extract_first_pred(logged_predictions)
        for logged_predictions in df["predictions"]
    ]
    first_codes, second_codes, first_probas, second_probas = zip(*predictions)
    df["first_pred"] = list(first_codes)
    df["second_pred"] = list(second_codes)
    df["first_proba"] = list(first_probas)
    df["second_proba"] = list(second_probas)

    # Module-level names read by clean_lib.
    stemmer = SnowballStemmer(language="french")
    stopwords = tuple(ntlk_stopwords.words("french")) + tuple(string.ascii_lowercase)

    # Clean a copy of the labels in Python, so that `df` keeps the raw text.
    dff = df.copy()
    dff["libelleActivite"] = df["libelleActivite"].fillna(value="NaN")
    df_prepro = clean_lib(dff, "libelleActivite")

    lib_raw = df.libelleActivite.to_list()
    lib_clean_PY = df_prepro.libelleActivite.to_list()
    # The Java-cleaned label is the part of `libelleNettoye` before " AUTO".
    lib_clean_JAVA = [
        cleaned.split(" AUTO")[0] for cleaned in df.libelleNettoye
    ]

    # Side-by-side comparison of both cleanings, written to disk.
    compare_libs = pd.DataFrame(
        {"RAW": lib_raw, "PYTHON": lib_clean_PY, "JAVA": lib_clean_JAVA}
    )
    compare_libs["CHECK"] = compare_libs.PYTHON == compare_libs.JAVA
    compare_libs.to_csv("comparison.csv")


In [2]:
x = compare_libs[~compare_libs["CHECK"]]

In [9]:
x.to_csv("false.csv")

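Sources d'écart entre les libellés nettoyés en Python et en Java (lignes où `CHECK` est faux) :

1) Prétraitement des mots vides de sens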
2) ```"œ"``` pas décodé de la même manière
3) ```"€"``` pas décodé de la même manière
4) ```"Yeti"``` pas racinisé de la même manière
5) ```"Terassemment"``` pas racinisé de la même manière

In [4]:
import fasttext
model = fasttext.load_model("../models/model.bin")



In [6]:
# Score, with the Python fastText model, the Java-cleaned labels of the rows
# where the Python and Java cleanings agree (three predictions per label).
checked_labels = df[compare_libs.CHECK].libelleNettoye.to_list()
res = model.predict(checked_labels, k=3)

# res[0] holds the predicted labels and res[1] the matching probabilities,
# one entry per input label; the "__label__" prefix is removed from the labels.
predicted_labels, predicted_probas = res[0], res[1]

first_pred = [labels[0].replace("__label__", "") for labels in predicted_labels]
second_pred = [labels[1].replace("__label__", "") for labels in predicted_labels]
trois_pred = [labels[2].replace("__label__", "") for labels in predicted_labels]
first_proba = [probas[0] for probas in predicted_probas]
second_proba = [probas[1] for probas in predicted_probas]
trois_proba = [probas[2] for probas in predicted_probas]

In [7]:
# Rows where the Python and Java cleanings agree.
checked_rows = df[compare_libs.CHECK]
lib_clean = checked_rows.libelleNettoye.apply(lambda x : x.split(" AUTO")[0]).to_list()

# Python (fastText) predictions next to the predictions logged by Java.
df_compare = pd.DataFrame({
    "Lib": checked_rows.libelleNettoye,
    "pred_1_PY": first_pred,
    "pred_1_JAVA": checked_rows.first_pred,
    "pred_2_PY": second_pred,
    "pred_2_JAVA": checked_rows.second_pred,
    "prob_1_PY": first_proba,
    "prob_1_JAVA": checked_rows.first_proba,
    "prob_2_PY": second_proba,
    "prob_2_JAVA": checked_rows.second_proba,
    "pred_3_PY": trois_pred,
    "prob_3_PY": trois_proba,
})


In [8]:
df_compare

Unnamed: 0,Lib,pred_1_PY,pred_1_JAVA,pred_2_PY,pred_2_JAVA,prob_1_PY,prob_1_JAVA,prob_2_PY,prob_2_JAVA,pred_3_PY,prob_3_PY
0,coaching sportif AUTO_X NAT_SICORE_NaN SURF_Na...,8551Z,8551Z,9609Z,9609Z,0.994950,0.994950,0.002642,0.002642,8559B,0.000814
1,realis dessin anim AUTO_X NAT_SICORE_NaN SURF_...,9003A,9003A,1813Z,1813Z,0.615098,0.615098,0.334599,0.334599,5911A,0.152042
2,electricien electricit general AUTO_Z NAT_SICO...,4321A,4321A,4321B,4321B,1.000010,1.000010,0.000893,0.000893,9820Z,0.000010
3,prestat servic assist ressort britann tach adm...,6831Z,6831Z,8219Z,8219Z,0.148057,0.148057,0.140346,0.140346,8211Z,0.095359
4,agent commercial mandatair immobili AUTO_R NAT...,6831Z,6831Z,4619B,4619B,1.000010,1.000010,0.000367,0.000367,9820Z,0.000010
...,...,...,...,...,...,...,...,...,...,...,...
32859,realis seanc hypnos AUTO_C NAT_SICORE_99 SURF_...,9609Z,9609Z,8690F,8690F,0.971574,0.971574,0.050341,0.050341,4771Z,0.003604
32860,exploit boutiqu ecommerc recour acte commerc A...,4791A,4791A,4791B,4791B,0.476590,0.476590,0.055015,0.055015,6820A,0.006300
32861,preprat vent plat cuisin emport AUTO_C NAT_SIC...,5610C,5610C,4724Z,4724Z,0.314061,0.314061,0.013233,0.013233,5621Z,0.004342
32862,anim ateli stag spectacl improvis theatral clo...,9001Z,9001Z,8552Z,8552Z,0.999474,0.999474,0.766304,0.766304,9004Z,0.035155


In [47]:
labels = model.labels

In [57]:
pd.DataFrame([x.split("__label__")[1] for x in labels])

In [74]:
# Rows where the first Python prediction differs from the first Java one;
# keep the first and second Python predictions of those rows.
first_pred_mismatch = df_compare[df_compare.pred_1_PY != df_compare.pred_1_JAVA]
pred1 = first_pred_mismatch.pred_1_PY.to_list()
pred2 = first_pred_mismatch.pred_2_PY.to_list()

In [2]:
import numpy as np

In [11]:
df_compare[(df_compare.pred_1_PY != df_compare.pred_1_JAVA)].shape

(49, 11)

In [13]:
df_compare[(df_compare.pred_1_PY != df_compare.pred_1_JAVA)].shape

(49, 11)

In [17]:
df_compare[(df_compare.pred_2_PY != df_compare.pred_2_JAVA) & (df_compare.pred_1_PY == df_compare.pred_1_JAVA)].shape

(24, 11)

In [19]:
df_compare[(abs(df_compare.prob_1_PY - df_compare.prob_1_JAVA)> 0.00001)].shape

(0, 11)

In [20]:
df_compare.shape

(32853, 11)