In [1]:
import pandas as pd
from ftfy import fix_text
import re
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
pd.set_option('display.max_colwidth', None)
df = pd.read_csv('./test.csv')

In [3]:
df1 = pd.read_csv('./gem25proct-prompt_1.csv')
df2 = pd.read_csv('./gem25proct-prompt_2.csv')
df3 = pd.read_csv('./gem25proct-prompt_3.csv')
df4 = pd.read_csv('./gem25proct-prompt_4.csv')
df = pd.concat([df1, df2, df3, df4], ignore_index=True)
len(df)
df.to_csv('./gem25proct-prompt.csv', index=False)

In [14]:
def split_predict_column_method1(df):
    df_copy = df.copy()
    def extract_json(text):
        match = re.search(r'{[\s\S]*?}', text)
        if match:
            return json.loads(match.group(0))
        else:
            return None
    if df_copy['predict'].dtype == 'object':
        df_copy['predict'] = df_copy['predict'].apply(lambda x: extract_json(x) if isinstance(x, str) else x)
    json_df = pd.json_normalize(df_copy['predict'])
    result_df = pd.concat([df_copy.drop('predict', axis=1), json_df], axis=1)
    return result_df

df = pd.read_csv('./gem25proct-prompt.csv')
df = df.dropna()
df['predict'] = df['predict'].str.strip()
df = split_predict_column_method1(df)

In [15]:
from scipy.stats import entropy
def calculate_negative_entropy(row):
    """
    Tính negative entropy cho một hàng dữ liệu
    """
    # Lấy các cột xác suất
    prob_cols = [
        'Opposite_meaning_probability',
        'Misrepresentation_probability', 
        'Related_but_unverifiable_probability',
        'Entailment_probability',
        'Entity_error_probability',
        'Unrelated_and_unverifiable_probability',
        'Numeric_error_probability',
        'Missing_information_probability'
    ]
    
    # Lấy giá trị xác suất và chuyển đổi kiểu dữ liệu
    try:
        probabilities = []
        for col in prob_cols:
            if col in row.index:
                val = row[col]
                if pd.notna(val):  # Kiểm tra không phải NaN
                    val = float(val)  # Chuyển đổi sang float
                    if val > 0:  # Chỉ lấy giá trị > 0
                        probabilities.append(val)
        
        probabilities = np.array(probabilities, dtype=float)
        
        if len(probabilities) == 0 or np.sum(probabilities) == 0:
            return 0.0
        
        # Tính entropy thủ công để tránh lỗi
        ent = np.sum(probabilities * np.log2(probabilities))  # Thêm epsilon để tránh log(0)
        
        # Trả về negative entropy
        return ent
        
    except Exception as e:
        print(f"Lỗi tính entropy cho hàng: {e}")
        return 0.0
# Tính negative entropy cho tất cả các hàng
df['negative_entropy'] = df.apply(calculate_negative_entropy, axis=1)
df = df.drop(columns=['Opposite_meaning_probability','Misrepresentation_probability','Related_but_unverifiable_probability','Entailment_probability','Entity_error_probability','Unrelated_and_unverifiable_probability','Numeric_error_probability','Missing_information_probability'], axis=1, errors='ignore')
df['negative_entropy'].describe()

count    1000.000000
mean       -0.394148
std         0.321615
min        -1.800958
25%        -0.568996
50%        -0.362493
75%         0.000000
max         0.000000
Name: negative_entropy, dtype: float64

In [None]:
n = 1
top_n_df = df.groupby('answer', group_keys=False) \
             .apply(lambda x: x.nlargest(n, 'negative_entropy'))
top_n_df

In [16]:
df = df.rename(columns={'answer': 'Label'})

# Thay thế các giá trị trong cột 'Label' bằng dạng viết tắt
df['Label'] = df['Label'].replace({
    'Misrepresentation': 'misinter', 
    'Opposite meaning': 'negat',
    'Related but unverifiable': 'relunvef', 
    'Entailment': 'entail', 
    'Entity error': 'entierr',
    'Unrelated and unverifiable': 'unrelunvef', 
    'Numeric error': 'numerr',
    'Missing information': 'missinfo'
})
nan = df[df.isna().any(axis=1)]
df = df.drop(columns=['claim_clean', 'reference_clean', 'negative_entropy'], axis=1)
df['Label'].unique()

array(['misinter', 'negat', 'relunvef', 'entail', 'entierr', 'missinfo',
       'numerr', 'unrelunvef', nan], dtype=object)

In [17]:
nan

Unnamed: 0,ID,claim_clean,reference_clean,Label,negative_entropy
999,i_78,Techniques to Address Imbalanced Data: Resampling Methods: Hybrid Sampling: Combines both oversampling and undersampling to balance the dataset .,"Imbalance data are defined as a dataset whose proportion of classes is severely skewed. Classification performance of existing models tends to deteriorate due to class distribution imbalance. In addition, over-representation by majority classes prevents a classifier from paying attention to minority classes, which are generally more interesting. An effective ensemble classification method called RHSBoost has been proposed to address the imbalance classification problem. This classification rule uses random undersampling and ROSE sampling under a boosting scheme. According to the experimental results, RHSBoost appears to be an attractive classification model for imbalance data.\n[9]: The imbalanced data problem occurs when the number of representative instances for classes of interest is much lower than for other classes. The influence of imbalanced data on classification performance has been discussed in some previous research as a challenge to be studied. In this paper, we propose a method to solve the imbalanced data problem by focusing on preprocessing, including: I) sampling techniques (i.e., under-sampling, over-sampling, and hybrid-sampling) and ii) the instance weighting method to increase the number of features in minority classes and to reduce comprehensive coverage in majority classes. The experimental results show that the noisy data is reduced, making a smaller sized dataset, and training time decreases significantly. Moreover, distinct properties of each class are examined effectively. Refined data is used as input for Naive Bayes and support vector machine classifiers for the targets of the training process. The proposed methods are evaluated based on the number of non-geotagged resources that are labeled correctly with their geo-locations. In comparison with previous research, the proposed method achieves accuracy of 84, whereas previous results were 75.",,0.0
741,,,,negat,-0.847585


In [18]:
df.loc[df['ID'] == 'i_78', 'Label'] = 'relunvef'
df = df.dropna()
len(df)

999

In [22]:
df2 = pd.read_csv('./test.csv')
missing_ids = df2['ID'][~df2['ID'].isin(df['ID'])]
missing_ids

Series([], Name: ID, dtype: object)

In [21]:
new_row = pd.DataFrame({'ID': ['s_528'], 'Label': ['negat']})
df = pd.concat([df, new_row], ignore_index=True)
len(df)

1000

In [23]:
df.to_csv('./gem25proct0.csv', index=False)

In [24]:
df['Label'] = df['Label'].replace({
    'misinter': 'contra', 
    'negat': 'contra',
    'relunvef': 'unver', 
    'entail': 'entail', 
    'entierr': 'contra',
    'unrelunvef': 'unver', 
    'numerr': 'contra',
    'missinfo': 'contra'
})
df.to_csv('./gem25proct02.csv', index=False)