In [15]:
import pandas as pd
import numpy as np
import json

from sklearn.decomposition import PCA

from tqdm.notebook import tqdm
from scipy.spatial.distance import cdist

import openai

from sklearn.metrics import (
    accuracy_score, 
    f1_score, 
    recall_score,
    precision_score,
    roc_auc_score, confusion_matrix, roc_curve, classification_report,
    precision_recall_curve, PrecisionRecallDisplay,
    confusion_matrix
)

from sklearn.model_selection import KFold

import pickle

In [16]:
with open('../secrets.json') as f:
    secrets = json.load(f)

In [17]:
openai.api_key = secrets['chatgpt_api_key']

# Load data

In [18]:
with open('../datasets/used_data/03_bert_like_models/02_topics.npy', 'rb') as f:
    df_topics = pd.DataFrame(np.load(f), columns=['topic'])
df_topics.shape

(6529, 1)

In [19]:
with open('../datasets/used_data/02_classical_ml/03_04_statements_herbert.npy', 'rb') as f:
    df_herbert = pd.DataFrame(np.load(f))
df_herbert.shape

(6529, 1024)

In [20]:
df = pd.read_parquet('../datasets/used_data/03_bert_like_models/01_basic_info_data.parquet')

df['labels'] = df['assestment'].astype(int)

df = df.sample(frac=1, random_state=111)

df = df[df.index.isin(df_topics.index)]
df.shape

(6529, 4)

# Process

In [21]:
cv_fold = []

for i in df_topics['topic'].unique().reshape(10,-1):
    train_cv = df_topics.index[ ~np.isin(df_topics["topic"], i) ].values
    test_cv = df_topics.index[ np.isin(df_topics["topic"], i) ].values
    
    cv_fold.append( [train_cv, test_cv])

In [22]:
kf = KFold(n_splits=10, shuffle=True, random_state=111)
kf.get_n_splits(df)

cv_Kfold = []

for train_index, test_index in kf.split(df_topics):
    train_cv = df_topics.iloc[ train_index, : ].index.values
    test_cv = df_topics.iloc[ test_index, : ].index.values

    cv_Kfold.append( [train_cv, test_cv])

# Example

## Run PCA on train data -> transform test

In [23]:
train_cv, test_cv = cv_fold[0]

In [24]:
# find n components allowing to have > 0.9 variance
n_com_list = []
for train_cv, test_cv in tqdm(cv_fold):
    data = df_herbert.loc[train_cv, :]
    
    pca = PCA()
    pca.fit(data)
    
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
    
    n_comp = (cumulative_variance < 0.9).sum() + 1
    
    n_com_list.append(n_comp)

  0%|          | 0/10 [00:00<?, ?it/s]

In [25]:
n_com_list

[171, 171, 171, 172, 171, 171, 171, 171, 171, 171]

In [26]:
def get_2_nearest(example, space, verbose=False):
    distances = cdist(example, space, 'euclidean')

    nearest_indices = np.argsort(distances)[0][:2].tolist()
    if verbose:
        print("Indices of the three nearest neighbors:", nearest_indices)
        print("Distances to the three nearest neighbors:", distances[0, nearest_indices])

    return nearest_indices, distances[0, nearest_indices]

In [55]:
#test
# prompt = 'Zaklasyfikuj poniższe zdanie jako prawdziwe (True) lub fałszywe (Fake), na podstawie podanych przykładów, odpowiedz uzywajac jednego słowa - True/Fake:'
# train_cv, test_cv = cv_fold[0]

In [26]:
# data_train = df_herbert.loc[train_cv, :]
# data_test = df_herbert.loc[test_cv, :]

# df_train = df.loc[train_cv, :]
# df_test = df.loc[test_cv, :]

# pca = PCA(n_components = 171)
# pca.fit(data_train)

# data_train_pca = pd.DataFrame(pca.transform(data_train), index=train_cv)
# data_test_pca = pd.DataFrame(pca.transform(data_test), index=test_cv)

# data_train_pca_fake = data_train_pca[df_train['assestment']==1]
# data_train_pca_true = data_train_pca[df_train['assestment']==0]

# df_train_fake = df_train[df_train['assestment']==1]
# df_train_true = df_train[df_train['assestment']==0]

In [52]:
# t_i = 0

# example = data_test_pca.iloc[t_i,:].values.reshape(1,-1)

# nearest_fake, dist_fake = get_2_nearest(example, data_train_pca_fake)
# nearest_true, dist_true = get_2_nearest(example, data_train_pca_true)

# nearest = pd.DataFrame({
#     'index' : nearest_fake + nearest_true,
#     'label' : [1, 1, 0 , 0],
#     'dist' : dist_fake.reshape(-1).tolist() + dist_true.reshape(-1).tolist(),
#     'text' : df_train_fake.iloc[nearest_fake, :]['text'].values.tolist() + df_train_true.iloc[nearest_true,:]['text'].values.tolist()
# }
# )

# nearest = nearest.sort_values('dist').head(3)

# nearest

In [53]:
# messages = [ {"role": "system", "content": prompt} ]

# for l, t in zip(nearest['label'].values, nearest['text'].values):
#     fake_true = 'True'
#     if l == 1:
#         fake_true = 'Fake'
#     messages.append( 
#         {"role": "user", "content": f'"{t} : {fake_true}"'}, 
#     ) 

# messages.append( 
#     {"role": "user", "content": f'"{df_test["text"].values[t_i]}" : '}, 
# ) 

# messages

In [54]:
# chat = openai.ChatCompletion.create( 
#     model="gpt-3.5-turbo", messages=messages 
# ) 

# reply = chat.choices[0].message.content 

# y_zero_shot_test.append(reply)

In [27]:
y_few_shot_test = []

In [82]:
# y_few_shot_test.append(y_few_shot_test_i)

In [28]:
prompt = 'Zaklasyfikuj poniższe zdanie jako prawdziwe (True) lub fałszywe (Fake), na podstawie podanych przykładów, odpowiedz uzywajac jednego słowa - True/Fake:'

In [84]:
for train_cv, test_cv in tqdm(cv_fold[1:]):
    data_train = df_herbert.loc[train_cv, :]
    data_test = df_herbert.loc[test_cv, :]
    
    df_train = df.loc[train_cv, :]
    df_test = df.loc[test_cv, :]
    
    pca = PCA(n_components = 171)
    pca.fit(data_train)
    
    data_train_pca = pd.DataFrame(pca.transform(data_train), index=train_cv)
    data_test_pca = pd.DataFrame(pca.transform(data_test), index=test_cv)
    
    data_train_pca_fake = data_train_pca[df_train['assestment']==1]
    data_train_pca_true = data_train_pca[df_train['assestment']==0]
    
    df_train_fake = df_train[df_train['assestment']==1]
    df_train_true = df_train[df_train['assestment']==0]

    y_few_shot_test_i = []
    
    for t_i in tqdm(range(df_test.shape[0]), total=df_test.shape[0]):
        example = data_test_pca.iloc[t_i,:].values.reshape(1,-1)
        
        nearest_fake, dist_fake = get_2_nearest(example, data_train_pca_fake)
        nearest_true, dist_true = get_2_nearest(example, data_train_pca_true)
        
        nearest = pd.DataFrame({
            'index' : nearest_fake + nearest_true,
            'label' : [1, 1, 0 , 0],
            'dist' : dist_fake.reshape(-1).tolist() + dist_true.reshape(-1).tolist(),
            'text' : df_train_fake.iloc[nearest_fake, :]['text'].values.tolist() + df_train_true.iloc[nearest_true,:]['text'].values.tolist()
        }
        )
        
        nearest = nearest.sort_values('dist').head(3)

        
        messages = [ {"role": "system", "content": prompt} ]
        
        for l, t in zip(nearest['label'].values, nearest['text'].values):
            fake_true = 'True'
            if l == 1:
                fake_true = 'Fake'
            messages.append( 
                {"role": "user", "content": f'"{t} : {fake_true}"'}, 
            ) 
        
        messages.append( 
            {"role": "user", "content": f'"{df_test["text"].values[t_i]}" : '}, 
        ) 

        chat = openai.ChatCompletion.create( 
            model="gpt-3.5-turbo", messages=messages 
        ) 
          
        reply = chat.choices[0].message.content 
        
        y_few_shot_test_i.append(reply)

    y_few_shot_test.append(y_few_shot_test_i)

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/652 [00:00<?, ?it/s]

  0%|          | 0/623 [00:00<?, ?it/s]

  0%|          | 0/905 [00:00<?, ?it/s]

  0%|          | 0/632 [00:00<?, ?it/s]

  0%|          | 0/632 [00:00<?, ?it/s]

  0%|          | 0/548 [00:00<?, ?it/s]

  0%|          | 0/656 [00:00<?, ?it/s]

  0%|          | 0/668 [00:00<?, ?it/s]

  0%|          | 0/590 [00:00<?, ?it/s]

In [86]:
with open('../datasets/used_data/04_decoder_model/02_few_shot_chatgpt_labels_topics.pkl', "wb") as output:
    pickle.dump(y_few_shot_test, output, pickle.HIGHEST_PROTOCOL);

In [102]:
_, test = cv_fold[0]

In [103]:
test.shape

(623,)

In [105]:
len(y_few_shot_test[0])

623

In [123]:
acc = []
f1 = []

for (train_cv, test_cv), y_pred in tqdm(zip(cv_fold, y_few_shot_test)):
    
    df_train = df.loc[train_cv, :]
    df_test = df.loc[test_cv, :]

    print(np.unique(y_pred))

    y_test = df_test['assestment']

    if 'Nie mogę zaklasyfikować tego zdania jako prawdziwe (True) lub fałszywe (Fake), ponieważ nie posiadam wystarczających informacji, które by mi to umożliwiły.' in y_pred:
        print(
            df_test[np.array(y_pred)=='Nie mogę zaklasyfikować tego zdania jako prawdziwe (True) lub fałszywe (Fake), ponieważ nie posiadam wystarczających informacji, które by mi to umożliwiły.'
            ].values
        )

    y_pred = pd.Series(y_pred).isin(['Fake', 'Fake.','False']).astype(int)

    acc.append( accuracy_score(y_test, y_pred) )
    f1.append( f1_score(y_test, y_pred) )

0it [00:00, ?it/s]

['Fake' 'False' 'True']
['Fake' 'False' 'True']
['Fake' 'False' 'True']
['Fake' 'False' 'True']
['Fake' 'False' 'True']
['Fake' 'Fake.' 'False'
 'Nie mogę zaklasyfikować tego zdania jako prawdziwe (True) lub fałszywe (Fake), ponieważ nie posiadam wystarczających informacji, które by mi to umożliwiły.'
 'True' 'True.']
[[1 'A w jakim czasie taka dokumentacja powinna być wydana?'
  'A w jakim czasie taka dokumentacja powinna być wydana?' 1]]
['Fake' 'False' 'True']
['Fake' 'False' 'True']
['Fake' 'False' 'True']
['Fake' 'False' 'True']


In [114]:
print( f"{np.mean(acc):0.2f} +- {np.std(acc):0.2f}" )

0.59 +- 0.02


In [115]:
print( f"{np.mean(f1):0.2f} +- {np.std(f1):0.2f}" )

0.56 +- 0.04


In [29]:
y_few_shot_test_random = []

In [30]:
for train_cv, test_cv in tqdm(cv_Kfold):
    data_train = df_herbert.loc[train_cv, :]
    data_test = df_herbert.loc[test_cv, :]
    
    df_train = df.loc[train_cv, :]
    df_test = df.loc[test_cv, :]
    
    pca = PCA(n_components = 171)
    pca.fit(data_train)
    
    data_train_pca = pd.DataFrame(pca.transform(data_train), index=train_cv)
    data_test_pca = pd.DataFrame(pca.transform(data_test), index=test_cv)
    
    data_train_pca_fake = data_train_pca[df_train['assestment']==1]
    data_train_pca_true = data_train_pca[df_train['assestment']==0]
    
    df_train_fake = df_train[df_train['assestment']==1]
    df_train_true = df_train[df_train['assestment']==0]

    y_few_shot_test_i = []
    
    for t_i in tqdm(range(df_test.shape[0]), total=df_test.shape[0]):
        example = data_test_pca.iloc[t_i,:].values.reshape(1,-1)
        
        nearest_fake, dist_fake = get_2_nearest(example, data_train_pca_fake)
        nearest_true, dist_true = get_2_nearest(example, data_train_pca_true)
        
        nearest = pd.DataFrame({
            'index' : nearest_fake + nearest_true,
            'label' : [1, 1, 0 , 0],
            'dist' : dist_fake.reshape(-1).tolist() + dist_true.reshape(-1).tolist(),
            'text' : df_train_fake.iloc[nearest_fake, :]['text'].values.tolist() + df_train_true.iloc[nearest_true,:]['text'].values.tolist()
        }
        )
        
        nearest = nearest.sort_values('dist').head(3)

        
        messages = [ {"role": "system", "content": prompt} ]
        
        for l, t in zip(nearest['label'].values, nearest['text'].values):
            fake_true = 'True'
            if l == 1:
                fake_true = 'Fake'
            messages.append( 
                {"role": "user", "content": f'"{t} : {fake_true}"'}, 
            ) 
        
        messages.append( 
            {"role": "user", "content": f'"{df_test["text"].values[t_i]}" : '}, 
        ) 

        chat = openai.ChatCompletion.create( 
            model="gpt-3.5-turbo", messages=messages 
        ) 
          
        reply = chat.choices[0].message.content 
        
        y_few_shot_test_i.append(reply)

    y_few_shot_test_random.append(y_few_shot_test_i)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/653 [00:00<?, ?it/s]

  0%|          | 0/653 [00:00<?, ?it/s]

  0%|          | 0/653 [00:00<?, ?it/s]

  0%|          | 0/653 [00:00<?, ?it/s]

  0%|          | 0/653 [00:00<?, ?it/s]

  0%|          | 0/653 [00:00<?, ?it/s]

  0%|          | 0/653 [00:00<?, ?it/s]

  0%|          | 0/653 [00:00<?, ?it/s]

  0%|          | 0/653 [00:00<?, ?it/s]

  0%|          | 0/652 [00:00<?, ?it/s]

In [None]:
cant = [
    'Nie mogę',
]

In [55]:
acc = []
f1 = []

for (train_cv, test_cv), y_pred_str in tqdm(zip(cv_Kfold, y_few_shot_test_random)):
    
    df_train = df.loc[train_cv, :]
    df_test = df.loc[test_cv, :]

    print(np.unique(y_pred_str))

    y_test = df_test['assestment']

    if pd.Series(y_pred_str).str.contains('Nie mogę').shape[0] > 0:
        print(
            df_test[pd.Series(y_pred_str).str.contains('Nie mogę').values
            ].values
        )

    y_pred = pd.Series(y_pred_str).isin(['Fake', 'Fake.', 'False', 'Fejk' ]).astype(int).values
    y_pred[pd.Series(y_pred_str).str.contains('Nie mogę').values] = -1

    y_test = y_test[y_pred>-1].values
    y_pred = y_pred[y_pred>-1]

    acc.append( accuracy_score(y_test, y_pred) )
    f1.append( f1_score(y_test, y_pred) )

0it [00:00, ?it/s]

['Fake' 'False' 'True']
[]
['Fake' 'False' 'True']
[]
['Fake' 'Fake.' 'False' 'Fejk' 'True']
[]
['Fake' 'False' 'True']
[]
['Fake' 'False' 'True']
[]
['Fake' 'False' 'True']
[]
['Fake' 'Fake.' 'False'
 'Nie mogę odpowiedzieć na podstawie podanego zdania, ponieważ nie jest poprawnie przekształcone na zdanie zdawkowe. Proszę podać pełne zdanie w formie zdawkowej, a następnie podać odpowiedź jako True lub False.'
 'True']
[[1
  'Monika Olejnik: Pani minister, a jaki procent rodziców zrezygnowało z nauczania obowiązkowego od szóstego roku życia, bo mogło zrezygnować na podstawie zaświadczeń.'
  'Monika Olejnik: Pani minister, a jaki procent rodziców zrezygnowało z nauczania obowiązkowego od szóstego roku życia, bo mogło zrezygnować na podstawie zaświadczeń.'
  1]]
['Fake' 'Fake.' 'False'
 'Nie mogę ocenić prawdziwości tego zdania bez dodatkowych informacji.'
 'True']
[[1 '[pakiet onkologiczny]' 'pakiet onkologiczny' 1]]
['Fake' 'False' 'True']
[]
['Fake' 'False'
 'Nie mogę odpowiedz

In [56]:
print( f"{np.mean(acc):0.2f} +- {np.std(acc):0.2f}" )

0.59 +- 0.03


In [57]:
print( f"{np.mean(f1):0.2f} +- {np.std(f1):0.2f}" )

0.57 +- 0.03


In [60]:
df[df['text'].str.contains('pakiet onko')]

Unnamed: 0,assestment,text,text_clean,labels
3282,1,[pakiet onkologiczny],pakiet onkologiczny,1


In [87]:
# y_pred = pd.Series(y_few_shot_test_i).isin(['Fake','False']).astype(int)

In [88]:
# y_test = df_test['assestment']

In [89]:
# accuracy_score(y_test, y_pred) 

In [90]:
# f1_score(y_test, y_pred) 

In [91]:
# confusion_matrix(y_test, y_pred) 

In [92]:
# precision, recall, _ = precision_recall_curve(y_test, y_pred)
# disp = PrecisionRecallDisplay(precision=precision, recall=recall)
# disp.plot()

In [11]:
# example = data_test_pca.iloc[0,:].values.reshape(1,-1)
# nearest_fake = get_2_nearest(example, data_train_pca_fake)
# nearest_true = get_2_nearest(example, data_train_pca_true)

In [12]:
# df_test[['assestment', 'text']].head(1).values

In [13]:
# df_train_fake[['assestment', 'text']].iloc[nearest_fake, :].values

In [14]:
# df_train_true[['assestment', 'text']].iloc[nearest_true, :].values