# Preprocessing of question-answering pairs

## Description
Explores the answer-score distributions under different filtering and scaling schemes, and writes the resulting dataset to a JSON file at the end.

## Tasks

1. Check answers count for each question

2. Filter answers with blocks of code

3. Check answers count for each question again

4. Score distributions and filter questions with less than 3 answers

5. Score distribution

6. JSON output

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
from os.path import join as opj
import plotly.figure_factory as ff
import numpy as np
from sklearn import preprocessing
import seaborn as sns

In [None]:
# Show the full content of every cell instead of truncating long text columns.
pd.options.display.max_colwidth = None

In [None]:
# Directory the input CSV is read from (machine-specific absolute path).
data_path = r'D:\CQA_RLHF\data\interim'

In [None]:
# Load question_answers.csv from data_path into the working DataFrame.
q_a_df = pd.read_csv(opj(data_path, 'question_answers.csv'))

In [None]:
# Answer score divided by the row's AnswerCount value.
q_a_df['A_Score_norm_ans_count'] = q_a_df['A_Score'].div(q_a_df['AnswerCount'])
# Alternative normalization by view count (disabled):
# q_a_df['A_Score_norm_views_count'] = q_a_df.A_Score / q_a_df.ViewCount

## Answers without blocks of code

In [None]:
def filter_column(df, column_name, text_to_filter, regex=False):
    """Drop the rows whose ``column_name`` text contains ``text_to_filter``.

    Args:
        df: DataFrame to filter; it is left unmodified.
        column_name: Name of the text column to search.
        text_to_filter: Substring (or pattern when ``regex`` is true) that
            marks a row for removal. It is converted to ``str`` first.
        regex: Forwarded to ``Series.str.contains``; ``False`` means a
            literal substring search.

    Returns:
        A new DataFrame holding the rows that did not match. Rows whose
        value is missing count as "no match" and are kept.

    Side effects:
        Prints how many rows were removed.
    """
    matches = df[column_name].str.contains(str(text_to_filter), na=False, regex=regex)
    # Return an independent copy: callers add columns to the result, and doing
    # that on a filtered slice triggers pandas' chained-assignment warning.
    kept = df[~matches].copy()
    print(f"Deleted {len(df) - len(kept)} rows")
    return kept

In [None]:
# Drop the answers whose A_Body contains the literal "</code></pre>"
# (the closing tags of an HTML code block).
q_a_df_no_code = filter_column(q_a_df, 'A_Body', "</code></pre>")

### Count of answers to each question

For each row, attach the number of answers that remain for its question.

In [None]:
# Count the remaining rows per Q_Id, then broadcast that count back onto every row.
answers_per_question = q_a_df_no_code['Q_Id'].value_counts()
q_a_df_no_code['count_available_anaswers'] = q_a_df_no_code['Q_Id'].map(answers_per_question)

In [None]:
# Number of rows whose question has exactly one remaining answer vs. all other rows.
has_single_answer = q_a_df_no_code['count_available_anaswers'] == 1
print('1 answer available:', int(has_single_answer.sum()))
print('More than 1 answer available:', int((~has_single_answer).sum()))

In [None]:
# Scaler shared by the transform cells below; MaxAbsScaler divides each
# feature by its maximum absolute value.
m_a_scaler = preprocessing.MaxAbsScaler()

##### Transform without outliers

Outlier filtering

In [None]:
# def filter_outliers_iqr(data, column):
#     q1, q3 = data[column].quantile([0.15, 0.75])
#     iqr = q3 - q1
#     lower_bound = q1 - (1.5 * iqr)
#     upper_bound = q3 + (1.5 * iqr)
#     return (
#         data[column][(data[column] > lower_bound) & (data[column] < upper_bound)],
#         data[column][data[column] <= lower_bound],
#         data[column][data[column] >= upper_bound],
#     )


# def filter_and_transform(data, column, scaler, new_col_name):
#     filtered_data, lower_outliers, upper_outliers = filter_outliers_iqr(data, column)

#     print(len(filtered_data), len(lower_outliers), len(upper_outliers))

#     filtered_scaled_data = scaler.fit_transform(filtered_data.values.reshape(-1, 1)).reshape(1, -1)[0]
#     filtered_scaled_data = pd.Series(filtered_scaled_data, index=filtered_data.index)

#     transformed_series = pd.concat(
#         [filtered_scaled_data, lower_outliers, upper_outliers], axis=0
#     )
#     transformed_series.name = new_col_name

#     return data.join(transformed_series, how="left", on=data.index)

In [12]:
def filter_outliers_iqr(data, column, lower_q=0.15, upper_q=0.75, whisker=1.5):
    """Split a column into inliers, low outliers and high outliers.

    The bounds are ``q_low - whisker * (q_high - q_low)`` and
    ``q_high + whisker * (q_high - q_low)``, where ``q_low`` / ``q_high`` are
    the ``lower_q`` / ``upper_q`` quantiles of the column.

    Args:
        data: DataFrame holding the column.
        column: Name of the numeric column to split.
        lower_q: Lower quantile. The default is 0.15 (not the textbook 0.25)
            to keep the behaviour existing callers rely on.
        upper_q: Upper quantile.
        whisker: Multiplier applied to the inter-quantile range.

    Returns:
        A tuple ``(inliers, lower_outliers, upper_outliers)`` of Series that
        keep the original index labels. Values lying exactly on a bound are
        outliers. Missing values end up in none of the three.
    """
    values = data[column]
    q_low, q_high = values.quantile([lower_q, upper_q])
    spread = q_high - q_low
    lower_bound = q_low - (whisker * spread)
    upper_bound = q_high + (whisker * spread)

    if spread == 0:
        # Both bounds collapse onto the same value. With inclusive comparisons
        # that value would be reported as a low AND a high outlier while the
        # inlier set stays empty, so treat it as the (only) inlier instead.
        inlier_mask = values == lower_bound
        low_mask = values < lower_bound
        high_mask = values > upper_bound
    else:
        inlier_mask = (values > lower_bound) & (values < upper_bound)
        low_mask = values <= lower_bound
        high_mask = values >= upper_bound

    return values[inlier_mask], values[low_mask], values[high_mask]


def filter_and_transform(data, column, scaler, new_col_name):
    """Add a scaled copy of ``column`` in which outliers are clamped to -1 / 1.

    The column is split with ``filter_outliers_iqr``. Inliers are passed
    through ``scaler.fit_transform``; low outliers become -1 and high
    outliers become 1.

    Args:
        data: Source DataFrame; it is not modified.
        column: Name of the numeric column to transform.
        scaler: Object exposing ``fit_transform`` on an ``(n, 1)`` array
            (it is re-fitted by this call).
        new_col_name: Name of the column that receives the result.

    Returns:
        ``data`` left-joined on its index with the new column. Rows that fall
        in none of the three groups (e.g. missing values) get NaN.

    Side effects:
        Prints the number of inliers, low outliers and high outliers.
    """
    filtered_data, lower_outliers, upper_outliers = filter_outliers_iqr(data, column)

    print(len(filtered_data), len(lower_outliers), len(upper_outliers))

    pieces = []
    if len(filtered_data):
        # Guarded because fitting a scaler on zero samples is an error.
        scaled = scaler.fit_transform(filtered_data.to_numpy().reshape(-1, 1)).ravel()
        pieces.append(pd.Series(scaled, index=filtered_data.index))

    # Build fresh constant Series instead of writing through `.values`, which
    # is a read-only array when pandas Copy-on-Write is active.
    pieces.append(pd.Series(-1.0, index=lower_outliers.index, dtype=float))
    pieces.append(pd.Series(1.0, index=upper_outliers.index, dtype=float))

    transformed_series = pd.concat(pieces, axis=0)
    transformed_series.name = new_col_name

    # Joining on the index is the default; no explicit key array is needed.
    return data.join(transformed_series, how="left")

In [None]:
# Single-pass variant: all inliers are scaled together.
# NOTE(review): the `filter_and_transform_sep` cell further down writes the same
# output column name, so presumably only one of the two cells is meant to be run.
q_a_df_no_code = filter_and_transform(q_a_df_no_code, 'A_Score_norm_ans_count', m_a_scaler, 'A_Score_norm_ans_count_max_abs')

In [15]:
def filter_and_transform_sep(data, column, scaler, new_col_name):
    """Like ``filter_and_transform``, but scale each sign of inliers separately.

    The column is split with ``filter_outliers_iqr``. Non-negative inliers and
    negative inliers are each passed through ``scaler.fit_transform`` on their
    own; low outliers become -1 and high outliers become 1.

    Args:
        data: Source DataFrame; it is not modified.
        column: Name of the numeric column to transform.
        scaler: Object exposing ``fit_transform`` on an ``(n, 1)`` array. It
            is re-fitted once per non-empty group, so afterwards it holds the
            fit of the last group processed.
        new_col_name: Name of the column that receives the result.

    Returns:
        ``data`` left-joined on its index with the new column. Rows that fall
        in none of the groups (e.g. missing values) get NaN.

    Side effects:
        Prints the number of inliers, low outliers and high outliers.
    """
    filtered_data, lower_outliers, upper_outliers = filter_outliers_iqr(data, column)

    print(len(filtered_data), len(lower_outliers), len(upper_outliers))

    non_negative = filtered_data[filtered_data >= 0]
    negative = filtered_data[filtered_data < 0]

    pieces = []
    for group in (non_negative, negative):
        if group.empty:
            # One sign may be absent among the inliers; fitting a scaler on
            # zero samples is an error, so skip it.
            continue
        scaled = scaler.fit_transform(group.to_numpy().reshape(-1, 1)).ravel()
        pieces.append(pd.Series(scaled, index=group.index))

    # Build fresh constant Series instead of writing through `.values`, which
    # is a read-only array when pandas Copy-on-Write is active.
    pieces.append(pd.Series(-1.0, index=lower_outliers.index, dtype=float))
    pieces.append(pd.Series(1.0, index=upper_outliers.index, dtype=float))

    transformed_series = pd.concat(pieces, axis=0)
    transformed_series.name = new_col_name

    # Joining on the index is the default; no explicit key array is needed.
    return data.join(transformed_series, how="left")

In [16]:
# Sign-separated variant: non-negative and negative inliers are scaled independently.
# The call prints the inlier / low-outlier / high-outlier counts.
q_a_df_no_code = filter_and_transform_sep(q_a_df_no_code, 'A_Score_norm_ans_count', m_a_scaler, 'A_Score_norm_ans_count_max_abs')

60211 128 5514


Set the score of accepted answers to 1

In [17]:
# Index labels of the rows whose A_Id occurs among the AcceptedAnswerId values.
accepted_ids = q_a_df_no_code["AcceptedAnswerId"].unique()
is_accepted_row = q_a_df_no_code["A_Id"].isin(accepted_ids)
accepted_answers_indexes = q_a_df_no_code.index[is_accepted_row]

In [None]:
# q_a_df_no_code.loc[
#     accepted_answers_indexes, ["A_Score_norm_ans_count_max_abs"]
# ] = 1

In [None]:
# Squash both score variants through tanh, storing each result as "<source>_tanh".
for source_col in ("A_Score_norm_ans_count_max_abs", "A_Score_norm_ans_count"):
    q_a_df_no_code[f"{source_col}_tanh"] = np.tanh(q_a_df_no_code[source_col])

In [None]:
# Accepted answers get a fixed 1.2 in both tanh columns, i.e. above tanh's upper limit of 1.
tanh_score_columns = ["A_Score_norm_ans_count_max_abs_tanh", "A_Score_norm_ans_count_tanh"]
q_a_df_no_code.loc[accepted_answers_indexes, tanh_score_columns] = 1.2

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One KDE panel per score variant, side by side, in this left-to-right order.
kde_columns = [
    "A_Score_norm_ans_count",
    "A_Score_norm_ans_count_max_abs_tanh",
    "A_Score_norm_ans_count_max_abs",
    "A_Score_norm_ans_count_tanh",
]
fig, axs = plt.subplots(1, 4, figsize=(20, 6))

for panel, score_col in zip(axs, kde_columns):
    sns.kdeplot(data=q_a_df_no_code[score_col], ax=panel)

plt.tight_layout()
plt.show()

KDE overview

## Use $log_2$ for scores

In [None]:
import math

In [None]:
# Same computation as the earlier accepted_answers_indexes cell, repeated here:
# index labels of the rows whose A_Id occurs among the AcceptedAnswerId values.
accepted_answers_indexes = q_a_df_no_code[
    q_a_df_no_code["A_Id"].isin(q_a_df_no_code["AcceptedAnswerId"].unique())
].index

In [None]:
# Scratch check: log2(1) evaluates to 0.0.
math.log2(1)

## Saving data as JSON

In [None]:
# Log-compressed integer score per row: round(ln(score + 1)) for scores >= 0,
# and -1 for everything else (negative scores, or values that fail the >= 0 test).
# NOTE(review): the section heading mentions log2, but np.log is the natural
# logarithm — confirm which base is intended.
log_scores = [
    round(np.log(score + 1)) if score >= 0 else -1
    for score in q_a_df_no_code['A_Score']
]

In [None]:
# One entry was appended per A_Score value, so this should match the row count.
len(log_scores)

In [None]:
# Attach the list as a column (assigned positionally, in row order).
q_a_df_no_code['Log_scores'] = log_scores

In [None]:
# Accepted answers get a +1 bonus on top of the log score. The flag is computed
# inline: the 'AcceptedAnswer' column is only created in a later cell, so
# reading it here raises a KeyError when the notebook is run top to bottom.
is_accepted_answer = q_a_df_no_code['AcceptedAnswerId'] == q_a_df_no_code['A_Id']
q_a_df_no_code['Log_scores'] = q_a_df_no_code['Log_scores'] + is_accepted_answer.astype(int)

In [18]:
import json
import re

In [None]:
# Directory the JSON dataset is written to (machine-specific absolute path).
save_path = r'D:\CQA_RLHF\data\processed\log_score'

In [19]:
# Boolean flag: True where this row's answer id equals the question's accepted-answer id.
q_a_df_no_code['AcceptedAnswer'] = q_a_df_no_code['AcceptedAnswerId'].eq(q_a_df_no_code['A_Id'])

In [20]:
# Columns that go into the exported dataset. Take an explicit copy: the cells
# below overwrite and add columns on `data`, and doing that on a column slice
# of q_a_df_no_code triggers pandas' chained-assignment warning.
data = q_a_df_no_code[['Q_CreationDate', 'Q_Title', 'Q_Body', 'A_Body', 'A_Score_norm_ans_count_max_abs', 'AcceptedAnswer', 'count_available_anaswers']].copy()
# Alternative export using the tanh-squashed score (disabled):
# data = q_a_df_no_code[['Q_CreationDate', 'Q_Title', 'Q_Body', 'A_Body', 'A_Score_norm_ans_count_max_abs_tanh', 'AcceptedAnswer', 'count_available_anaswers']]

In [None]:
# Replace the HTML in each text column with its plain text, trimmed of surrounding whitespace.
for text_col in ('Q_Title', 'Q_Body', 'A_Body'):
    data[text_col] = data[text_col].apply(
        lambda html: BeautifulSoup(html, 'html.parser').get_text().strip()
    )

In [21]:
# Export names for the dataset columns (source name -> published name).
column_renames = {
    "Q_Title": "Title",
    "Q_Body": "Question",
    "A_Body": "Answer",
    # "A_Score_norm_ans_count_max_abs_tanh": "Score",  # tanh variant, disabled
    'A_Score_norm_ans_count_max_abs': "Score",
    "AcceptedAnswer": "Is_accepted",
    "count_available_anaswers": "N_answers",
}
data = data.rename(columns=column_renames)

In [22]:
# Oldest questions first (ascending is the default sort order).
data = data.sort_values(by='Q_CreationDate')

In [None]:
# Assign a group number per distinct Q_CreationDate and use it as the question id.
# NOTE(review): rows sharing a creation timestamp get the same id — this assumes
# Q_CreationDate uniquely identifies a question; confirm.
data['Q_Id'] = data.groupby('Q_CreationDate').ngroup()

In [None]:
# Preview the first row of the export frame.
data.head(1)

In [None]:
# Write the full export frame as CSV, without the index column.
# NOTE(review): the path and file name say "tanh_score", but the Score column
# selected above is 'A_Score_norm_ans_count_max_abs' (the tanh variant is
# commented out) — confirm the intended destination.
data.to_csv(r'D:\CQA_RLHF\data\processed\tanh_score\1.0-all-data-tanh_score.csv', index=False)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Positional split of the date-sorted frame: the last 2000 rows are the test
# set, the 1000 rows before them the validation set, everything earlier is train.
# NOTE(review): the split is by row, so answers to one question may land in
# different splits — confirm this is acceptable.
train_df = data[:-3000]
val_df = data[-3000:-2000]
test_df = data[-2000:]
# Convert each set to a list of row dictionaries
train_dict = train_df.to_dict(orient='records')
val_dict = val_df.to_dict(orient='records')
test_dict = test_df.to_dict(orient='records')

# Combine the three record lists into one dictionary keyed by split name
data_to_save = {'train': train_dict, 'val': val_dict, 'test': test_dict}

# Save the dictionary of splits to a JSON file
with open(opj(save_path, '1.0-data-div-ans-sep.json'), 'w') as f:
    json.dump(data_to_save, f)


In [None]:
# Read the JSON file that was just written back into memory.
with open(opj(save_path, '1.0-data-div-ans-sep.json'), 'r') as f:
    pairs = json.load(f)