# 3. Create folds for validation and test set

In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold

## Validation set

In [None]:
# Validation

# Target layout: `folds` folds of `fold_size` sentences each.
folds = 5
fold_size = 150
total_rows = fold_size * folds

sentences = pd.read_csv(r".\Intermediate results\sentences_val.csv", sep='\t')

# Split the rows by whether 'manual_sentence_labels' is missing.
label_missing = sentences['manual_sentence_labels'].isna()
nan_rows = sentences.loc[label_missing]
# Rows that do carry a label (not used again in the visible part of this file).
int_rows = sentences.loc[~label_missing]

# Number of surplus rows beyond the target size. They are sampled exclusively
# from the unlabelled rows, so every labelled row is kept. The fixed seed makes
# the selection reproducible.
rows_to_remove = len(sentences) - total_rows
rows_to_drop = nan_rows.sample(n=rows_to_remove, random_state=42)
sentences_reduced = sentences.drop(index=rows_to_drop.index)

# Missing labels are encoded as -1 so that they act as a class of their own
# when the data is stratified.
sentences_reduced['manual_sentence_labels'] = sentences_reduced['manual_sentence_labels'].fillna(-1)

# StratifiedKFold to create stratified samples (shuffled, fixed seed for reproducibility)
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
stratified_samples = []

# Generate the stratified samples: the first element of every split is ignored
# and only the second index array is used to select the rows of one sample.
for _, sample_index in skf.split(sentences_reduced, sentences_reduced['manual_sentence_labels']):
    # Take an explicit copy. Assigning a column on the bare .iloc selection
    # raises pandas' SettingWithCopyWarning, and the same warning would fire
    # again whenever a column is later added to the sample.
    stratified_sample = sentences_reduced.iloc[sample_index].copy()
    # Turn the -1 placeholder back into NaN now that stratification is done.
    stratified_sample['manual_sentence_labels'] = stratified_sample['manual_sentence_labels'].replace(-1, np.nan)
    stratified_samples.append(stratified_sample)

# Tabulate the label distribution of every sample: one Series per sample,
# named after the sample's position, with NaN counted as its own value.
per_sample_counts = [
    sample['manual_sentence_labels'].value_counts(dropna=False).rename(i)
    for i, sample in enumerate(stratified_samples)
]

# Put the distributions side by side (one column per sample).
class_counts = pd.DataFrame()
for counts in per_sample_counts:
    class_counts = pd.concat([class_counts, counts], axis=1)

# For every label value find the sample with the largest count, then pick the
# sample that comes out on top for the most label values.
best_sample_per_label = class_counts.idxmax(axis=1)
examples_index = best_sample_per_label.value_counts().idxmax()

# That sample becomes the "examples" set.
examples = stratified_samples[examples_index]

# Every sample except the "examples" one, in their original order.
remaining_samples = [
    frame for position, frame in enumerate(stratified_samples) if position != examples_index
]

# Name the remaining samples "fold1" .. "fold4" (there are folds - 1 of them).
folds_dict = {
    f"fold{number}": frame for number, frame in enumerate(remaining_samples, start=1)
}

# Collect everything in one mapping, with "examples" first.
output_dfs = {"examples": examples, **folds_dict}

# Quick look at the first three frames. NOTE(review): in a notebook only the
# last expression of a cell is rendered, so this line likely shows nothing.
output_dfs["examples"], output_dfs["fold1"], output_dfs["fold2"]

# Keys of output_dfs, in the order the frames are stacked below
fold_names = ["examples", "fold1", "fold2", "fold3", "fold4"]

# Tag every row with the name of the frame it comes from
for name in fold_names:
    output_dfs[name]['fold'] = name

# Stack all frames into a single dataframe with a fresh 0..n-1 index
merged_df = pd.concat([output_dfs[name] for name in fold_names], ignore_index=True)

# Final naming: the "examples" frame becomes fold1 and every other frame moves
# up by one, so the saved file contains fold1 .. fold5.
final_fold_names = {
    'examples': 'fold1',
    'fold1': 'fold2',
    'fold2': 'fold3',
    'fold3': 'fold4',
    'fold4': 'fold5',
}
merged_df['fold'] = merged_df['fold'].replace(final_fold_names)

# Sanity check: overall label distribution (NaN included) after merging
print(merged_df['manual_sentence_labels'].value_counts(dropna=False))

# Write next to the input file. The previous path ('-\Intemrediate results')
# was mistyped and pointed at a different folder than the one the sentences
# are read from.
merged_df.to_csv(r'.\Intermediate results\folds_val.csv', sep='\t')


## Test set

In [49]:
# Test

# Target layout: `folds` folds of `fold_size` sentences each.
folds = 5
fold_size = 150
total_rows = fold_size * folds

sentences = pd.read_csv(r".\Intermediate results\sentences_test.csv", sep='\t')

# Split the rows by whether 'manual_sentence_labels' is missing.
label_missing = sentences['manual_sentence_labels'].isna()
nan_rows = sentences.loc[label_missing]
# Rows that do carry a label (not used again in the visible part of this file).
int_rows = sentences.loc[~label_missing]

# Number of surplus rows beyond the target size. They are sampled exclusively
# from the unlabelled rows, so every labelled row is kept. The fixed seed makes
# the selection reproducible.
rows_to_remove = len(sentences) - total_rows
rows_to_drop = nan_rows.sample(n=rows_to_remove, random_state=42)
sentences_reduced = sentences.drop(index=rows_to_drop.index)

# Missing labels are encoded as -1 so that they act as a class of their own
# when the data is stratified.
sentences_reduced['manual_sentence_labels'] = sentences_reduced['manual_sentence_labels'].fillna(-1)

# StratifiedKFold to create stratified samples (shuffled, fixed seed for reproducibility)
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
stratified_samples = []

# Generate the stratified samples: the first element of every split is ignored
# and only the second index array is used to select the rows of one sample.
for _, sample_index in skf.split(sentences_reduced, sentences_reduced['manual_sentence_labels']):
    # Take an explicit copy. Assigning a column on the bare .iloc selection
    # raises pandas' SettingWithCopyWarning, and the same warning would fire
    # again whenever a column is later added to the sample.
    stratified_sample = sentences_reduced.iloc[sample_index].copy()
    # Turn the -1 placeholder back into NaN now that stratification is done.
    stratified_sample['manual_sentence_labels'] = stratified_sample['manual_sentence_labels'].replace(-1, np.nan)
    stratified_samples.append(stratified_sample)

# Tabulate the label distribution of every sample: one Series per sample,
# named after the sample's position, with NaN counted as its own value.
per_sample_counts = [
    sample['manual_sentence_labels'].value_counts(dropna=False).rename(i)
    for i, sample in enumerate(stratified_samples)
]

# Put the distributions side by side (one column per sample).
class_counts = pd.DataFrame()
for counts in per_sample_counts:
    class_counts = pd.concat([class_counts, counts], axis=1)

# For every label value find the sample with the largest count, then pick the
# sample that comes out on top for the most label values.
best_sample_per_label = class_counts.idxmax(axis=1)
examples_index = best_sample_per_label.value_counts().idxmax()

# That sample becomes the "examples" set.
examples = stratified_samples[examples_index]

# Every sample except the "examples" one, in their original order.
remaining_samples = [
    frame for position, frame in enumerate(stratified_samples) if position != examples_index
]

# Name the remaining samples "fold1" .. "fold4" (there are folds - 1 of them).
folds_dict = {
    f"fold{number}": frame for number, frame in enumerate(remaining_samples, start=1)
}

# Collect everything in one mapping, with "examples" first.
output_dfs = {"examples": examples, **folds_dict}

# Quick look at the first three frames. NOTE(review): in a notebook only the
# last expression of a cell is rendered, so this line likely shows nothing.
output_dfs["examples"], output_dfs["fold1"], output_dfs["fold2"]

# Keys of output_dfs, in the order the frames are stacked below
fold_names = ["examples"] + [f"fold{number}" for number in range(1, 5)]

# Tag every row with the name of the frame it comes from
for name in fold_names:
    fold_frame = output_dfs[name]
    fold_frame['fold'] = name

# Stack all frames into a single dataframe with a fresh 0..n-1 index
frames_in_order = [output_dfs[name] for name in fold_names]
merged_df = pd.concat(frames_in_order, ignore_index=True)

# Final naming: the "examples" frame becomes fold1 and every other frame moves
# up by one, so the saved file contains fold1 .. fold5.
final_fold_names = {
    'examples': 'fold1',
    'fold1': 'fold2',
    'fold2': 'fold3',
    'fold3': 'fold4',
    'fold4': 'fold5',
}
merged_df['fold'] = merged_df['fold'].replace(final_fold_names)

# Sanity check: overall label distribution (NaN included) after merging
label_distribution = merged_df['manual_sentence_labels'].value_counts(dropna=False)
print(label_distribution)

# Save the folds as a tab-separated file next to the input
merged_df.to_csv(r'.\Intermediate results\folds_test.csv', sep='\t')
