In [8]:
import pandas as pd
import numpy as np
from itertools import chain
import random
import os
from datetime import datetime

### Add the `euph_status` column to the Turkish dataset (if needed)

In [7]:
# Label every row of the Turkish dataset with the euphemism status of its PET
# and save the result as the "combined" CSV.
df = pd.read_csv("turkish_dataset_2024_pre.csv")

pets = set(df['PET'])

df['euph_status'] = None

### initializing the euph_status ####################################
# A PET is 'always_euph' when the mean of its labels is exactly 1 (i.e. every
# example is labelled 1, assuming 0/1 labels); any other PET is
# 'sometimes_euph'. groupby visits each PET once instead of re-scanning the
# whole frame per PET; rows whose PET is missing are skipped and keep None.
for _, examples in df.groupby('PET'):
    if examples['label'].mean() == 1:
        status = 'always_euph'
    else:
        status = 'sometimes_euph'
    df.loc[examples.index, 'euph_status'] = status
#####################################################################

# The row index is written too, so it shows up as the first (unnamed) column
# when this file is read back with pd.read_csv.
df.to_csv("turkish_combined_2024.csv")


### `sometimes_euph` examples sampled into both train/val and test (each `always_euph` PET stays whole in one split)

In [9]:
from sklearn.model_selection import train_test_split

# Languages to build splits for; every entry must be a key of LANGUAGE_INDEX.
langs = ['turkish'] # ['chinese', 'english', 'spanish', 'yoruba', 'turkish']
number_sets = 20

# Offset added to the split number in the train/val file names, per language.
LANGUAGE_INDEX = {
    'chinese': 0,
    'english': 20,
    'spanish': 40,
    'yoruba': 60,
    'turkish': 80,
    'polish': 100,
    'ukrainian': 120,
}

for language in langs:
    # Fail up front instead of hitting an undefined (or stale) `index` after
    # the split has already been computed.
    if language not in LANGUAGE_INDEX:
        raise ValueError(f"no file-name offset defined for language {language!r}")
    index = LANGUAGE_INDEX[language]

    main_folder = f"zero_shot {language} {datetime.today().date()}"
    os.makedirs(main_folder, exist_ok=True)

    # Loading and labelling do not depend on the split number, so they are
    # done once per language rather than once per generated split.
    df = pd.read_csv(f"{language}_combined_2024.csv")

    # Target sizes (in rows) of the three splits.
    train_data = .8 * len(df)
    val_data = .1 * len(df)
    test_data = .1 * len(df)

    pets = set(df['PET'])

    ### initializing the euph_status ####################################
    # 'always_euph' when the mean label of a PET is exactly 1, otherwise
    # 'sometimes_euph'. Rows with a missing PET are skipped and keep None.
    df['euph_status'] = None
    for _, examples in df.groupby('PET'):
        if examples['label'].mean() == 1:
            status = 'always_euph'
        else:
            status = 'sometimes_euph'
        df.loc[examples.index, 'euph_status'] = status
    #####################################################################

    SE_examples = df[df['euph_status'] == 'sometimes_euph']
    AE_examples = df[df['euph_status'] == 'always_euph']
    sorted_AE_pets = sorted(set(AE_examples['PET']))

    for number in range(number_sets):
        # sometimes_euph examples: a random 90% go to the train+val pool and
        # the remaining 10% (shuffled) to test.
        temp_df = SE_examples.sample(frac = 0.9)
        test_df = SE_examples.drop(temp_df.index).sample(frac = 1.0)

        # Start every split from the same sorted order before shuffling.
        AE_pets = list(sorted_AE_pets)
        random.shuffle(AE_pets)

        # always_euph PETs are assigned whole: to the train+val pool until it
        # reaches its target size, then to test until test reaches its target.
        # PETs left over once both are full are not written to any file.
        train_parts = [temp_df]
        test_parts = [test_df]
        train_rows = len(temp_df)
        test_rows = len(test_df)
        for pet in AE_pets:
            this_pet = AE_examples[AE_examples['PET'] == pet]
            if train_rows <= train_data + val_data:
                train_parts.append(this_pet)
                train_rows += len(this_pet)
            elif test_rows <= test_data:
                test_parts.append(this_pet)
                test_rows += len(this_pet)

        # Concatenate once instead of growing the frames inside the loop.
        temp_df = pd.concat(train_parts)
        test_df = pd.concat(test_parts)

        # 0.1111 of the ~90% pool is roughly 10% of the whole dataset.
        train_df, val_df = train_test_split(temp_df, test_size=0.1111)

        trainName = os.path.join(main_folder, f"train_{number + index}.csv")
        valName = os.path.join(main_folder, f"val_{number + index}.csv")
        testName = os.path.join(main_folder, f"test_{number}_{language}.csv")

        train_df.to_csv(trainName)
        val_df.to_csv(valName)
        test_df.to_csv(testName)

### `sometimes_euph` PETs can be in both train/val and test (Chinese, English, Spanish, Yoruba)

In [1]:
from sklearn.model_selection import train_test_split

langs = ['chinese', 'english', 'spanish', 'yoruba']
number_sets = 20

# Per language: (offset added to the split number in train/val file names,
# column names given to the output frames). The names are applied by
# position to the rows of the loaded frame, so their order must match it.
LANGUAGE_LAYOUT = {
    'chinese': (0, ['index', 'orig_text', 'label', 'pet', 'text', 'euph_status']),
    'english': (20, ['index', 'text', 'label', 'category', 'pet', 'euph_status', 'sentence', 'orig_text']),
    'spanish': (40, ['index', 'PET', 'text', 'label', 'category', 'source', 'bibliography', 'country', 'euph_status']),
    'yoruba': (60, ['index', 'PET', 'literal translation', 'euphemistic meanings', 'category', 'label', 'text', 'source', 'contains_boundary_tokens', 'euph_status']),
}

for language in langs:
    # Fail up front instead of reusing a stale `index`/`temp_df` later on.
    if language not in LANGUAGE_LAYOUT:
        raise ValueError(f"no output layout defined for language {language!r}")
    index, columns = LANGUAGE_LAYOUT[language]

    df = pd.read_csv(f"{language}_combined_2024.csv")

    # Target sizes (in rows) of the three splits.
    train_data = .8 * len(df)
    val_data = .1 * len(df)
    test_data = .1 * len(df)

    pets = sorted(set(df['PET']))

    # 'always_euph' when the mean label of a PET is exactly 1, otherwise
    # 'sometimes_euph'. Rows with a missing PET are skipped and keep None.
    df['euph_status'] = None
    for _, examples in df.groupby('PET'):
        if examples['label'].mean() == 1:
            status = 'always_euph'
        else:
            status = 'sometimes_euph'
        df.loc[examples.index, 'euph_status'] = status

    # PETs whose rows are 'always_euph'. The status is compared first and the
    # PETs collected afterwards: calling .all() on the string column itself
    # and then comparing with 'always_euph' does not reliably yield the
    # status, which made always-euph PETs get split row by row.
    always_pets = set(df.loc[df['euph_status'] == 'always_euph', 'PET'])

    # Row lists per PET do not change between splits; build them once.
    rows_by_pet = {pet: df[df['PET'] == pet].values.tolist() for pet in pets}

    main_folder = f"zero_shot {language} {datetime.today().date()}"
    os.makedirs(main_folder, exist_ok=True)

    for number in range(number_sets):
        list_train = []
        list_test = []
        random.shuffle(pets)

        for pet in pets:
            list_examples = rows_by_pet[pet]

            if pet in always_pets:
                # Always-euph PETs are kept whole: all rows go to the same
                # side, or nowhere once both sides reached their target size.
                if len(list_train) <= train_data + val_data:
                    list_train.extend(list_examples)
                elif len(list_test) <= test_data:
                    list_test.extend(list_examples)
            else:
                # Sometimes-euph PETs are placed row by row, so one PET may
                # end up on both sides.
                for row in list_examples:
                    if len(list_train) <= train_data + val_data:
                        list_train.append(row)
                    elif len(list_test) <= test_data:
                        list_test.append(row)

        temp_df = pd.DataFrame(list_train, columns = columns)
        dataframe_test = pd.DataFrame(list_test, columns = columns)

        # 0.1111 of the ~90% pool is roughly 10% of the whole dataset.
        dataframe_train, dataframe_val = train_test_split(temp_df, test_size=0.1111)

        print(len(dataframe_train), len(dataframe_val), len(dataframe_test))

        trainName = os.path.join(main_folder, f"train_{number + index}.csv")
        valName = os.path.join(main_folder, f"val_{number + index}.csv")
        testName = os.path.join(main_folder, f"test_{number}_{language}.csv")

        dataframe_train.to_csv(trainName)
        dataframe_val.to_csv(valName)
        dataframe_test.to_csv(testName)



NameError: name 'pd' is not defined

### Train and val with NO PET overlap (each PET goes entirely to train, val, or test)

In [None]:
# NOTE(review): this cell defines none of `language`, `df`, `pets`,
# `train_data`, `val_data` or `test_data`; they must already exist in the
# kernel from an earlier cell. `pets` has to be a list (it is shuffled in
# place). Confirm which cell is meant to provide them.

# Number of splits to generate, typed in by the user.
number_sets = int(input())

# Per language: (offset added to the split number in train/val file names,
# column names given to the output frames). The names are applied by
# position to the rows of `df`, so they must match its column order.
# NOTE(review): these layouts have no trailing 'euph_status' column; confirm
# they match the frame that is loaded when this cell runs.
SPLIT_LAYOUT = {
    'chinese': (0, ['index', 'orig_text', 'label', 'pet', 'text']),
    'english': (20, ['index', 'text', 'label', 'category', 'pet', 'euph_status', 'sentence']),
    'spanish': (40, ['index', 'PET', 'text', 'label', 'category', 'source', 'bibliography', 'country']),
    'yoruba': (60, ['index', 'PET', 'literal translation', 'euphemistic meanings', 'category', 'label', 'text', 'source', 'contains_boundary_tokens', 'multiple_boundary_tokens']),
}

# The layout follows `language` (as the file names already do) instead of
# being switched by commenting code in and out.
if language not in SPLIT_LAYOUT:
    raise ValueError(f"no output layout defined for language {language!r}")
index, columns = SPLIT_LAYOUT[language]

main_folder = f"zero_shot {language} {datetime.today().date()}"
os.makedirs(main_folder, exist_ok=True)

# Row lists per PET do not change between splits; build them once.
rows_by_pet = {word: df[df['PET'] == word].values.tolist() for word in pets}

for number in range(number_sets):
    list_train = []
    list_eval = []
    list_test = []
    random.shuffle(pets)

    # Every PET is assigned whole: train is filled first, then val, then
    # test. PETs left over once all three reached their target size are
    # not written to any file.
    for word in pets:
        list_examples = rows_by_pet[word]

        if len(list_train) <= train_data:
            list_train.extend(list_examples)
        # `val_data` is the val-size target used by the other split cells;
        # `eval_data` was never defined and raised NameError here.
        elif len(list_eval) <= val_data:
            list_eval.extend(list_examples)
        elif len(list_test) <= test_data:
            list_test.extend(list_examples)

    print(len(list_train), len(list_eval), len(list_test))

    dataframe_train = pd.DataFrame(list_train, columns = columns)
    dataframe_eval = pd.DataFrame(list_eval, columns = columns)
    dataframe_test = pd.DataFrame(list_test, columns = columns)

    trainName = os.path.join(main_folder, f"train_{number + index}.csv")
    evalName = os.path.join(main_folder, f"val_{number + index}.csv")
    testName = os.path.join(main_folder, f"test_{number}_{language}.csv")

    dataframe_train.to_csv(trainName)
    dataframe_eval.to_csv(evalName)
    dataframe_test.to_csv(testName)



### Train and val may share PETs (test PETs are held out from both)

In [75]:
from sklearn.model_selection import train_test_split

# NOTE(review): this cell defines none of `language`, `df`, `pets`,
# `train_data`, `val_data` or `test_data`; they must already exist in the
# kernel from an earlier cell. `pets` has to be a list (it is shuffled in
# place). Confirm which cell is meant to provide them.

# Number of splits to generate, typed in by the user.
number_sets = int(input())

# Per language: (offset added to the split number in train/val file names,
# column names given to the output frames). The names are applied by
# position to the rows of `df`, so they must match its column order.
# NOTE(review): these layouts have no trailing 'euph_status' column; confirm
# they match the frame that is loaded when this cell runs.
SPLIT_LAYOUT = {
    'chinese': (0, ['index', 'orig_text', 'label', 'pet', 'text']),
    'english': (20, ['index', 'text', 'label', 'category', 'pet', 'euph_status', 'sentence']),
    'spanish': (40, ['index', 'PET', 'text', 'label', 'category', 'source', 'bibliography', 'country']),
    'yoruba': (60, ['index', 'PET', 'literal translation', 'euphemistic meanings', 'category', 'label', 'text', 'source', 'contains_boundary_tokens', 'multiple_boundary_tokens']),
}

# Fail up front instead of reusing a stale `temp_df` for an unlisted language.
if language not in SPLIT_LAYOUT:
    raise ValueError(f"no output layout defined for language {language!r}")
# The file-name offset now follows the language, like the column layout
# does; it used to be Yoruba's 60 for every language.
index, columns = SPLIT_LAYOUT[language]

main_folder = f"zero_shot {language} {datetime.today().date()}"
os.makedirs(main_folder, exist_ok=True)

# Row lists per PET do not change between splits; build them once.
rows_by_pet = {word: df[df['PET'] == word].values.tolist() for word in pets}

for number in range(number_sets):
    list_train = []
    list_test = []
    random.shuffle(pets)

    # Every PET is assigned whole, either to the train+val pool or to test,
    # so test PETs never appear in train/val. PETs left over once both
    # reached their target size are not written to any file.
    for word in pets:
        list_examples = rows_by_pet[word]

        if len(list_train) <= train_data + val_data:
            list_train.extend(list_examples)
        elif len(list_test) <= test_data:
            list_test.extend(list_examples)

    temp_df = pd.DataFrame(list_train, columns = columns)
    dataframe_test = pd.DataFrame(list_test, columns = columns)

    # Random row-level split of the pool, so train and val can share PETs.
    # 0.1111 of the ~90% pool is roughly 10% of the whole dataset.
    dataframe_train, dataframe_val = train_test_split(temp_df, test_size=0.1111)

    print(len(dataframe_train), len(dataframe_val), len(dataframe_test))

    trainName = os.path.join(main_folder, f"train_{number + index}.csv")
    valName = os.path.join(main_folder, f"val_{number + index}.csv")
    testName = os.path.join(main_folder, f"test_{number}_{language}.csv")

    dataframe_train.to_csv(trainName)
    dataframe_val.to_csv(valName)
    dataframe_test.to_csv(testName)




KeyboardInterrupt

