# Set up

In [None]:
# Generic Imports
import os
import csv
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the datasets

In [None]:
# Labelled sentence corpus; later cells read its 'text', 'label' and 'Error' columns.
ice_error_corpus_data = pd.read_csv(
    "./generated_datasets/labeledData.csv",
    encoding='latin-1',
)

# Semicolon-separated inflection table without a header row, so pandas names
# the columns 0, 1, 2, ... and later cells address them by those integers.
# NOTE(review): both files are decoded as latin-1 — confirm this matches the
# files on disk, since later tag matching relies on 'Þ' being decoded correctly.
bin_data = pd.read_csv(
    "./original_datasets/Storasnid_beygm.csv",
    sep=";",
    header=None,
    encoding='latin-1',
    low_memory=False,
)

## Inspect the `bin_data` dataset

In [None]:
# Display the raw inflection table (the notebook renders the cell's last expression).
bin_data

In [None]:
# Summary statistics for the raw inflection table.
bin_data.describe()

### Manipulate the dataset

In [None]:
# Discard the columns the generator never reads. The helper functions below
# only use column 1 (grouping id), column 3 (word form) and column 4 (tag).
case_data = bin_data.drop(columns=[0, 2, 5, 6, 7])

### Inspect our dataset again

In [None]:
# Display the reduced inflection table to confirm which columns remain.
case_data

## Inspect the ice_error_corpus Data

In [None]:
# Display the labelled sentence corpus.
ice_error_corpus_data

In [None]:
# Summary statistics for the labelled sentence corpus.
ice_error_corpus_data.describe()

### Manipulate the data

In [None]:
# Keep only the rows whose label marks the sentence as correct
correct_sentences = ice_error_corpus_data[ice_error_corpus_data['label'] == 'correct']
# Summarise the selection
correct_sentences.describe()


In [None]:
# Every remaining row is 'correct', so the label and error-type columns carry
# no information any more; remove both in one call.
correct_sentences = correct_sentences.drop(columns=['label', 'Error'])
correct_sentences.describe()

In [None]:
# Drop duplicate rows
# Rows are compared on all remaining columns, so each distinct row is kept once.
correct_sentences = correct_sentences.drop_duplicates()
correct_sentences.describe()

## Create a Validation Set 

In [None]:
# Create a small test set
# Hold out 10% of the de-duplicated correct sentences as `test_set`; the
# remaining 90% replace `correct_sentences` and feed the generator further down.
# NOTE(review): no random_state is passed, so the split differs between runs —
# confirm that a non-reproducible split is intended.
correct_sentences, test_set = train_test_split(correct_sentences, test_size=0.1)
test_set.describe()

In [None]:
# Select all the incorrect sentences
incorrect_sentences = ice_error_corpus_data.loc[ice_error_corpus_data['label'] == 'incorrect']
# Keep only the rows whose error description mentions 'inflection'.
# na=False makes rows with a missing 'Error' value count as non-matches; without
# it the mask contains NaN and boolean indexing raises a ValueError.
incorrect_sentences = incorrect_sentences[incorrect_sentences['Error'].str.contains('inflection', na=False)]
# The error description is not needed in the validation set; 'label' is kept.
incorrect_sentences = incorrect_sentences.drop('Error', axis=1)
incorrect_sentences = incorrect_sentences.drop_duplicates()
incorrect_sentences.describe()

In [None]:
# Append the incorrect sentences to the held-out correct ones.
test_set = pd.concat([test_set,incorrect_sentences])
# The held-out rows had their 'label' column dropped earlier, so after the
# concat their label is NaN. Fill only that column: filling the whole frame
# would also turn a missing 'text' value into the literal sentence "correct".
test_set = test_set.fillna({'label': 'correct'})
test_set.describe()

In [None]:
# Display the combined validation set.
test_set

In [None]:
# Write the validation set to the working directory as UTF-8, without the index column.
test_set.to_csv('synthetic_validation_set.csv', encoding='utf-8', index=False)

# Helper functions (candidates for extraction into a separate module)

In [None]:
# A function that, if given a correct sentence, returns an array of incorrectly declined sentences
def generate_incorrect_sentences(correct_sentence):
    """Build sentence variants in which exactly one word is swapped for another form.

    The sentence is split on whitespace. Each token that occurs in column 3 of
    the global ``case_data`` table is replaced, one at a time, by every form
    that ``sample_new_cases`` returns for it. Tokens are re-joined with single
    spaces.

    Args:
        correct_sentence: The sentence to corrupt, as a string.

    Returns:
        A list of strings, one per (token position, replacement form) pair.
        The list is empty when no token is found in ``case_data``.
    """
    tokens = correct_sentence.split()
    corrupted = []
    for position, token in enumerate(tokens):
        # All table rows whose word form equals this token
        matches = case_data.loc[case_data[3] == token]
        if matches.empty:
            continue
        for replacement in sample_new_cases(matches):
            # Same sentence, with only the token at `position` substituted
            variant = tokens[:position] + [replacement] + tokens[position + 1:]
            corrupted.append(" ".join(variant))
    return corrupted

# A function that returns an array of different declensions of a word
def sample_new_cases(df_same_words):
    """Collect alternative word forms for the entries in ``df_same_words``.

    Args:
        df_same_words: Rows of ``case_data`` that share one word form. Column 1
            is used as the grouping id (presumably the lemma/word id — confirm
            against the dataset documentation) and column 4 as the tag.

    Returns:
        A list of word forms (column 3 values) taken from every row of
        ``case_data`` that shares an id with ``df_same_words`` and whose tag is
        one of the tags produced by ``return_array_of_incorrect_case_types``.
    """
    wrong_tags = return_array_of_incorrect_case_types(df_same_words)
    alternatives = []
    for entry_id in set(df_same_words[1].tolist()):
        # Every row of the table that carries this id
        same_id_rows = case_data.loc[case_data[1] == entry_id]
        has_wrong_tag = same_id_rows[4].isin(wrong_tags)
        alternatives.extend(same_id_rows[3][has_wrong_tag])
    return alternatives

# Mutually exclusive markers, listed in the order in which they are searched for.
_PERSON_MARKERS = ("1P", "2P", "3P")
_CASE_MARKERS = ("NF", "EF", "ÞGF", "ÞF")
# Tag prefixes whose tags get their person marker swapped instead of a case marker.
_PERSON_SWAP_PREFIXES = ("GM", "OP", "MM")


def _swap_first_marker(tag, markers):
    """Return copies of ``tag`` with its marker replaced by each other marker.

    ``markers`` is searched in order; the first one found in ``tag`` is
    replaced at its first occurrence. Returns an empty list if none occurs.
    """
    for marker in markers:
        index = tag.find(marker)
        if index != -1:
            head, tail = tag[:index], tag[index + len(marker):]
            return [head + other + tail for other in markers if other != marker]
    return []


def return_array_of_incorrect_case_types(df_same_words):
    """Derive the tags that would be wrong replacements for the given word form.

    For every tag in column 4 of ``df_same_words``:

    * tags starting with "SP" contribute nothing;
    * tags starting with "GM", "OP" or "MM" contribute the tag with its person
      marker (1P/2P/3P) replaced by the two other persons;
    * every other tag contributes the tag with its case marker
      (NF/EF/ÞGF/ÞF) replaced by the three other cases.

    Tags without the relevant marker contribute nothing. Any generated tag
    that is also one of the input tags is removed, because that form is
    identical to the original word and would not be an error.

    Args:
        df_same_words: Mapping/DataFrame whose key ``4`` yields the tag strings.

    Returns:
        A set of tag strings (despite the "array" in the function name).
    """
    correct_tags = set()
    candidate_tags = set()
    for case in df_same_words[4]:
        correct_tags.add(case)
        prefix = case[0:2]
        if prefix == "SP":
            # NOTE(review): the previous implementation computed a FH<->VH
            # swapped tag here but never added it to the result, so "SP" tags
            # have always produced nothing. That behaviour is preserved;
            # confirm whether the swapped tag was meant to be included.
            continue
        if prefix in _PERSON_SWAP_PREFIXES:
            markers = _PERSON_MARKERS
        else:
            markers = _CASE_MARKERS
        candidate_tags.update(_swap_first_marker(case, markers))
    return candidate_tags - correct_tags
                

# A function that returns an array, with one random column from each row
def sample_column(csv):
    """Pick one randomly chosen value from every row of the synthetic data file.

    The file ``./generated_datasets/synthetic_data.csv`` is read as UTF-8 and
    rows that are entirely empty are dropped. For each remaining row a column
    position is drawn uniformly from ``0 .. (number of non-null values - 1)``.

    NOTE(review): the draw assumes the non-null values occupy the leftmost
    columns of the row; a row with a gap can yield NaN — confirm against the
    file layout.

    Args:
        csv: Unused. The input path is hard-coded; the parameter is kept so
            existing calls keep working. (The name also shadows the ``csv``
            module inside this function.)

    Returns:
        A list of ``[value, "incorrect"]`` pairs, one per row.
    """
    data = pd.read_csv('./generated_datasets/synthetic_data.csv', encoding='utf-8')
    data = data.dropna(axis=0, how='all')
    sampled = []
    for i in range(len(data)):
        row = data.iloc[i]
        number_of_columns = row.notnull().sum()
        rng_column = np.random.randint(0, number_of_columns)
        # Explicit positional access: a plain integer key on a label-indexed
        # Series relies on a deprecated positional fallback.
        sampled.append([row.iloc[rng_column], "incorrect"])
    return sampled
            
def create_csv(data):
    """Write every generated sentence to ``synthetic_data.csv`` labelled "incorrect".

    The file is created (or overwritten) in the working directory as UTF-8,
    with a ``text,label`` header followed by one line per sentence.

    Args:
        data: An iterable of iterables of sentences; the nesting is flattened.

    Returns:
        None.
    """
    with open('synthetic_data.csv', 'w', newline='', encoding="utf-8") as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(["text","label"])
        # Flatten the nested groups and pair each sentence with its label
        csv_writer.writerows(
            [sentence, "incorrect"] for group in data for sentence in group
        )

# Implementation

In [None]:
## Array that collects our incorrect sentences
# One entry per correct sentence; each entry is the list returned by
# generate_incorrect_sentences for that sentence.
output = []

# Variable used to track the progress of the parser
rows_to_use = len(correct_sentences)
milestone = rows_to_use/10

# NOTE: `incorrect_sentences` is rebound to a list here, replacing the
# DataFrame of the same name built in an earlier cell.
for i, correct_sentence in enumerate(correct_sentences['text']):
    incorrect_sentences = generate_incorrect_sentences(correct_sentence)
    output.append(incorrect_sentences)

    # Track the progress of the parser: report each time another tenth of the rows is done
    if(i>=milestone):
        print("Finished : ",(milestone/(rows_to_use/10))*10,"%")
        milestone += rows_to_use/10
print("Finished!")

# Create a backup of the object to work with, just in case
my_back_up = output.copy()

# Work on a separate (shallow) copy of the results. This assignment used to be
# commented out, which made the create_csv call below fail with a NameError.
my_data = output.copy()
# create_csv writes synthetic_data.csv and returns None; `full_CSV` is kept only
# so that later cells referring to the name still find it.
full_CSV = create_csv(my_data)

In [None]:
import csv
import random

# Open the CSV file and read the rows.
# The validation set is written as UTF-8 by an earlier cell, so decode it
# explicitly instead of relying on the platform default; newline='' is what the
# csv module expects for files it reads or writes.
with open('./generated_datasets/April/synthetic_validation_set.csv', 'r', newline='', encoding='utf-8') as csv_file:
    csv_reader = csv.reader(csv_file)
    # Keep the header out of the shuffle so it stays on the first line.
    # NOTE(review): assumes the first row is the column header, as written by
    # DataFrame.to_csv — confirm for the copy stored under April/.
    header = next(csv_reader, None)
    rows = list(csv_reader)

# Shuffle the data rows randomly
random.shuffle(rows)

# Write the header followed by the shuffled rows to a new CSV file
with open('shuffled_validation_file.csv', 'w', newline='', encoding='utf-8') as shuffled_file:
    csv_writer = csv.writer(shuffled_file)
    if header is not None:
        csv_writer.writerow(header)
    csv_writer.writerows(rows)