# Data Exploration and Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the semicolon-delimited source file into the working DataFrame `kv`
kv = pd.read_csv(
    r"C:\Users\Home\Desktop\KartuVerbs-main\data_vn+",
    sep=';',
)

In [2]:
# Explore the DataFrame
print(kv.head(10))  # First ten rows
print(kv.columns)  # Column names
# info() writes its report to stdout itself and returns None;
# wrapping it in print() would emit an extra, meaningless "None" line.
kv.info()
print(kv.describe())  # Summary statistics for the numerical columns

          form tense_in_paradigm  person number preverb pre2    root sf2  \
0    ვაბეზარობ           present       1     sg       -    ვ  აბეზარ  ობ   
1    ჰაბეზარობ           present       2     sg       -    ჰ  აბეზარ  ობ   
2     აბეზარობ           present       2     sg       -    -  აბეზარ  ობ   
3   ჰაბეზარობს           present       3     sg       -    ჰ  აბეზარ  ობ   
4    აბეზარობს           present       3     sg       -    -  აბეზარ  ობ   
5   ვაბეზარობთ           present       1     pl       -    ვ  აბეზარ  ობ   
6   ჰაბეზარობთ           present       2     pl       -    ჰ  აბეზარ  ობ   
7    აბეზარობთ           present       2     pl       -    -  აბეზარ  ობ   
8  ჰაბეზარობენ           present       3     pl       -    ჰ  აბეზარ  ობ   
9   აბეზარობენ           present       3     pl       -    -  აბეზარ  ობ   

  caus_sf ending tsch_class morph_type sub_id  id          vn  
0       -      -         MV     active    1-1   1  *აბეზარობა  
1       -      -         MV     act

In [3]:
# Count the missing entries in each column
print(kv.isna().sum())

form                 0
tense_in_paradigm    0
person               0
number               0
preverb              0
pre2                 0
root                 0
sf2                  0
caus_sf              0
ending               0
tsch_class           0
morph_type           0
sub_id               0
id                   0
vn                   0
dtype: int64


## Trying to extract third person column 

This is the first attempt at transforming the DataFrame so that the third-person form becomes the target value. It ultimately failed; scroll down to find the second attempt.


In [5]:
# Treat the "-" placeholder as a missing value.
# The result is rebound to `kv` instead of using inplace=True, which pandas
# discourages and which offers no benefit here.
kv = kv.replace("-", np.nan)

In [None]:
# Preview the first ten rows again.
# NOTE: this cell only previews rows; the per-column missing-value count is
# printed by a separate cell further down.
kv.head(10)

In [None]:
# Function to add a new column with the third person singular form of the verb
def add_third_person_singular(kv):
    """Add a '3PS' column holding the third person singular form of each verb.

    A row's 'form' is copied into '3PS' when 'person' equals 3, 'pre2' is not
    "ჰ" and 'number' is not "pl"; every other row receives a missing value.
    The frame is modified in place and nothing is returned.

    The selection uses vectorized column comparisons rather than a row-wise
    ``apply``; this is faster on large frames and also works on an empty frame.

    Parameters
    ----------
    kv : pandas.DataFrame
        Must contain the columns 'form', 'person', 'pre2' and 'number'.
    """
    is_third_singular = (
        kv['person'].eq(3)
        # A missing 'pre2' compares as "not ჰ", so such rows stay eligible.
        & kv['pre2'].ne("ჰ")
        & kv['number'].ne("pl")
    )
    # Series.where keeps 'form' where the mask is True and leaves a missing value elsewhere.
    kv['3PS'] = kv['form'].where(is_third_singular)

add_third_person_singular(kv)  # Adds the '3PS' column to kv in place
# Preview the first ten rows, now including the new '3PS' column.
# NOTE: the unique values of '3PS' are not inspected in this cell.
kv.head(10)

In [None]:
# Blank out 'form' on every row that has a non-null '3PS' value.
# NOTE(review): confirm that 'form' (and not 'vn') is the column meant to be cleared.
kv.loc[kv['3PS'].notna(), 'form'] = np.nan
kv.head(10)  # Preview the first ten rows to verify the change

In [None]:
# Strip any leading '*' characters from the values in the 'vn' column
kv['vn'] = kv['vn'].str.lstrip(to_strip='*')
kv.head(10)  # Preview the first ten rows to verify the change

In [None]:
# Re-count the missing entries per column now that the frame has been modified
print(kv.isna().sum())

In [11]:
# Write the modified DataFrame to a new semicolon-delimited CSV file, without the index
kv.to_csv(
    r"C:\Users\Home\Desktop\KartuVerbs-main\data_vn+_modified.csv",
    sep=';',
    index=False,
)

In [None]:
# Load the second source file (data_vn-) into its own DataFrame
kv_2 = pd.read_csv(
    r"C:\Users\Home\Desktop\KartuVerbs-main\data_vn-",
    sep=';',
)
kv_2.head(10)  # Preview the first ten rows of the new DataFrame

In [13]:
# Apply the same '3PS' extraction to the second DataFrame
add_third_person_singular(kv_2)
# Blank out 'form' on every row that has a non-null '3PS' value
kv_2.loc[kv_2['3PS'].notnull(), 'form'] = np.nan
# NOTE(review): kv_2 is saved without any "-" -> NaN replacement or '*' stripping
# being applied to it in this cell — confirm this is intended.
# Save the modified DataFrame to a new CSV file
kv_2.to_csv(r"C:\Users\Home\Desktop\KartuVerbs-main\data_vn-_modified.csv", sep=';', index=False)
# Preview last: a notebook cell renders only its final expression, so a
# head() call placed mid-cell would be evaluated and silently discarded.
kv_2.head(10)


Since the original KartuVerbs project seems to have pre-split the data into training and test sets, `data_vn-` has little relevance for us. I will split the data into training and test sets myself when coding the NN in the `georgian_conjugation_NN.ipynb` file.

## Second try of extracting adequate data

Here, our main goal is to extract the `form` column so that we have a target value `y` consisting of third-person forms.

In [8]:
# NOTE(review): earlier cells modify `kv` (placeholder replacement, clearing 'form',
# stripping '*'), so these numbers depend on which cells ran before this one —
# confirm them on a fresh kernel.
# Data type of every column
print(kv.dtypes)
# Number of distinct values in the 'vn' column (nunique() ignores missing values by default)
print(kv['vn'].nunique())
# Number of distinct values in the 'form' column
print(kv['form'].nunique())


form                 object
tense_in_paradigm    object
person                int64
number               object
preverb              object
pre2                 object
root                 object
sf2                  object
caus_sf              object
ending               object
tsch_class           object
morph_type           object
sub_id               object
id                    int64
vn                   object
dtype: object
282
72465
