# A notebook for cleaning up pitch data
### Sources
1. Kishimoto's
2. NHK
3. Wadoku?

In [9]:
# %ls -la
# %lsmagic

In [14]:
# Some experiment

import pandas as pd
import numpy as np

# Scratch frame: 10 rows x 5 columns of standard-normal noise drawn from
# NumPy's global random state (unseeded, so values differ on every run).
df = pd.DataFrame(data=np.random.standard_normal(size=(10, 5)))
df.head()

Unnamed: 0,0,1,2,3,4
0,-0.059796,-1.196227,0.52614,-0.139966,0.30283
1,0.088032,-1.099832,-0.364072,-1.269601,-0.040648
2,0.142787,0.145549,1.001644,0.612252,0.753116
3,-0.730892,-0.460382,-1.396289,0.909458,1.355108
4,-0.229974,-0.970496,-0.495748,-0.367059,0.612684


### Installation

In [28]:
# Optional one-time setup: uncomment the lines below to install the packages
# this notebook uses.
# !pip install pykakasi
# !pip install pandas
# !pip install numpy
# !pip install jupyterthemes

# List the available notebook themes. `jt` comes with jupyterthemes, so this
# reports "command not found" until that package is installed (its install
# line above is commented out).
!jt -l

# Then switch to a theme with:
# jt -t theme-name

zsh:1: command not found: jt


In [None]:
import pykakasi
# Quick pykakasi check: convert one katakana sample and show each piece.
kks = pykakasi.kakasi()
text = "アーラーメンオンスーリツ"
result = kks.convert(text)

# Raw conversion result first, then one formatted line per converted item.
print(result)
for item in result:
    print(
        f"{item['orig']}: kana '{item['kana']}', "
        f"hiragana '{item['hira']}', romaji: '{item['hepburn']}'"
    )
            


In [16]:
import csv 
import json

### Clean up Kishimoto

### Constants

In [21]:
# Input CSV files, relative to the notebook's working directory.
kishimotoPath = "assets/input/kishimoto.csv"
nhkPath = "assets/input/nhk.csv"

# Output file locations.
outputCsvPath = 'assets/output/output.csv'
outputJsonPath = 'assets/output/output.json'

In [22]:
def cleanKishimoto(csvFilePath):
    """Scan the Kishimoto CSV, normalise each row and report the empty cells.

    Each row is read as a dict. The "image" column is dropped when present,
    empty cells are replaced with None (and counted), and non-empty "accent"
    and "morae" cells are converted to int. The cleaned rows are not kept;
    the printed count is the only result.

    Args:
        csvFilePath: Path of a UTF-8 encoded CSV file with a header row.

    Returns:
        None. Prints "Number of null values: <count>" once all rows are read.

    Raises:
        KeyError: if the file has no "accent" or "morae" column.
        ValueError: if a non-empty "accent" or "morae" cell is not an integer.
    """
    nullCount = 0  # Counter for null values

    # newline='' leaves newline handling to the csv module, as its
    # documentation asks for file objects handed to a reader.
    with open(csvFilePath, encoding='utf-8', newline='') as csvf:
        # DictReader yields one dict per data row, keyed by the header row.
        for row in csv.DictReader(csvf):
            # Exclude the "image" column. pop() tolerates files that have no
            # such column, where `del row["image"]` would raise KeyError.
            row.pop("image", None)

            # Replace empty cells with None. Only values of existing keys are
            # reassigned, so the dict does not change size while iterating.
            for key, value in row.items():
                if not value:
                    row[key] = None
                    nullCount += 1

            # Convert the numeric columns; empty (None) cells are left as None.
            for column in ("accent", "morae"):
                if row[column]:
                    row[column] = int(row[column])

        print("Number of null values:", nullCount)
  

# Run the cleanup pass over the Kishimoto input file; prints the null count.
cleanKishimoto(kishimotoPath)


Number of null values: 27


### Clean NHK

In [23]:

def cleanNHK(csvFilePath):
    """Walk the NHK CSV row by row and print how many cells are empty.

    Empty cells are replaced with None in the row dict and counted, and any
    non-empty "accent" / "morae" cell is converted to int. Rows are discarded
    after processing; nothing is returned.

    Args:
        csvFilePath: Path of a UTF-8 encoded CSV file with a header row.

    Raises:
        KeyError: if the file has no "accent" or "morae" column.
        ValueError: if a non-empty "accent" or "morae" cell is not an integer.
    """
    missing = 0

    with open(csvFilePath, encoding='utf-8') as handle:
        for record in csv.DictReader(handle):
            # Collect the blank fields first, then null them out in one go.
            blanks = [field for field, cell in record.items() if not cell]
            missing += len(blanks)
            record.update(dict.fromkeys(blanks))

            # Numeric columns: convert whatever is not blank.
            for column in ("accent", "morae"):
                cell = record[column]
                if cell:
                    record[column] = int(cell)

        print("Number of null values:", missing)
  

# Run the cleanup pass over the NHK input file; prints the null count.
cleanNHK(nhkPath)



Number of null values: 189103


### Combine tables

In [None]:
import pandas as pd

# Input tables and output location. These paths use a Kaggle-style layout
# (`../input/...`, `/kaggle/working/...`).
kishimotoTable = r'../input/pitchdb/ja_pitch_accents(kichimoto) - ja_pitch_accents.csv'
nhkTable = r'../input/nhk-pitch-3/nhk.csv'
combinedTable = r'/kaggle/working/combined.csv'


# Load the data from the first table
df1 = pd.read_csv(kishimotoTable)

# Load the data from the second table
df2 = pd.read_csv(nhkTable)

# Select the desired columns from each table
df1_selected = df1[['id', 'word', 'kana', 'accent', 'morae']]
df2_selected = df2[['id', 'word', 'kana', 'accent', 'morae']]

# Stack the two tables (first table on top), keep the first occurrence of each
# (word, kana) pair, then renumber the rows. drop_duplicates keeps the
# pre-drop index labels, so without reset_index the ids generated below would
# have a gap wherever a duplicate was removed.
combined_df = (
    pd.concat([df1_selected, df2_selected], ignore_index=True)
    .drop_duplicates(subset=['word', 'kana'], keep='first')
    .reset_index(drop=True)
)

# Generate the 'id' column starting from 1 and incrementing by 1
combined_df['id'] = combined_df.index + 1

# Convert 'morae' column to integers
# NOTE(review): astype(int) raises if any 'morae' value is missing (NaN);
# confirm the inputs never leave that column empty.
combined_df['morae'] = combined_df['morae'].astype(int)

# Save the new table to a CSV file
combined_df.to_csv(combinedTable, index=False)


### Convert CSV to JSON

In [None]:
import csv 
import json


def csv_to_json(csvFilePath, jsonFilePath):
    """Convert a pitch-accent CSV file into a JSON array of row objects.

    Each CSV row becomes one JSON object keyed by the header row. The "image"
    column is left out when the file has one, empty cells become null, and
    non-empty "accent" and "morae" cells are written as integers. All other
    cells stay strings.

    Args:
        csvFilePath: Path of a UTF-8 encoded CSV file with a header row.
        jsonFilePath: Path of the JSON file to write (UTF-8, 4-space indent,
            non-ASCII characters written as-is). Overwritten if it exists.

    Returns:
        None. Prints "Number of null values: <count>" before writing the file.

    Raises:
        KeyError: if the file has no "accent" or "morae" column.
        ValueError: if a non-empty "accent" or "morae" cell is not an integer.
    """
    jsonArray = []
    nullCount = 0  # Counter for null values

    # newline='' leaves newline handling to the csv module, as its
    # documentation asks for file objects handed to a reader.
    with open(csvFilePath, encoding='utf-8', newline='') as csvf:
        # DictReader yields one dict per data row, keyed by the header row.
        for row in csv.DictReader(csvf):
            # Exclude the "image" column. pop() tolerates files that have no
            # such column, where `del row["image"]` would raise KeyError.
            row.pop("image", None)

            # Replace empty cells with None so they serialise as JSON null.
            # Only values of existing keys are reassigned, so the dict does
            # not change size while it is being iterated.
            for key, value in row.items():
                if not value:
                    row[key] = None
                    nullCount += 1

            # Convert the numeric columns; empty (None) cells are left as None.
            for column in ("accent", "morae"):
                if row[column]:
                    row[column] = int(row[column])

            jsonArray.append(row)
    print("Number of null values:", nullCount)

    # ensure_ascii=False keeps kana/kanji readable instead of \uXXXX escapes.
    with open(jsonFilePath, 'w', encoding='utf-8') as jsonf:
        jsonf.write(json.dumps(jsonArray, indent=4, ensure_ascii=False))


# Convert the Kishimoto CSV to JSON. These paths use a Kaggle-style layout
# (`../input/...`, `/kaggle/working/...`).
csvFilePath = r'../input/pitchdb/ja_pitch_accents(kichimoto) - ja_pitch_accents.csv'
jsonFilePath = r'/kaggle/working/output.json'
csv_to_json(csvFilePath, jsonFilePath)

