In [79]:
import re

import pandas as pd
import numpy as np
import os


## Concatenate the Datasets into One File

In [80]:
def myConcat(direcToScan: str, outputFilename: str, outputDirec: str = "all_data"):
    """Concatenate every CSV file in a directory into one CSV file.

    Args:
        direcToScan: Directory whose ``*.csv`` files are read.
        outputFilename: Name of the combined file, without the ``.csv`` extension.
        outputDirec: Directory the combined file is written to; created if missing.

    Raises:
        FileNotFoundError: If ``direcToScan`` does not exist.
        ValueError: If ``direcToScan`` contains no CSV files.
    """
    # os.listdir() returns entries in arbitrary order and also lists
    # sub-directories (e.g. Jupyter's ".ipynb_checkpoints") and non-CSV files,
    # so keep only regular *.csv files and sort them for a reproducible row order.
    files = sorted(
        f for f in os.listdir(direcToScan)
        if f.lower().endswith(".csv") and os.path.isfile(os.path.join(direcToScan, f))
    )
    if not files:
        raise ValueError("No CSV files found in '{}'".format(direcToScan))

    datasets = pd.concat(
        [pd.read_csv(os.path.join(direcToScan, f)) for f in files],
        axis=0,
        ignore_index=True,
    )

    # Create the target directory on first use instead of failing in to_csv().
    if outputDirec:
        os.makedirs(outputDirec, exist_ok=True)
    datasets.to_csv(os.path.join(outputDirec, "{}.csv".format(outputFilename)), index=False)

In [81]:
## Build the two combined CSV files: one from the intent datasets,
## one from the entity-tag datasets (both land in the default "all_data" directory)
for sourceDirec, combinedName in (
    ("datasets", "all_data_dataset"),
    ("entity-tag", "all_entity_dataset"),
):
    myConcat(direcToScan=sourceDirec, outputFilename=combinedName)

## Count the Sentences per Intent and in Total

In [82]:
## Load the combined question/intent dataset (the file written by the
## myConcat call for "datasets") and preview its first rows
data = pd.read_csv("all_data/all_data_dataset.csv")
data.head()

Unnamed: 0,Question,Intent
0,តើCADTបានប្រកាសថាមានការប្រកួតប្រជែងអ្វីខ្លះ?,AskAboutCompetition
1,តើCADTបានប្រកាសថាមានCompetitionអ្វីខ្លះ?,AskAboutCompetition
2,តើCADTបានបង្កើតនូវកម្មវិធីការប្រកួតប្រជែងអ្វីខ...,AskAboutCompetition
3,តើCADTបានបង្កើតនូវកម្មវិធីCompetitionអ្វីខ្លះ?,AskAboutCompetition
4,តើCADTបានhostនូវកម្មវិធីការប្រកួតប្រជែងអ្វីខ្លះ?,AskAboutCompetition


In [83]:
# Number of non-null values per intent; grouping by the column label keeps
# "Intent" as the index and out of the counted columns
data.groupby("Intent").count()

Unnamed: 0_level_0,Question
Intent,Unnamed: 1_level_1
AskAboutCompetition,144
AskAboutSchoolMajors,852
AskWhereHRRoomIs,2520
FindCurrentTime,1770
FindTotalStudents,504
ListParkingSpace,81


In [84]:
# Total number of non-null questions across all intents
data["Question"].count()

5871

## Count the Entities and Check for Errors

In [85]:
# Combined entity-tag dataset (the file written by the myConcat call for "entity-tag");
# its "Question" column is scanned for "/B-<code>" markers in a later cell
entityDataFrame = pd.read_csv("all_data/all_entity_dataset.csv")
# Named-entity reference table; its "NE Code" column supplies the codes to count
entityFrame = pd.read_csv("Named Entity.csv")

In [86]:
# Inspect the columns, dtypes and non-null counts of the named-entity table
entityFrame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    15 non-null     int64 
 1   Named Entity  15 non-null     object
 2   NE Code       15 non-null     object
 3   Remarks       14 non-null     object
dtypes: int64(1), object(3)
memory usage: 608.0+ bytes


In [87]:
# Strip stray whitespace from the column names and write the cleaned table back.
# Every header is normalised (not just the one at position 1) so the cell stays
# correct even if the column layout of the CSV changes.
entityFrame = entityFrame.rename(
    columns=lambda name: name.strip() if isinstance(name, str) else name
)
# index=False is essential: without it every run writes the RangeIndex as an
# extra unnamed column, which is read back as "Unnamed: 0" and shifts all other
# columns one position to the right, making this cell non-idempotent.
# NOTE(review): the file already contains an "Unnamed: 0" column, presumably left
# over from an earlier run without index=False - confirm before dropping it.
entityFrame.to_csv("Named Entity.csv", index=False)

In [90]:
def _countBeginTags(sentences, tags):
    """Count how often each known entity code opens an entity ("/B-<code>").

    Args:
        sentences: Iterable of whitespace-tokenised, tagged sentences. Entries
            that are not strings (e.g. NaN from empty CSV cells) are skipped.
        tags: Iterable of entity codes to report.

    Returns:
        dict mapping code -> number of tokens carrying "/B-<code>", in the order
        the codes appear in ``tags``. Codes that never occur are omitted.
    """
    beginTag = re.compile(r"/B-(\w+)")
    occurrences = {}
    # Single pass over the corpus: each token is attributed to exactly the code
    # it carries, so a code that is a prefix of another (e.g. "PER" / "PERSON")
    # or a token with trailing punctuation can no longer distort the counts.
    for sentence in sentences:
        if not isinstance(sentence, str):
            continue
        for token in sentence.split():
            found = beginTag.findall(token)
            if found:
                # One count per token, keyed by the last "/B-" marker in it.
                occurrences[found[-1]] = occurrences.get(found[-1], 0) + 1

    wanted = [str(tag).strip() for tag in tags]
    return {tag: occurrences[tag] for tag in wanted if tag in occurrences}


entityTag = entityFrame["NE Code"].to_list()
entitySentences = entityDataFrame["Question"].to_list()
myDict = _countBeginTags(entitySentences, entityTag)

# NOTE(review): the first column holds entity codes, not intents; the "Intent"
# label is kept unchanged in case other cells refer to it by name.
entityAmount = pd.DataFrame(list(myDict.items()), columns=["Intent", "Amount"])
entityAmount

Unnamed: 0,Intent,Amount
0,ROOM,2520
1,BUILD,48
2,PER,3476
3,GPE,90
4,ORG,2130


In [91]:
## Calculate each entity tag's share of all counted tags

# Read the counts from entityAmount itself rather than from a parallel dict, so
# every percentage is guaranteed to sit on the row it was computed from.
amounts = entityAmount["Amount"].to_list()
totalIntentTag = sum(amounts)
rise = ["{0:.2f}%".format(amount * 100 / totalIntentTag) for amount in amounts]

entityAmount["Percentage"] = rise
entityAmount

Unnamed: 0,Intent,Amount,Percentage
0,ROOM,2520,30.49%
1,BUILD,48,0.58%
2,PER,3476,42.06%
3,GPE,90,1.09%
4,ORG,2130,25.77%
