In [1]:
import fasttext
import pandas as pd
import re
import random
import os

from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
import shutil

In [6]:
def createDirec(directory: str):
    """Create ``directory`` (and any missing parents) if it is not there yet.

    Prints ``"Success"`` when the directory exists after the call and
    ``"fail"`` when the operating system refuses to create it (for example
    because of missing permissions, or because the path already exists as a
    regular file). OS-level errors are reported, not raised.

    Args:
        directory: Path of the directory to create.
    """
    try:
        # exist_ok=True makes the call idempotent and avoids the
        # check-then-create race of a separate os.path.exists() test.
        os.makedirs(directory, exist_ok=True)
        print("Success")
    except OSError:
        # Only filesystem errors are expected here; programming errors
        # (wrong argument type, KeyboardInterrupt, ...) must not be hidden.
        print("fail")


def myOversampling(fileName: str):
    """Balance the ``Intent`` classes of ``<fileName>.csv`` by random oversampling.

    Reads ``<fileName>.csv``, resamples it with ``RandomOverSampler`` using
    ``Intent`` as the class label, writes the result to
    ``output/<fileName>.csv`` and returns it.

    Args:
        fileName: Path of the input CSV without the ``.csv`` extension. The
            same value is used as the file name inside ``output/``.

    Returns:
        The resampled table: all original feature columns followed by
        ``Intent``.
    """
    table = pd.read_csv("{}.csv".format(fileName))

    y = table["Intent"]
    X = table.drop("Intent", axis=1)

    # fit_resample() fits the sampler itself, so a separate fit() call
    # beforehand only repeats the same work.
    ros = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = ros.fit_resample(X, y)

    # assign() builds a new frame instead of mutating the sampler's output.
    newTable = X_resampled.assign(Intent=y_resampled)

    createDirec(directory="output")
    newTable.to_csv("output/{}.csv".format(fileName), index=False)
    return newTable


def convertToTxt(filename: str):
    """Write the first CSV column of ``all_data/<filename>.csv`` as plain text.

    The header row is dropped and, for every remaining line, everything up
    to the first comma is written on its own line to
    ``raw_data/<filename>.txt``. The first data row and its extracted value
    are printed as a quick visual check.

    Note: lines are split on the first literal comma, so a quoted first
    field that itself contains a comma is truncated at that comma.

    Args:
        filename: Base name of the CSV (without extension) inside
            ``all_data/``; reused as the base name of the output file.
    """
    source_path = "all_data/{}.csv".format(filename)
    target_path = "raw_data/{}.txt".format(filename)

    createDirec("raw_data")

    with open(source_path, mode="r", encoding="utf8") as src:
        csv_lines = src.readlines()

    # Skip the header; strip the line break first so that a line without any
    # comma does not end up with a doubled newline.
    first_column = [
        line.rstrip("\n").split(",")[0] + "\n" for line in csv_lines[1:]
    ]

    with open(target_path, mode="w", encoding="utf8") as dst:
        dst.writelines(first_column)

    # Visual sanity check; skipped when the CSV holds nothing but a header.
    if first_column:
        print(csv_lines[1])
        print(first_column[0])

# def augmentCharacterLevel():


## Text Augmentation

In [12]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw

# https://towardsdatascience.com/text-augmentation-in-few-lines-of-python-code-cdd10cf3cf84

# Segmented Khmer sample sentence used to compare the nlpaug augmenters.
text = "ប្រាប់ ពី តម្លៃ សិក្សា របស់ សាលា CADT"

# Character level: OCR-style substitutions.
aug = nac.OcrAug()
augmentText = aug.augment(text)
print(text)
print(augmentText)

# Character level: keyboard-neighbour typos.
print("-------------------------")
print("---------Keyboard--------")
print("-------------------------")
aug = nac.KeyboardAug()
augmentText = aug.augment(text)
print(augmentText)
# Explicit UTF-8: the sample is Khmer and the platform default encoding may
# not be able to represent it.
with open("test2.txt", mode="w", encoding="utf8") as f:
    f.writelines(augmentText)

# Character level: random character insertion (this banner used to be a
# copy of the "Keyboard" one).
print("-------------------------")
print("------Random Insert------")
print("-------------------------")
aug = nac.RandomCharAug(action="insert")
augmentText = aug.augment(text)
print(augmentText)

# Word level: spelling mistakes, three variants.
print("-------------------------")
print("------Word Augmenter-----")
print("-------------------------")

aug = naw.SpellingAug()
augmentText = aug.augment(text, n=3)
print(augmentText)

ប្រាប់ ពី តម្លៃ សិក្សា របស់ សាលា CADT
['ប ្ រ ា ប ់ ព ី តម ្ ល ៃ ស ិ ក ្ ស ា របស ់ ស ា ល ា GAOT']
-------------------------
---------Keyboard--------
-------------------------
['ប ្ រ ា ប ់ ព ី តម ្ ល ៃ ស ិ ក ្ ស ា របស ់ ស ា ល ា vXDT']
-------------------------
---------Keyboard--------
-------------------------
['ប ្ រ ា ប ់ ព ី តម ្ ល ៃ ស ិ ក ្ ស ា របស ់ ស ា ល ា iCUADT']
-------------------------
---------Keyboard--------
-------------------------


In [1]:
from textattack.augmentation import CharSwapAugmenter

# Try TextAttack's character-swap augmenter on the same Khmer sample sentence.
text = "ប្រាប់ ពី តម្លៃ សិក្សា របស់ សាលា CADT"
swapper = CharSwapAugmenter()
swapped_variants = swapper.augment(text)
print(swapped_variants)

textattack: Updating TextAttack package dependencies.
textattack: Downloading NLTK required packages.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/icyfrost/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/icyfrost/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw to /home/icyfrost/nltk_data...
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/icyfrost/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
[nltk_data] Downloading package wordnet to /home/icyfrost/nltk_data...
[nltk_data] Downloading package punkt to /home/icyfrost/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['ប្រាប់ ពី តម្លៃ សិក្សា របស់ សាលា CDT']


## Remove stop words such as តើ

In [2]:
# Filler words stripped from every question. The text is pre-segmented, so
# each stop word is a space-separated token.
STOP_WORDS = [
    "តើ",
    "មាន",
    # "ប៉ុន្មាន",  # currently disabled
    "អាច",
    "ថា",
]

# Match a stop word only when it starts a token (start of string or preceded
# by whitespace) and is followed by a space. Plain substring removal would
# also eat the tail of longer tokens, e.g. "មាន " inside "ប៉ុន្មាន ".
STOP_WORD_PATTERN = r"(?<!\S)(?:" + "|".join(re.escape(w) for w in STOP_WORDS) + r") "

df = pd.read_csv("all_data/all_segment_dataset.csv", encoding="UTF-8")
# .str.replace is vectorised and leaves missing questions as NaN instead of
# raising on them.
df["Question"] = df["Question"].str.replace(STOP_WORD_PATTERN, "", regex=True)

# Make sure the target folder exists before writing.
os.makedirs("preprocess", exist_ok=True)
df.to_csv("preprocess/remove_stop_word_segment.csv", index=False)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'all_data/all_segment_dataset.csv'

## Limit each intent to at most 200 sentences

In [None]:
# Upper bound on the number of sentences kept per intent, and the seed that
# makes the random selection reproducible.
MAX_PER_INTENT = 200
SAMPLE_SEED = 42

df = pd.read_csv("preprocess/remove_stop_word_segment.csv", encoding="UTF-8")

capped_intents = []  # intents that exceeded the limit
capped_samples = []  # their randomly selected MAX_PER_INTENT rows
for intent in df["Intent"].unique().tolist():
    intent_rows = df[df["Intent"] == intent]
    if intent_rows["Question"].count() > MAX_PER_INTENT:
        capped_intents.append(intent)
        # sample() returns a new frame; the result has to be kept, otherwise
        # the first MAX_PER_INTENT rows are taken unshuffled.
        capped_samples.append(
            intent_rows.sample(n=MAX_PER_INTENT, random_state=SAMPLE_SEED)
        )

# Untouched intents keep their original order; the down-sampled intents are
# appended after them. pd.concat replaces DataFrame.append, which pandas 2.0
# removed.
modifyDf = pd.concat(
    [df[~df["Intent"].isin(capped_intents)], *capped_samples],
    ignore_index=True,
)

# Make sure the target folder exists before writing.
os.makedirs("original_data", exist_ok=True)
modifyDf.to_csv("original_data/limit_200_segment_sentences.csv", encoding="UTF-8", index=False)
modifyDf.groupby("Intent").count()

## Convert CSV Format to Txt Format

In [96]:
# Read the capped dataset from the folder the limiting step writes it to
# (the bare file name only worked with a manually copied file).
df = pd.read_csv("original_data/limit_200_segment_sentences.csv", encoding="UTF-8")
oldIntentList = df.Intent.to_list()
questionList = df.Question.to_list()

# fastText labels must not contain whitespace, so every whitespace character
# in an intent name becomes "-". Raw string: "\s" is not a valid escape in a
# normal string literal.
newIntentList = [re.sub(r"\s", "-", x) for x in oldIntentList]

# One "__label__<intent> <question>" line per sample.
os.makedirs("data", exist_ok=True)
with open("data/data_file.txt", mode="w", encoding="UTF-8") as f:
    for intent, question in zip(newIntentList, questionList):
        f.write("__label__{} {}\n".format(intent, question))

## Split data into train and test files

In [97]:
# Share of the samples held out for testing, and the seed that makes the
# shuffle (and therefore the split) reproducible.
TEST_FRACTION = 0.2
SPLIT_SEED = 42

with open("data/data_file.txt", mode="r", encoding="UTF-8") as f:
    lines = [x.strip() for x in f.readlines()]

# A private Random instance keeps the global random state untouched.
random.Random(SPLIT_SEED).shuffle(lines)

# 20% testing, 80% training. One shared cut point guarantees that every
# sample lands in exactly one of the two files (the previous 0.21 start for
# the training slice dropped 1% of the data).
split_at = int(len(lines) * TEST_FRACTION)
test = lines[:split_at]
train = lines[split_at:]
assert len(train) + len(test) == len(lines)

with open("data/train_file.txt", mode="w", encoding="UTF-8") as f:
    f.writelines(x + "\n" for x in train)

# Lower-case "data/": the evaluation step reads data/test_file.txt, and
# "Data/" is a different folder on case-sensitive file systems.
with open("data/test_file.txt", mode="w", encoding="UTF-8") as f:
    f.writelines(x + "\n" for x in test)

In [98]:
# Train a supervised fastText classifier on the training split; no
# hyper-parameters are passed, so the library defaults apply.
model = fasttext.train_supervised('data/train_file.txt')

In [99]:
# Evaluate on the held-out split. Per the fastText Python docs the result is
# a (number of samples, precision, recall) tuple.
model.test("data/test_file.txt")

(2120, 0.9433962264150944, 0.9433962264150944)

In [105]:
# Spot-check a single segmented question; the result pairs the predicted
# label(s) with their probabilities.
model.predict("សាលា ស៊ីអេឌីធី ណា?")
# model.predict("បន្ទប់ HR នៅ ណា ទៅ ?")

(('__label__AskAboutAcademicFee',), array([0.74408054]))

In [78]:
# Persist the trained model to bin1.bin in the working directory.
model.save_model("bin1.bin")

In [79]:
# Reload the saved model from disk, replacing the in-memory `model`.
model = fasttext.load_model("bin1.bin")



In [82]:
# Spot-check the reloaded model.
# NOTE(review): this query still contains "តើ", which the stop-word step
# strips from the training questions - confirm whether inference inputs
# should receive the same cleaning.
model.predict("តើ ម៉ោង ប៉ុន្មាន ហើយ?")

(('__label__FindECInfo',), array([0.38426891]))