In [2]:
from ipynb.fs.full.vocab import *
from ipynb.fs.full.batch import *
import re
import pandas as pd
import ast

In [3]:
# Thanks to https://stackoverflow.com/questions/43018030/replace-apostrophe-short-words-in-python
# Expand most apostrophe contractions into their full form.
def decontracted(phrase):
    """Expand English contractions in ``phrase`` and return the result.

    The typographic apostrophe is first normalised to a plain ``'`` and every
    ``&`` is replaced by ``and``. Whole-word special cases are then applied
    before the generic suffix rules, because the generic rules would mangle
    them otherwise (``won't`` would become ``wo not``).

    Matching is case-sensitive and every pattern is lowercase, so callers
    should lowercase the text first.

    Known limitation: the generic ``'s`` rule cannot tell a possessive from a
    contracted "is", so ``john's`` becomes ``john is``.

    Args:
        phrase: The text to expand.

    Returns:
        The text with the recognised contractions written out in full.
    """
    # Order matters: rules run top to bottom, specific forms before generic ones.
    replacements = (
        # specific
        (r"’", "'"),
        (r"&", "and"),
        (r"won't", "will not"),
        (r"can't", "can not"),
        (r"ain't", "am not"),
        (r"how'd", "how did"),
        (r"let's", "let us"),
        # The contraction of "madam" is "ma'am"; the historical misspelling
        # "ma'ma" is still accepted so previously handled input is unchanged.
        (r"ma'(?:am|ma)", "madam"),
        (r"o'clock", "of the clock"),
        (r"sha'n't", "shall not"),
        (r"shan't", "shall not"),
        (r"y'all", "you all"),
        # general
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
    for pattern, expansion in replacements:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

In [4]:
# Read the lemma dictionary (base English words and their different forms)
# that convert_to_base_word uses for its lookups.
# The `with` block closes the file even if reading raises.
with open("lemma_dict.txt", "r") as file:
    contents = file.read()
# literal_eval only evaluates Python literals, so the file cannot run code.
dictionary = ast.literal_eval(contents)

In [40]:
# Convert inflected words such as eaten, eats, eating into the base word eat.
def convert_to_base_word(sentence):
    """Replace each space-separated word of ``sentence`` with its entry in
    the module-level ``dictionary``; words without an entry are kept as-is.

    Returns the words joined back together with single spaces.
    """
    base_words = []
    for token in sentence.split(' '):
        # Fall back to the word itself when the dictionary has no entry for it.
        base_words.append(dictionary.get(token, token))
    return ' '.join(base_words)
# Remove all English letters, ASCII digits and Bangla digits from the given
# Bangla content, then strip its punctuation.
def normalize_string_bangla(sentence):
    """Return ``sentence`` without Latin letters, digits (ASCII or Bangla)
    and the punctuation handled by ``punctuation_remove``."""
    without_latin_or_digits = re.sub(r"[a-zA-Z0-9০-৯]+", "", sentence)
    return punctuation_remove(without_latin_or_digits)
# Remove most punctuation from the given sentence.
def punctuation_remove(sentence):
    """Replace each run of punctuation with a space, squeeze repeated spaces
    into one, and trim surrounding whitespace.

    The characters treated as punctuation are exactly those listed in the
    character class below (it includes the non-breaking space and the Bangla
    danda).
    """
    punctuation_run = re.compile(r'[\xa0।_.?!,|:\/"”“’‘$)(-]+')
    space_run = re.compile(' +')
    spaced_out = punctuation_run.sub(" ", sentence)
    return space_run.sub(' ', spaced_out).strip()
# Helper that normalises English content; it also removes ASCII digits.
def normalize_string_english(sentence):
    """Normalise an English sentence.

    Steps, in order: expand contractions, delete digits and any remaining
    apostrophes, strip punctuation, then map words to their base form.
    """
    expanded = decontracted(sentence)
    without_digits_or_quotes = re.sub(r"[0-9']+", "", expanded)
    return convert_to_base_word(punctuation_remove(without_digits_or_quotes))

In [41]:
# Read the whole corpus into a list of lines; the `with` block closes the
# file handle, which the bare open(...).read() call left open.
with open('data_news.txt', encoding = 'utf-8') as corpus_file:
    lines = corpus_file.read().strip().split('\n')

In [42]:
# Split every line on tabs: the first field is the English text and the
# second is the Bangla text. Any further fields are ignored.
fields_per_line = [row.split('\t') for row in lines]
en = [fields[0] for fields in fields_per_line]
bn = [fields[1] for fields in fields_per_line]

In [43]:
# Normalise both sides of every pair in place; English is lowercased first.
for idx, (en_text, bn_text) in enumerate(zip(en, bn)):
    en[idx] = normalize_string_english(en_text.lower().strip())
    bn[idx] = normalize_string_bangla(bn_text.strip())

In [44]:
# One row per sentence pair: English text in 'en', Bangla text in 'bn'.
sentence_pairs = list(zip(en, bn))
preprocess_data = pd.DataFrame(sentence_pairs, columns = ['en', 'bn'])

In [45]:
# Display the preprocessed English/Bangla pairs (pandas abbreviates the middle rows).
preprocess_data

Unnamed: 0,en,bn
0,twenty three accuse are currently abscond the ...,গত অক্টোবর শুরু হওয়ার পর প্রায় আড়াই মাসে কার্য...
1,donald trump reuters file photo they give safe...,ডোনাল্ড ট্রাম্প ছবি রয়টার্স আফগানিস্তানে মার্ক...
2,the dead are year old shantona akter and her d...,সবুজবাগ থানার পরিদর্শক তদন্ত মোস্তাফিজুর রহমান...
3,trump deny use the word to describe haiti and ...,এমাসের শুরু দিকে ওভাল অফিসে অভিবাসন নীতি নিয়ে ...
4,the settlement is one the large payment make b...,রয়টার্সের এক প্রতিবেদনে বলা হয় ইন্ডাস্ট্রিয়াল ...
...,...,...
1498,the year old minor file a case against her ste...,ভয় দেখিয়ে ছয় মাস ধরে ধর্ষণ করা হয়েছে অভিযোগ কর...
1499,it seem that the condition for register new pa...,নির্বাচন কমিশন ভবন নির্বাচন কমিশন সচিব হেলালুদ...
1500,the accident take place on the sonapur alexand...,সুধারামমডেল থানার ওসি মো আনোয়ার হোসেন জানান রো...
1501,police have recover yaba tablet roll of heroin...,বিডিনিউজ টোয়েন্টিফোর ডটকমকে তিনি বলেন তাদের কা...


In [46]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the sentence pairs for testing; the fixed seed keeps the
# split reproducible across runs.
HELD_OUT_FRACTION = 0.1
SPLIT_SEED = 1
train, test = train_test_split(
    preprocess_data,
    test_size = HELD_OUT_FRACTION,
    random_state = SPLIT_SEED,
)

In [47]:
# Write the training pairs as tab-separated text, with no index column and no header row.
train.to_csv(
    'train_news.txt',
    sep = '\t',
    header = False,
    index = False,
)

In [48]:
# Write the held-out pairs as tab-separated text, with no index column and no header row.
test.to_csv(
    'test_news.txt',
    sep = '\t',
    header = False,
    index = False,
)