<a href="https://colab.research.google.com/github/dragonsan17/faq_retrieval_deep_learning/blob/main/data_concatenation_augmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Concatenation and Augmentation Script

This script first concatenates the required datasets (or works on a single dataset, as specified in the list) and then generates similar sentences using two approaches: the first uses the iNLTK library, which provides an API call for generating similar sentences, and the second substitutes manually curated synonyms for key words in our dataset. The augmented data is then stored in the data directory, from where it can be used in any of the workflow scripts. If augmentation is not needed, save the train file right after concatenation.

## Import Libraries

In [None]:
!pip install torch==1.3.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
!pip install inltk

from inltk.inltk import get_similar_sentences
from tqdm import tqdm
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore') 

## Data Pre-processing (concatenation, if required)

In [None]:
train_set = ['jee', 'kab', 'paj'] # Put all data here that has to be concatenated
test_set = ['jee'] # Specify the test set here
train_dfs = []
test_dfs = []

DATA_DIR = './data/'  # the per-dataset CSVs below are read from this directory


def _first_match(df_all_data, query, value_col, query_cols):
    """Return ``value_col`` from the first row of ``df_all_data`` matching ``query``.

    The columns listed in ``query_cols`` are searched in order, so a match in
    an earlier column takes precedence over a match in a later one.

    Args:
        df_all_data: DataFrame holding the full dataset for one source.
        query: value looked up (compared with ``==``) in each query column.
        value_col: name of the column whose value is returned.
        query_cols: ordered list of column names to search for ``query``.

    Raises:
        IndexError: if ``query`` is not present in any of ``query_cols``.
    """
    for col in query_cols:
        hits = df_all_data.loc[df_all_data[col] == query, value_col]
        if len(hits) > 0:
            return hits.iloc[0]
    raise IndexError(
        'No row matches query {!r} in columns {}'.format(query, query_cols)
    )


for data in train_set:

    df_all_data = pd.read_csv(DATA_DIR + data + '_all_data.csv', encoding = 'utf-8')
    # The train path previously lacked the '/' after 'data' and therefore
    # pointed at './data<name>_train.csv' instead of './data/<name>_train.csv'.
    df_train = pd.read_csv(DATA_DIR + data + '_train.csv', encoding = 'utf-8')
    if data == 'paj':
        # For 'paj' the answer text is taken from 'Answer Transcription'.
        df_all_data['Relevant Point'] = df_all_data['Answer Transcription']

    # For 'kab' only the caller transcription column is consulted; the other
    # datasets try 'STT Transcript' first and fall back to the caller column.
    if data != 'kab':
        query_cols = ['STT Transcript', 'Caller query transcription']
    else:
        query_cols = ['Caller query transcription']

    queries = list(df_train['q2'])
    df_train['a2'] = [_first_match(df_all_data, q, 'Relevant Point', query_cols) for q in queries]
    df_train['q2_r'] = [_first_match(df_all_data, q, 'Relevant Topic', query_cols) for q in queries]

    train_dfs.append(df_train)

# Stack the per-dataset frames and renumber rows 0..n-1.
df_train = pd.concat(train_dfs)
df_train = df_train.reset_index(drop=True)

In [None]:
"""
  Uncomment and save if only concatenated data is needed.
"""
# df_train.to_csv('./data/concat_train.csv', index = False)

In [None]:
# Load the first sheet (sheet_name=0) of the synonyms workbook. Later cells read
# its 'Word' and 'Synonyms' columns; 'Synonyms' is split on commas in main().
df_syn = pd.read_excel('./data/synonyms.xlsx', sheet_name=0)

## Similar Sentences Generation

In [None]:
def main(sent, synonyms, word = None, verbose = False):
    """Generate augmented variants of a sentence.

    Two sources of variants are combined, in this order:

    1. If ``word`` is given, one variant per synonym, produced by replacing
       every occurrence of ``word`` in ``sent`` with that synonym.
    2. The sentences returned by iNLTK's ``get_similar_sentences`` for ``sent``
       (2 variants, language code 'hi', augmentation degree 0.3).

    Args:
        sent: the sentence to augment.
        synonyms: comma-separated synonyms for ``word``. Non-string values
            (e.g. NaN from an empty spreadsheet cell) are treated as "no
            synonyms" instead of raising.
        word: the key word to substitute; when None, only the iNLTK variants
            are produced and ``sent`` is passed through unstripped.
        verbose: when True, print the synonyms and the resulting sentences
            (debug output; off by default so loops do not flood the notebook).

    Returns:
        list of augmented sentences (may be empty if iNLTK returns nothing).
    """
    out_sents = []

    if word is not None:
        sent = sent.strip()
        if verbose:
            print(synonyms)
        # An empty spreadsheet cell is read as NaN (a float) and has no .split().
        if isinstance(synonyms, str):
            for syn in synonyms.split(','):
                syn = syn.strip()
                # Skip blank entries (e.g. from a trailing comma): substituting
                # an empty string would silently delete the key word.
                if syn:
                    out_sents.append(sent.replace(word, syn))

    syn_sents = get_similar_sentences(sent, 2, 'hi', 0.3)
    out_sents.extend(syn_sents)
    if verbose:
        print(out_sents)
    return out_sents

In [None]:
# Run only once, to setup functions for Hindi in iNLTK.
# 'hi' is the same language code that main() passes to get_similar_sentences.
# NOTE(review): presumably this fetches the Hindi language resources; confirm
# against the iNLTK documentation before relying on it offline.
from inltk.inltk import setup
setup('hi')

In [None]:
import random

# Output columns of the augmented training set, filled row by row below.
q1 = []
q2 = []
label = []
a2 = []
q2_r = []

N_SAMPLES_PER_ROW = 5  # number of augmented rows emitted per source row

# Pair every key word with the synonyms from its OWN spreadsheet row. The
# previous lookup used a regex `str.contains(word)` and took the first hit,
# which could return another row's synonyms when one key word is a substring
# of another, and could fail on regex metacharacters.
syn_pairs = list(zip(df_syn['Word'], df_syn['Synonyms']))


def _augment_field(text):
    """Return ``[text]`` followed by all variants generated for ``text``.

    For each key word from the synonyms sheet that occurs in ``text``, the
    sentences returned by ``main`` are appended. Non-string values (e.g. NaN)
    are returned as the sole candidate without augmentation.
    """
    candidates = [text]
    if not isinstance(text, str):
        return candidates
    for word, synonyms in syn_pairs:
        # Ignore blank/NaN key words: '' is "in" every string and NaN breaks `in`.
        if isinstance(word, str) and word and word in text:
            candidates.extend(main(text, synonyms, word))
    return candidates


print(len(df_train))
for i, r in df_train.iterrows():

    # Each field gets its own candidate list. Previously the q2, a2 and q2_r
    # variants were all appended to q1_list, so those fields were never
    # augmented and unrelated text could be sampled as q1.
    q1_list = _augment_field(r['q1'])
    q2_list = _augment_field(r['q2'])

    # Skip rows for which neither question produced any variant.
    if len(q1_list) == 1 and len(q2_list) == 1:
        continue

    a2_list = _augment_field(r['a2'])
    q2_r_list = _augment_field(r['q2_r'])

    # Emit several rows, each sampling one candidate per field independently;
    # the label is carried over unchanged.
    for j in range(N_SAMPLES_PER_ROW):
        q1.append(random.choice(q1_list))
        q2.append(random.choice(q2_list))
        label.append(r['label'])
        a2.append(random.choice(a2_list))
        q2_r.append(random.choice(q2_r_list))

## Saving Augmented data

In [None]:
# Assemble the augmented columns (in output order) into a single frame and
# write it to the data directory without the index column.
aug_columns = {
    'q1' : q1,
    'q2' : q2,
    'label' : label,
    'a2' : a2,
    'q2_r' : q2_r,
}
df_new_train = pd.DataFrame(aug_columns)

df_new_train.to_csv('./data/aug_train.csv', index = False)