In [52]:
import re
import os
import pandas as pd
import numpy as np
import nltk

In [53]:
def divide_into_chunks(text, chunk_size=3):
    # Tokenize the text into sentences
    sentences = nltk.sent_tokenize(text)

    # Divide the sentences into chunks
    chunks = [sentences[i:i + chunk_size] for i in range(0, len(sentences), chunk_size)]

    return chunks

In [54]:
file_names = [f for f in os.listdir("E:\\BSCS 4-3\\Thesises\\genective\\corpus")]

samples = []
for name in file_names[:-2]:
    # open corpus file
    with open('./corpus/' + name, 'r', encoding="utf8") as file:
        corpus = file.read()
        
    # split into sentences
    document_chunks = divide_into_chunks(corpus)
    
    for chunk in document_chunks:
        sample = ' '.join(chunk)
        sample = ' '.join(filter(lambda s: 'http' not in s and 'www.' not in s, sample.split()))
        samples.append(sample)
        
samples = np.array(samples)
np.random.shuffle(samples)
    
len(samples)

3496

In [55]:
gen_df = pd.DataFrame(samples, columns=['corpus'])
gen_df

Unnamed: 0,corpus
0,risk factors for foot disease in people with d...
1,it is the duty of every health care profession...
2,sometimes your body doesn’t make enough insuli...
3,whereas monomers of insulin readily diffuse th...
4,you know your blood glucose levels over a long...
...,...
3491,"the priority assigned to genetic services, wit..."
3492,tcf1 and tcf2 share similar domains; they have...
3493,"but you can manage it with lifestyle changes, ..."
3494,genetic determinants of type 2 diabetes: hyper...


In [56]:
gen_df.to_csv('./datasets/genetics.csv', index=False)

In [57]:
med_df = pd.read_csv('./corpus/medquad.csv')
med_df = med_df.drop(columns=['source', 'focus_area'])

med_df['corpus'] = med_df['question'] + ' ' + med_df['answer']
med_df = med_df.drop(columns=['question', 'answer'])

med_df['corpus'] = med_df['corpus'].astype(str)
med_df['corpus'] = med_df['corpus'].apply(lambda x: ' '.join(filter(lambda s: 'http' not in s and 'www.' not in s, x.split())))

med_df

Unnamed: 0,corpus
0,What is (are) Glaucoma ? Glaucoma is a group o...
1,What causes Glaucoma ? Nearly 2.7 million peop...
2,What are the symptoms of Glaucoma ? Symptoms o...
3,What are the treatments for Glaucoma ? Althoug...
4,What is (are) Glaucoma ? Glaucoma is a group o...
...,...
16407,What is (are) Diabetic Neuropathies: The Nerve...
16408,How to prevent Diabetic Neuropathies: The Nerv...
16409,How to diagnose Diabetic Neuropathies: The Ner...
16410,What are the treatments for Diabetic Neuropath...


In [58]:
med_df.to_csv('./datasets/general.csv', index=False)

In [59]:
df = pd.concat([gen_df, med_df])
df = df.sample(frac=1, random_state=64)
df

Unnamed: 0,corpus
8091,How to diagnose Parasites - Lice - Head Lice ?...
4152,What are the symptoms of Doyne honeycomb retin...
7564,What causes Pyruvate kinase deficiency ? What ...
13878,Is Mainzer-Saldino syndrome inherited ? This c...
14388,Is Cornelia de Lange syndrome inherited ? When...
...,...
4491,How to diagnose Congenital adrenal hyperplasia...
5705,How to diagnose Glutaric acidemia type I ? Is ...
12942,What are the genetic changes related to recomb...
3238,in order to tackle the burden of diabetes effe...


In [60]:
df.to_csv('./datasets/dataset.csv', index=False)

In [61]:
with open('./corpus/QA-Pairs.txt', 'r', encoding='utf-8') as file:
    test_samples = [line for line in filter(lambda s: len(s.split()) > 3, file.readlines())]

questions = []
answers = []
for line in test_samples:
    line = re.sub(r'\n', '', line)
    line = ' '.join(line.split())
    
    if 'Q:' in line:
        questions.append(re.sub(r'Q:', '', line).strip())
    elif 'A:' in line:
        answers.append(re.sub(r'A:', '', line).strip())

print(len(test_samples))
print(len(answers))
print(len(questions))

264
132
132


In [62]:
test_samples = []
for i in range(len(questions)):
    test_samples.append(questions[i] + ' ' + answers[i])
    
test_df = pd.DataFrame(np.array(test_samples), columns=['corpus'])
test_df

Unnamed: 0,corpus
0,What are some natural ways to manage diabetes ...
1,How might aloe vera assist in managing diabete...
2,What properties does bitter gourd possess that...
3,What role does fenugreek play in managing diab...
4,What is the significance of gymnema in dealing...
...,...
127,Could you explain the chronic complications sp...
128,How can individuals prevent diabetes? Diabetes...
129,Why is financial preparation crucial in managi...
130,What symptoms might indicate early signs of di...


In [63]:
test_df.to_csv('./datasets/test.csv', index=False)