# Data Processing

In [1]:
# Third-party data libraries.
import pandas as pd
import numpy as np
# Standard-library helpers.
import os
import re
# Project helpers. The star import presumably supplies MySQLPipline and
# modelProcessor used in the cells below -- TODO confirm and import by name.
# NOTE(review): a star import placed here can rebind pd/np/os/re if the
# module exports those names, so the statement order matters.
from tool_functions_copy import *

import warnings
import sys
# Suppresses every warning category for the rest of the session.
warnings.filterwarnings('ignore')  # To ignore all warnings that arise here to enhance clarity

# Data Extraction

In [2]:
# Fetch the four funder tables through MySQLPipline(database='funding').
funding = MySQLPipline(database='funding')
try:
    NIHdata = funding.NIHDataset()
    NSFdata = funding.NSFDataset()
    ERCdata = funding.ERCDataset()
    UKRIdata = funding.UKRIDataset()
finally:
    # Release the connection even when one of the fetches above raises.
    funding.close_Conn()

# Only the text columns are carried forward.
text_columns = ["title", "abstract"]
df1 = pd.concat([NIHdata[text_columns], ERCdata[text_columns]])
df2 = pd.concat([NSFdata[text_columns], UKRIdata[text_columns]])

# Resulting row order is NIH, ERC, NSF, UKRI. ignore_index=True gives the
# combined frame a fresh 0..n-1 index in the same step.
df = pd.concat([df1, df2], ignore_index=True)

# Data Cleaning

In [3]:
# Hand the combined title/abstract frame to the project's modelProcessor.
mp = modelProcessor(dat=df)

In [4]:
# Placeholder texts that stand in for a missing abstract.
# remove_abstract presumably drops the matching rows -- TODO confirm against
# tool_functions_copy; its return value gets a fresh index in place.
placeholder_patterns = (
    r'Abstracts are not currently available in GtR',
    r'No abstract available',
)
for pattern in placeholder_patterns:
    remaining = mp.remove_abstract(regex=pattern)
    remaining.reset_index(drop=True, inplace=True)

mp.clean_text()

# Keep only the rows that still have a title.
has_title = mp.data["title"].notna()
mp.data = mp.data[has_title]

This may take long time to run...
True
Finished! It takes 299.5411169528961 seconds to run.


Unnamed: 0,title,abstract
0,Novel Treatments for Ocular Surface Diseases,PROJECT DESCRIPTIONABSTRACT Dry eye disease DE...
1,SYNTHETIC GENE CIRCUITS FOR MONITORING T-CELL ...,Chimeric antigen receptor CAR Tcell therapy ha...
2,Training Program in Cancer Biology,Project Summary The Icahn School of Medicine a...
3,Understanding the influence of bone-metastatic...,While immunotherapies have made strides in the...
4,Modeling bladder cancer pathogenesis and tumor...,Project SummaryAbstract This new Program Proje...
...,...,...
233451,University of Northumbria at Newcastle and Smy...,To develop a set of Machine Learning tools tha...
233452,Bbsrc next generation bioprocessing studentshi...,Doctoral Training Partnerships a range of post...
233453,UK Involvement in LSST: Phase A,Summary We propose a programme of work to enab...
233454,University of Ulster and Rapid International L...,To design and develop the next generation conc...


In [None]:
# Write the cleaned frame held by the processor, not `df`: `df` is the raw
# concatenation built before cleaning, and the missing-title filter rebinds
# `mp.data` without ever touching `df`.
mp.data.to_csv("../Data/cleaned_data.csv", index = False, encoding = "utf-8-sig")

# Corpus Selection

In [1]:
# Import Packages
from imp import reload
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from tool_functions import *
import warnings
warnings.filterwarnings('ignore')  # To ignore all warnings that arise here to enhance clarity

In [2]:
# Restore `df_stemmed` from IPython's %store persistence.
# NOTE(review): the cell that stored it is not in this notebook, so a fresh
# kernel only works if that earlier step has already been run -- confirm.
%store -r df_stemmed

In [3]:
# Renumber the restored rows 0..n-1 in place, discarding the old index,
# then show the frame as the cell output.
df_stemmed.reset_index(inplace=True, drop=True)
df_stemmed

Unnamed: 0,title,abstract,institution,corpus,corpus_stemmed,corpusSet
0,Novel Treatments for Ocular Surface Diseases,PROJECT DESCRIPTION ABSTRACT Dry eye disease ...,NIH,"[dry, eye, disease, ded, ocular, surface, dise...","[dri, eye, diseas, ded, ocular, surfac, diseas...","{amelior, innov, relev, exposur, ophthalm, fou..."
1,SYNTHETIC GENE CIRCUITS FOR MONITORING T-CELL ...,Chimeric antigen receptor CAR T cell therapy...,NIH,"[chimeric, antigen, receptor, car, cell, thera...","[chimer, antigen, receptor, car, cell, therapi...","{synthet, lost, input, mentor, cellular, resou..."
2,Training Program in Cancer Biology,Project Summary The Icahn School of Medicine a...,NIH,"[icahn, school, medicine, mount, sinai, propos...","[icahn, school, medicin, mount, sinai, propos,...","{select, member, account, process, nci, mentor..."
3,Understanding the influence of bone-metastatic...,While immunotherapies have made strides in the...,NIH,"[immunotherapies, strides, treatment, cancers,...","[immunotherapi, stride, treatment, cancer, cas...","{factor, driven, focu, found, system, costimul..."
4,Modeling bladder cancer pathogenesis and tumor...,Project Summary Abstract This new Program Pr...,NIH,"[investigate, molecular, mechanisms, underlyin...","[investig, molecular, mechan, underli, pathoge...","{patholog, progress, histopatholog, gemm, gene..."
...,...,...,...,...,...,...
173607,Work of Words: Re-reading the poetry of Dylan ...,This project is the first to systematically ap...,UKRI,"[systematically, apply, modern, critical, theo...","[systemat, appli, modern, critic, theori, bodi...","{visual, war, pastor, form, rewrit, condit, fi..."
173608,Surface rupture in the 12 May 2008 Sichuan ear...,The May earthquake in Sichuan Province China ...,UKRI,"[earthquake, sichuan, province, china, magnitu...","[earthquak, sichuan, provinc, china, magnitud,...","{lost, orient, directli, infrastructur, eros, ..."
173609,Influences on negotiating clinical need &amp; ...,This study examines how patients and dentists ...,UKRI,"[patients, dentists, decide, pursue, treatment...","[patient, dentist, decid, pursu, treatment, in...","{patient, paid, north, treatment, costli, clin..."
173610,DiRAC2: 100 Tflop/s HPC cluster procurement,This award covers the capital cost of procurin...,UKRI,"[award, covers, capital, cost, procuring, tflo...","[award, cover, capit, cost, procur, tflop, hpc...","{cluster, procur, dirac, tflop, leicest, part,..."


In [4]:
# Wrap the stemmed frame in the project's modelProcessor for the word screening below.
mp3 = modelProcessor(dat=df_stemmed)

In [5]:
# Restore `all_corpus` from IPython's %store persistence.
# NOTE(review): presumably the flat collection of every token across all
# documents; it is produced outside this notebook -- confirm.
%store -r all_corpus

In [6]:
# Number of distinct entries in all_corpus.
vocabulary_size = len(set(all_corpus))
vocabulary_size

246360

In [7]:
# Tail slice of the distinct vocabulary that this notebook screens.
# sorted() pins the order: iterating a set of str is not stable across kernel
# sessions (string hash randomization), so slicing list(set(...)) by position
# would select a different subset after every restart.
# Assumes all entries are mutually comparable (e.g. all str) -- TODO confirm.
# NOTE(review): any sibling slices of the vocabulary must use the same sorted
# order, and the preceding slice must cover index 160000 or that word is skipped.
unique_corpus3 = sorted(set(all_corpus))[160001:]

In [10]:
# Collect every word in this slice whose value from mp3.getWordOccurance is
# below 0.001 or above 0.8.
# NOTE(review): presumably that value is the share of documents containing
# the word -- confirm against tool_functions.
unimpWords3 = []
for token in tqdm(unique_corpus3):
    score = mp3.getWordOccurance(token)
    if score < 0.001:
        # Below the lower cut-off.
        unimpWords3.append(token)
    elif score > 0.8:
        # Above the upper cut-off.
        unimpWords3.append(token)

100%|███████████████████████████████████| 86359/86359 [5:57:05<00:00,  4.03it/s]


In [11]:
# Persist the result with IPython's %store so another session can reload it
# with `%store -r unimpWords3` without repeating the long loop.
%store unimpWords3

Stored 'unimpWords3' (list)


In [13]:
# How many words from this slice were flagged.
unimportant_count = len(unimpWords3)
unimportant_count

83784

In [None]:
# Show a bounded preview; displaying the whole list would write tens of
# thousands of entries into the notebook output.
unimpWords3[:20]