# Keyword analysis of STA papers

In [56]:
import matplotlib.pyplot as plt # Plots
import re #regular expresions
import tarfile #open tarfiles
import arxiv   #arXiv wrapper
import numpy as np #numeric tools

In [57]:
import os #Operative system utilities
import sys #system
sys.path.insert(0,'LaTeXCounter-py/')
import LaTeXWordCounter as TeX #my functions to work with .tex files

## Get the data set from the arXiv API

Use the arXiv API wrapper to query for papers that contain the keyword *transitionless* in any field and that belong to the arXiv category *quant-ph* (quantum physics).

In [58]:
# Ask arXiv for quant-ph papers matching "transitionless" in any field,
# sorted by submission date and capped at 500 results.
STA_papers = arxiv.query(
    query='all:transitionless AND cat:quant-ph',
    sort_by='submittedDate',
    max_results=500,
)
print('Found', len(STA_papers), 'papers.')

Found 61 papers.


In [59]:
# Lookup table for the query results; starts empty and is filled below.
STA_papers_dict = {}

In [60]:
# Key every paper by the last path component of its 'id' field
# (e.g. '2002.06134v1' in the listing of keys shown below).
for paper in STA_papers:
    ID = paper['id'].rsplit('/', 1)[-1]
    STA_papers_dict[ID] = paper

In [61]:
# Display the identifiers that were used as dictionary keys.
STA_papers_dict.keys()

dict_keys(['2002.06134v1', '1909.02366v1', '1908.06443v1', '1906.08065v1', '1905.09524v2', '1810.10233v2', '1809.00102v1', '1807.10227v2', '1804.10983v1', '1803.10410v2', '1712.06773v2', '1711.06140v2', '1707.04022v1', '1706.06785v2', '1706.03925v1', '1705.08578v2', '1705.01695v3', '1703.07933v1', '1703.03610v3', '1702.02239v3', '1612.08779v1', '1610.09938v3', '1609.04662v4', '1608.03669v1', '1607.06503v1', '1607.00095v1', '1606.06796v1', '1605.07523v1', '1604.03321v2', '1603.07778v2', '1603.05057v2', '1602.08833v1', '1602.00050v4', '1509.00097v2', '1507.07082v1', '1506.07268v1', '1505.04372v2', '1505.01624v3', '1502.07880v1', '1412.2848v2', '1411.6747v6', '1410.1555v2', '1406.7138v1', '1403.4140v2', '1401.1352v1', '1310.5323v2', '1309.3020v2', '1307.6922v3', '1305.4967v1', '1305.4724v2', '1305.4207v1', '1212.3294v1', '1211.1586v1', '1209.3153v3', '1206.2670v1', '1111.1579v1', '1111.0035v1', '1106.2776v2', '1105.4227v1', '1102.3449v1', '0912.4178v1'])

## Cut the papers that were published before *Shortcut to adiabatic passage in two and three level atoms*

#### Optionally, show all the results

In [8]:
# for paper in STA_papers:
#     print('DATE:',paper.get('published','N.A.'))
#     print('TITLE:',paper.get('title','UNTITLED'),'\n\n') 

DATE: 2020-02-26T16:49:01Z
TITLE: Smooth bang-bang shortcuts to adiabaticity for atomic transport in a
  moving harmonic trap 


DATE: 2020-02-14T17:35:31Z
TITLE: Non-equilibrium thermodynamics of quantum processes assisted by
  transitionless quantum driving: the role of initial state preparation 


DATE: 2020-02-14T11:31:48Z
TITLE: Transport of atoms across an optical lattice using an external harmonic
  potential 


DATE: 2020-02-11T19:00:32Z
TITLE: Fast dynamical exchange cooling with trapped ions 


DATE: 2020-02-10T17:05:31Z
TITLE: Noise Sensitivities for an Atom Shuttled by a Moving Optical Lattice via
  Shortcuts to Adiabaticity 


DATE: 2020-02-10T10:16:26Z
TITLE: Shortcuts to adiabaticity for an interacting Bose-Einstein condensate
  via exact solutions of the generalized Ermakov equation 


DATE: 2020-01-27T11:13:11Z
TITLE: Two-qubit quantum Fourier transform and entanglement protected by
  circulant symmetry 


DATE: 2019-12-18T16:02:11Z
TITLE: Many-body quantum heat engine

## Make a directory for the source files of the papers and download them

In [9]:
# Folder holding the downloaded source archives. The trailing slash matters:
# file paths are built further down by plain string concatenation.
source_folder = 'paper_source_files/'

In [10]:
# Create the folder for the source archives. Unlike the %mkdir shell magic,
# this is idempotent: re-running the cell when the folder already exists is a
# no-op instead of an error message.
os.makedirs(source_folder, exist_ok=True)

Use the arXiv API wrapper function `download` to download the source tarballs of the papers ([I contributed to this feature!!!!](https://github.com/lukasschwab/arxiv.py/graphs/contributors))

In [12]:
# This cell takes a while to run; be patient

# store_folder = 'paper_source_files'
# for paper in STA_papers:
#     arxiv.download(paper,dirpath=store_folder,prefer_source_tarfile=True)

## Make a directory for the .tex files

In [13]:
# Folder that receives one .tex file per paper. The trailing slash matters:
# file paths are built further down by plain string concatenation.
TeX_folder = 'paper_TeX_files/'

In [14]:
# Create the folder for the extracted .tex files. Unlike the %mkdir shell
# magic, this is idempotent: re-running the cell when the folder already
# exists is a no-op instead of an error message.
os.makedirs(TeX_folder, exist_ok=True)

## Get the TeX files 

### Function that returns the first member of the tarball with a `.tex` extension, or `None` if there is none

In [15]:
def returnTeXFileMember(tar_file):
    """Return the first ``.tex`` file stored in an open tar archive.

    Members are examined in the order given by ``tar_file.getmembers()``.
    Only regular files are considered, and the extension check is
    case-insensitive.

    Parameters
    ----------
    tar_file : tarfile.TarFile
        An archive that is already open for reading.

    Returns
    -------
    tarfile.TarInfo or None
        The first matching member, or ``None`` when no member matches.
    """
    tex_candidates = (
        entry
        for entry in tar_file.getmembers()
        if entry.isfile() and entry.name.lower().endswith('.tex')
    )
    return next(tex_candidates, None)

## Loop over the files and extract them into ```paper_TeX_files/``` 

Some of the tarballs do not contain a ```.tex``` file; keep a list of those that do not.

In [16]:
papers_withoutTeX = []  # archives in which no .tex member was found
papers_withTeX = []     # archives whose .tex member was copied to TeX_folder

for filename in os.listdir(source_folder):
    # NOTE(review): tarfile.open raises tarfile.ReadError for anything in
    # source_folder that is not a tar archive; such files still stop the loop.
    # The `with` block guarantees the archive is closed even if a later step raises.
    with tarfile.open(os.path.join(source_folder, filename), 'r') as archive:
        TeXFileMember = returnTeXFileMember(archive)
        if TeXFileMember is None:
            papers_withoutTeX.append(filename)
            continue

        # Stream the member straight to <TeX_folder>/<archive name>.tex.
        # Compared with extract() followed by os.rename(), the destination no
        # longer depends on the path stored inside the (downloaded, hence
        # untrusted) archive, and no leftover sub-folders are created.
        target = os.path.join(TeX_folder, filename.replace('.tar.gz', '.tex'))
        with archive.extractfile(TeXFileMember) as member_data, open(target, 'wb') as tex_out:
            tex_out.write(member_data.read())
        papers_withTeX.append(filename)

Remove empty folders

In [17]:
# Delete the folder trees that extraction left behind inside TeX_folder.
# os.removedirs is avoided on purpose: it raises when a folder contains only
# (empty) sub-folders, and after a successful removal it goes on to prune
# empty parents, which could delete TeX_folder itself.
for el in os.listdir(TeX_folder):
    top = os.path.join(TeX_folder, el)
    if not os.path.isdir(top):
        continue
    # Walk bottom-up so children are removed before their parents.
    for dirpath, _subdirs, _files in os.walk(top, topdown=False):
        try:
            os.rmdir(dirpath)
        except OSError:
            # Not empty (it still holds files): leave it in place.
            pass
    if not os.path.exists(top):
        print('removed:',el)

## Analyze the papers

### 1. Use a dictionary to put all the keywords and their counts

In [18]:
# Maps each keyword to its count; starts empty and is filled by the analysis below.
keywords = {}

### 2. Analyze all the .tex files

In [19]:
succesful_attempts = 0 #count of analyzed files
failed_attempts = 0    #count of files that raised an exception while being analyzed
failed = []            #list of failed files

# NOTE(review): `keywords` is filled in place, so re-running this cell without
# re-creating the dictionary first presumably counts every file twice.
for TeX_file in os.listdir(TeX_folder):
    #make sure only .tex files are treated: there are hidden files in the
    #folder with unwanted extensions
    if not TeX_file.endswith('.tex'):
        continue
    TeX_path = os.path.join(TeX_folder, TeX_file)
    try:
        TeX.analyzeTeXFile(TeX_path,keywords)
    except Exception as error:
        # Catch Exception instead of using a bare `except:` so that a kernel
        # interrupt (KeyboardInterrupt) still stops the loop, and report the
        # reason instead of discarding it.
        failed_attempts +=1
        failed.append(TeX_file)
        print('Could not analyze',TeX_file,'->',repr(error))
        os.remove(TeX_path) #A bit of a nasty hack: the offending file is deleted from disk
    else:
        succesful_attempts +=1

print('Failed: ',failed_attempts)
print('Successful: ',succesful_attempts)

Failed:  0
Successful:  15


### Put the contents of the dictionary in a .txt file

In [20]:
# Write one "keyword: count" line per entry, highest count first;
# entries with equal counts are ordered by keyword.
with open('raw_data.txt', 'w') as f_out:
    ranked = sorted(keywords.items(), key=lambda pair: (-pair[1], pair[0]))
    f_out.writelines('{0}: {1}\n'.format(word, count) for word, count in ranked)

In [21]:
# Spot check: display the count stored for the keyword 'shortcuts'.
keywords['shortcuts']

83