In [2]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords

# Resources shared by every review_to_words() call. They are created lazily
# on first use so that the stopword corpus is read only once instead of once
# per document.
_STOPWORD_SET = None
_PORTER_STEMMER = None


def _preprocessing_resources():
    """Return the cached (stopword set, stemmer) pair, building it on first use."""
    global _STOPWORD_SET, _PORTER_STEMMER
    if _STOPWORD_SET is None:
        # Membership tests on a set are much faster than on the list that
        # stopwords.words() returns.
        _STOPWORD_SET = set(stopwords.words('english'))
    if _PORTER_STEMMER is None:
        _PORTER_STEMMER = nltk.stem.PorterStemmer()
    return _STOPWORD_SET, _PORTER_STEMMER


def review_to_words(raw_review):
    """Normalize a raw text into a space-separated string of stemmed tokens.

    Steps: replace every non-ASCII-letter character with a space, lowercase
    and split on whitespace, drop NLTK English stopwords, apply the Porter
    stemmer to each remaining token, and join the stems with single spaces.

    Parameters
    ----------
    raw_review : str
        The text to preprocess.

    Returns
    -------
    str
        The stemmed, stopword-free tokens joined by single spaces
        (an empty string when no token survives).
    """
    stops, stemmer = _preprocessing_resources()
    # Replace everything that is not an English letter with a space.
    letters_only = re.sub('[^a-zA-Z]', ' ', raw_review)
    # Lowercase and tokenize on whitespace.
    words = letters_only.lower().split()
    # Remove stopwords.
    meaningful_words = [w for w in words if w not in stops]
    # Stem each remaining word.
    stemming_words = [stemmer.stem(w) for w in meaningful_words]
    # Join back into one space-separated string.
    return ' '.join(stemming_words)

# arXiv advanced-search URL (computer science, abstracts shown, 200 results
# per page, newest announcements first). The result offset is appended after
# the trailing `start=`.
ARXIV_SEARCH_URL = 'https://arxiv.org/search/advanced?advanced=1&terms-0-operator=AND&terms-0-term=&terms-0-field=all&classification-computer_science=y&classification-physics_archives=all&date-filter_by=all_dates&date-year=&date-from_date=&date-to_date=&date-date_type=submitted_date&abstracts=show&size=200&order=-announced_date_first&start='
PAGE_SIZE = 200        # must match the `size=200` query parameter in the URL
N_PAGES = 3            # number of result pages to fetch
REQUEST_TIMEOUT = 30   # seconds; without a timeout requests may block forever

abstract_list = []
id_list = []
# Download and parse each result page.
for i in range(N_PAGES):
    start = i * PAGE_SIZE
    req = requests.get(ARXIV_SEARCH_URL + str(start), timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error instead of silently parsing an error page
    # (which would just contribute zero rows).
    req.raise_for_status()

    print(str(start))
    html = req.text

    # 'html.parser' is the HTML parser bundled with the Python standard library.
    soup = BeautifulSoup(html, 'html.parser')

    # Extract the wanted elements with CSS selectors.
    csPaperAbstract = soup.select(
        'span.abstract-full.has-text-grey-dark.mathjax')  # take abstract-full, not abstract-short
    csPaperId = soup.select('p.list-title.level-left')  # paper id element

    # Accumulate abstracts and ids in page order.
    abstract_list.extend(abstract.text for abstract in csPaperAbstract)
    # The id is rebuilt by joining the first two digit groups found in the
    # title element's text with a dot.
    id_list.extend(
        '.'.join(re.findall(r'\b\d+\b', ids.text)[:2]) for ids in csPaperId)

0
200
400


In [3]:
# id_list and abstract_list are paired purely by position. pd.concat(axis=1)
# would pad a length mismatch with NaN and silently produce shifted rows, so
# check the lengths explicitly first.
if len(id_list) != len(abstract_list):
    raise ValueError(
        'id/abstract count mismatch: %d ids vs %d abstracts'
        % (len(id_list), len(abstract_list)))

# Build one DataFrame per list.
ids = pd.DataFrame({'id': id_list})
abstracts = pd.DataFrame({'raw_text': abstract_list})

# Combine ids and abstracts side by side into a single DataFrame.
dp = pd.concat([ids, abstracts], axis=1)
# Attach the preprocessed text as a new column.
dp['preprossed'] = dp['raw_text'].map(review_to_words)
# Show only the first rows rather than dumping the whole frame.
dp.head(10)

Unnamed: 0,id,raw_text,preprossed
0,1810.04158,\n While convolutional neural networks ...,convolut neural network domin field comput vis...
1,1810.04152,\n Deep latent variable models have bec...,deep latent variabl model becom popular model ...
2,1810.04150,\n The success of the exascale supercom...,success exascal supercomput larg debat remain ...
3,1810.04147,\n Building on the success of deep lear...,build success deep learn two modern approach l...
4,1810.04146,"\n In this work, we consider the integr...",work consid integr mpi one side commun non blo...
5,1810.04144,\n Advanced driver assistance systems a...,advanc driver assist system advanc rapid pace ...
6,1810.04142,\n We address fine-grained multilingual...,address fine grain multilingu languag identif ...
7,1810.04133,\n Significant advances have been made ...,signific advanc made recent train neural netwo...
8,1810.04125,\n We present new algorithms for the ra...,present new algorithm random construct hierarc...
9,1810.04119,\n Cartesian Genetic Programming (CGP) ...,cartesian genet program cgp mani modif across ...


In [4]:
# Persist the combined frame as comma-separated text (',' is to_csv's default
# separator); the row index is written as well.
dp.to_csv('output.csv')