In [1]:
from requests import request
from requests.exceptions import HTTPError
from time import sleep

def download(url, params={}, method='GET', retries=3):
    resp = None
    
    try:
        resp = request(method, url,
                       params=params if method=='GET' else {},
                       data=params if method=='POST' else {},
                       headers={'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.366'}
                      )
        resp.raise_for_status()
    except HTTPError as e:
        if 500 <= e.response.status_code:
            if retries > 0:
                sleep(3)
                resp = download(url, params=params,
                                method=method,
                                retries=retries-1)
            else:
                print('재방문 횟수 초과')
        else:
            print('Request', resp.request.headers)
            print('Response', e.response.headers)
        
    return resp

In [2]:
# https://pythonscraping.com/pages/javascript/ajaxDemo.html

In [3]:
url = 'https://pythonscraping.com/pages/javascript/ajaxDemo.html'
resp = download(url)

In [4]:
resp.headers['content-type']

'text/html'

In [5]:
from bs4 import BeautifulSoup
dom = BeautifulSoup(resp.text, 'lxml')

In [6]:
dom.select_one('#content').text.strip()

"This is some content that will appear on the page while it's loading. You don't care about scraping this."

In [7]:
dom.select_one('script')

<script src="../js/jquery-2.1.1.min.js"></script>

In [8]:
dom

<html>
<head>
<title>Some JavaScript-loaded content</title>
<script src="../js/jquery-2.1.1.min.js"></script>
</head>
<body>
<div id="content">
This is some content that will appear on the page while it's loading. You don't care about scraping this.
</div>
<script>
$.ajax({
    type: "GET",
    url: "loadedContent.php",
    success: function(response){

	setTimeout(function() {
	    $('#content').html(response);
	}, 2000);
    }
  });

function ajax_delay(str){
 setTimeout("str",2000);
}
</script>
</body>
</html>

In [9]:
from requests.compat import urljoin

urljoin(resp.request.url, 'loadedContent.php')

'https://pythonscraping.com/pages/javascript/loadedContent.php'

In [10]:
newurl = urljoin(resp.request.url, 'loadedContent.php')
resp = download(newurl)

In [11]:
resp.headers['content-type']

'text/html; charset=UTF-8'

In [12]:
resp.headers

{'Server': 'nginx', 'Date': 'Mon, 25 Sep 2023 01:06:06 GMT', 'Content-Type': 'text/html; charset=UTF-8', 'Content-Length': '106', 'Connection': 'keep-alive', 'X-Powered-By': 'PHP/7.4.33, PleskLin', 'Vary': 'Accept-Encoding', 'Content-Encoding': 'gzip'}

In [13]:
# XHR 객체를 이용하여 AJAX를 통해 DHTML 한 결과
resp.text

'Here is some important text you want to retrieve! <p/><button id="loadedButton">A button to click!</button>'

In [14]:
url = 'https://vsuggest.search.daum.net/v2/sushi/pc/get'
params = {'q': '카'}
resp = download(url, params)
resp.headers['content-type']

'application/json;charset=UTF-8'

In [15]:
for subkey in resp.json()['subkeys']:
    print(subkey['keyword'])

카카오맵
카카오톡
카카오 페이지
카카오
카카오톡 pc버전 다운로드
카카오뱅크
칸투칸
카눈
카카오 메일
카르텔 뜻
신한카드
카나리아 바이오
카카오 주가
카니발
카니발 하이브리드


In [16]:
resp.headers

{'Date': 'Mon, 25 Sep 2023 01:06:10 GMT', 'Server': 'Apache', 'X-Content-Type-Options': 'nosniff', 'X-XSS-Protection': '1; mode=block', 'Cache-Control': 'no-cache, no-store, max-age=0, must-revalidate', 'Pragma': 'no-cache', 'Expires': '0', 'X-Frame-Options': 'DENY', 'Content-Type': 'application/json;charset=UTF-8', 'Vary': 'Accept-Encoding,User-Agent', 'Content-Encoding': 'gzip', 'Keep-Alive': 'timeout=1, max=3800', 'Connection': 'Keep-Alive', 'Transfer-Encoding': 'chunked'}

In [17]:
# del params['callback']

In [18]:
while True:
    q = input()

    if q=='종료':
        break
    params['q'] = q
    resp = download(url, params)
    print(', '.join([subkey['keyword']
                    for subkey in resp.json()['subkeys']]))

 카리나


카리나, 카리나 언니, 카리나 키, 카리나 고향, 에스파 카리나, 카리나 미드, 카리나 인스타, 카리나 사주, 카리나 윈터, 카리나 나이, 카리나 고등학교, 카리나 영어, 카리나 뜻, 카리나 프로필, 카리나 생일


KeyboardInterrupt: Interrupted by user

In [23]:
# 네이버 예제

naverurl = 'https://ac.search.naver.com/nx/ac?con=1&frm=nv&ans=2&r_format=json&r_enc=UTF-8&r_unicode=0&t_koreng=1&run=2&rev=4&q_enc=UTF-8&st=100&_callback=_jsonp_2'
params = {'q':'카'}
resp = download(naverurl, params)
resp.headers['content-type']

'application/javascript; charset=UTF-8'

In [25]:
resp.text

'_jsonp_2({\n"query" : ["카"],\n"answer" : [],\n"intend" : [{"query":"카카오","intend":"기업","transQuery":"(주)카카오"}],\n"items" : [\n[["카카오페이지", "0"],["카카오", "2"],["카카오웹툰", "0"],["카페", "0"],["카카오뱅크", "0"],["카카오톡", "0"],["카리나", "0"],["카니발", "0"],["카카오맵", "0"],["칼국수", "0"]]\n]\n})'

In [20]:
#브런치 예제
# https://brunch.co.kr/search?q=%EC%B9%B4%EB%A6%AC%EB%82%98&type=article
url = 'https://brunch.co.kr/search'
params = {
    'q': '카리나'
    , 'type' : 'article'
}

resp = download(url, params)
dom = BeautifulSoup(resp.text, 'lxml')


In [26]:
# 이 방법은 잘못된거라 함.
import re
dom.find_all(text=re.compile('카리나'))

  dom.find_all(text=re.compile('카리나'))


[]

In [33]:
dom.select_one('#resultArticle')

<div class="search_result" id="resultArticle"></div>

In [34]:
dom.select_one('#resultArticle').contents

[]

In [35]:
#이게 바로 DHTML 이라는 증거다. contents 가 비어있지만, 화면에서는 보이니까. 

In [36]:
url = 'https://api.brunch.co.kr/v1/search/article'
params ={
    'q': '카리나'
    , 'page' : '1'
    , 'pageSize' : '20'
    , 'highlighter' : 'n'
    , 'escape' : 'y'
    , 'sortBy' : 'accu'
}
resp = download(url, params)
resp.headers['content-type']

'application/json;charset=UTF-8'

In [37]:
for item in resp.json()['data']['list']:
    print(item['title'])

시에나를 제대로 즐기는 방법 - 만지아의 탑, 캄포광장,시에나 대성당, 카타리나 그리고 카리나
카리나가 선택한 공항패션 - Weekly Celeb
카리나 3행시 할게요 #에스파 #aespa
카리나 3행시 할게요 #에스파 #aespa
카리나와 차은우로 태교를 하고 있어요. - 21. 엄마가 기분 좋으면 그게 태교라는 아내
화제에 오르고 있는 육상계 카리나 ○○○ 선수는? - 에스파 카리나 닮은 꼴 육상 선수
의결권을 위임하면 카리나 사인을 준다고요? - SM 엔터테인먼트의 주주총회, 어떤 법적 구설수가 있었나
트리스탄이며 파르지팔인 그는 누구인가? - 단눈치오와 토스티의 &lt;아마란타의 네 개의 노래&gt;
짐승이 되어가고 있는 것 같다 - D+14 포르투갈길 14일 차
[고다르 시네마][2022] - 제 79회 베니스 국제 영화제 비경쟁부문
24. 외모의 중요성과 활개치는 외모지상주의
&#39;차쥐뿔&#39; 이영지가 주는 특별한 위로 - [문제적 여자들] &lt;차린건 쥐뿔도 없지만&gt; 호스트 이영지의 진정성
제법 뚠뚠한 일상. 4화 - 4. 짜증 난다의 의미
[트렌드 언박싱] 여름에도 내추럴한 멋을 포기 못한다면 - 2023년 5월의 패션, 뷰티, 라이프 트렌드
솔직함, 치부를 드러내는 강함에 대하여 - 고집쟁이의 영화추천 (10) : 미치광이 삐에로 리뷰
성형수술과 아이덴티티 - 시뮬레이션으로 알아본 원본 상실 순간
영화: 캐러비언의 해적 5-죽은 자는 말이 없다 - 저주받은 아버지 윌 터너를 찾아 나선 아들 헨리 터너
[IP 비즈니스] ③ &#39;광야&#39;로 향하는 엔터테인먼트
광기; 미쳐 가는 세상의 한 가운데 - 미친 세상에서 미치지 않는 법


In [None]:
#왜이렇게 하냐고? Dom 에는 안나오고, DHTML 이므로, 데이터를 뒷단에서 받아오려고 이렇게 하는거다. 

In [27]:
# 네이버 웹툰 예제

In [28]:
#comic.naver.com 부터 뒤져보겠다.
url = 'https://comic.naver.com'
resp = download(url)

In [30]:
resp.headers['content-type']

'text/html;charset=UTF-8'

In [31]:
dom = BeautifulSoup(resp.text, 'lxml')

In [34]:
dom.body
#네이버에 body 에는 아무것도 없다. .. 다 뒷단에서 돈다는 뜻이다. 

<body>
<div id="root"></div>
</body>

In [35]:
# dom

In [36]:
#1. 웹툰 목록 페이지
url = 'https://comic.naver.com/api/home/component?type=DAILY_WEBTOON'
#이번에는 크롤러 할거라서 파라미터 쪼개지 않을거다.
resp = download(url)
resp.headers

{'Content-Type': 'application/json', 'x-content-type-options': 'nosniff', 'x-xss-protection': '1; mode=block', 'Cache-Control': 'no-cache, no-store, max-age=0, must-revalidate', 'Pragma': 'no-cache', 'Expires': '0', 'x-frame-options': 'SAMEORIGIN', 'Content-Encoding': 'gzip', 'referrer-policy': 'unsafe-url', 'Server': 'nfront', 'Date': 'Mon, 25 Sep 2023 01:30:04 GMT', 'Content-Length': '8372', 'Connection': 'keep-alive', 'Vary': 'Accept-Encoding', 'Set-Cookie': 'XSRF-TOKEN=5b7851ca-91e5-47cb-a130-b14cb50d6c60; Path=/'}

In [39]:
resp.json()['titleList'][0]
#이게 첫번째 목록 페이지 이다.

{'titleId': 817019,
 'titleName': '날 먹는 건 금지양!',
 'thumbnailUrl': 'https://image-comic.pstatic.net/webtoon/817019/thumbnail/thumbnail_IMAG21_6845792d-40db-4cd9-ab7a-9b86995f3383.jpg',
 'thumbnailBadgeList': ['NEW'],
 'author': {'writers': [{'id': 346353, 'name': '조9'}],
  'painters': [{'id': 346353, 'name': '조9'}],
  'originAuthors': []},
 'displayAuthor': '조9',
 'up': True,
 'rest': False,
 'openToday': True}

In [40]:
url = 'https://comic.naver.com/webtoon/list?titleId=796152'
resp = download(url)
dom = BeautifulSoup(resp.text, 'lxml')
dom.body

<body>
<div id="root"></div>
</body>

In [41]:
#특정 웹툰에 들어갔을때 없는걸 보니, 또 뒷단에서 돈다는 뜻이다. 

In [42]:
#2. 특정 웹툰의 회차목록
url = 'https://comic.naver.com/api/article/list?titleId=796152&page=1'
resp = download(url)
resp.headers['content-type']

'application/json'

In [45]:
resp.json()

{'titleId': 796152,
 'webtoonLevelCode': 'WEBTOON',
 'totalCount': 70,
 'contentsNo': 472613,
 'finished': False,
 'dailyPass': False,
 'chargeBestChallenge': False,
 'articleList': [{'no': 67,
   'thumbnailUrl': 'https://image-comic.pstatic.net/webtoon/796152/67/thumbnail_202x120_c826820e-5517-47ee-b298-e38e336aea98.jpg',
   'subtitle': '67화. 마루의 선택은?!',
   'starScore': 9.98982,
   'bgm': False,
   'up': False,
   'charge': False,
   'serviceDateDescription': '23.09.18',
   'volumeNo': 67,
   'hasReadLog': False,
   'recentlyReadLog': False,
   'thumbnailClock': False,
   'thumbnailLock': False},
  {'no': 66,
   'thumbnailUrl': 'https://image-comic.pstatic.net/webtoon/796152/66/thumbnail_202x120_381a0b67-83f4-42bb-9df8-280daf8d9bc0.jpg',
   'subtitle': '66화. 마루, 고백받다',
   'starScore': 9.98965,
   'bgm': False,
   'up': False,
   'charge': False,
   'serviceDateDescription': '23.09.11',
   'volumeNo': 66,
   'hasReadLog': False,
   'recentlyReadLog': False,
   'thumbnailClock': False,


In [46]:
resp.json()['articleList'][0]

{'no': 67,
 'thumbnailUrl': 'https://image-comic.pstatic.net/webtoon/796152/67/thumbnail_202x120_c826820e-5517-47ee-b298-e38e336aea98.jpg',
 'subtitle': '67화. 마루의 선택은?!',
 'starScore': 9.98982,
 'bgm': False,
 'up': False,
 'charge': False,
 'serviceDateDescription': '23.09.18',
 'volumeNo': 67,
 'hasReadLog': False,
 'recentlyReadLog': False,
 'thumbnailClock': False,
 'thumbnailLock': False}

In [47]:
# 3. 특정회차 살펴보기
# 주소 일단 눌러서 확인 . 뭘 가져왔는지
url = 'https://comic.naver.com/webtoon/detail?titleId=796152&no=67&week=tue'
resp = download(url)
resp.headers['content-type']

'text/html;charset=UTF-8'

In [48]:
dom = BeautifulSoup(resp.text, 'lxml')
dom.body.select('img')

[<img alt="1" src="https://image-comic.pstatic.net/webtoon/796152/1/thumbnail_202x120_58ae8d34-812f-4cfb-9170-28b6dcf74c2b.jpg"/>,
 <img alt="2" src="https://image-comic.pstatic.net/webtoon/796152/2/thumbnail_202x120_0853c92e-b281-40fc-b1b9-b121b1f18022.jpg"/>,
 <img alt="3" src="https://image-comic.pstatic.net/webtoon/796152/3/thumbnail_202x120_f2e4e4db-0861-444f-b5fc-7133bc10e6e4.jpg"/>,
 <img alt="4" src="https://image-comic.pstatic.net/webtoon/796152/4/thumbnail_202x120_e1840bbc-2152-498b-9c1c-f970f4f1e8b4.jpg"/>,
 <img alt="5" src="https://image-comic.pstatic.net/webtoon/796152/5/thumbnail_202x120_8373ceb0-9117-49ce-82f0-a6ff6355dd67.jpg"/>,
 <img alt="6" src="https://image-comic.pstatic.net/webtoon/796152/6/thumbnail_202x120_cee13154-5393-4717-952f-cadab4deb515.jpg"/>,
 <img alt="7" src="https://image-comic.pstatic.net/webtoon/796152/7/thumbnail_202x120_79e6e826-8fd2-4f57-920d-51ac67bdbe2b.jpg"/>,
 <img alt="8" src="https://image-comic.pstatic.net/webtoon/796152/8/thumbnail_202x1

In [49]:
#크롤러의 구조를 따오는 방법을 배우려고 한다. 

In [60]:
from requests.compat import urljoin, urlparse, urlunparse, urlencode

#방문할 목록
url = 'https://comic.naver.com/api/home/component'
params = {'type':'DAILY_WEBTOON'}

URLs = []
URLs.append((url, params))

seens = []
domain = []

while URLs:
    seed = URLs.pop(0)

    resp = download(*seed) #항상 url 과 파라미터로 되어있을거다. 
    seens.append(resp.request.url)
    
    if resp.status_code !=200:
        continue

    # 3. 웹툰 해당 회차의 이미지 목록
    
    if re.search('text/html', resp.headers['content-type']):
        dom = BeautifulSoup(resp.text, 'html5lib')
        for link in dom.select('#sectionContWide img[src]'):
            href = link.attrs['src']
            newurl = urljoin(resp.request.url, href)

            urlc = tuple(newurl.split('?'))
            
            if newurl not in seens and\
                urlc not in URLs:
                URLs.append(urlc)

    elif re.search('image/(?:(?:jpeg)|(?:gif)|(?:png))',
                   resp.headers['content-type']):
        fname = re.sub('[?]', '', resp. request.url.split('/')[-1])
        with open(f'./webtoon/{fname}', 'wb') as fp:
            fp.write(resp.content)
    
    elif re.search('application/json', resp.headers['content-type']):
        result = resp.json()
        
        # 1번
        if 'titleList' in result.keys():
            #2번
            baseurl = 'https://comic.naver.com/api/article/list?titleId='
            for newurl in [baseurl +str(r['titleId'] )
                           for r in result['titleList']][:1]:
                urlc = tuple(newurl.split('?'))

                if newurl not in seens and\
                    urlc not in URLs:
                     URLs.append(urlc)

        #3번
        elif 'articleList' in result.keys():
            baseurl = 'https://comic.naver.com/webtoon/detail?'
            for newurl in [baseurl+seed[-1]+'&no='+str(r['no'])
                          for r in result['articleList']][:1]:
                urlc = tuple(newurl.split('?'))

                if newurl not in seens and\
                    urlc not in URLs:
                     URLs.append(urlc)

In [61]:
len(seens), len(URLs), seens[2:]

(108,
 0,
 ['https://comic.naver.com/webtoon/detail?titleId=817019&no=1',
  'https://image-comic.pstatic.net/static/agerate/age_all_white.jpg',
  'https://image-comic.pstatic.net/webtoon/817019/1/20230920115114_abb08c3ed917b664f91e72efbeed7875_IMAG01_1.jpg',
  'https://image-comic.pstatic.net/webtoon/817019/1/20230920115114_abb08c3ed917b664f91e72efbeed7875_IMAG01_2.jpg',
  'https://image-comic.pstatic.net/webtoon/817019/1/20230920115114_abb08c3ed917b664f91e72efbeed7875_IMAG01_3.jpg',
  'https://image-comic.pstatic.net/webtoon/817019/1/20230920115114_abb08c3ed917b664f91e72efbeed7875_IMAG01_4.jpg',
  'https://image-comic.pstatic.net/webtoon/817019/1/20230920115114_abb08c3ed917b664f91e72efbeed7875_IMAG01_5.jpg',
  'https://image-comic.pstatic.net/webtoon/817019/1/20230920115114_abb08c3ed917b664f91e72efbeed7875_IMAG01_6.jpg',
  'https://image-comic.pstatic.net/webtoon/817019/1/20230920115114_abb08c3ed917b664f91e72efbeed7875_IMAG01_7.jpg',
  'https://image-comic.pstatic.net/webtoon/817019/1

In [51]:
import os

In [52]:
os.mkdir('./webtoon')

In [53]:
os.listdir('.')

['.cache',
 '.config',
 '.eclipse',
 '.gitconfig',
 '.idlerc',
 '.ipynb_checkpoints',
 '.ipython',
 '.jupyter',
 '.keras',
 '.lemminx',
 '.m2',
 '.matplotlib',
 '.ms-ad',
 '.p2',
 '.vscode',
 '0610_환전페이지.sql',
 '0905.db',
 '0905.ipynb',
 '100_Numpy_exercises.ipynb',
 'a.txt',
 'AppData',
 'Application Data',
 'Contacts',
 'Cookies',
 'db_0906.ipynb',
 'DB_0906_2.ipynb',
 'DB_0907.ipynb',
 'DB_0907_실습.ipynb',
 'DB_0908.ipynb',
 'DB_0911.ipynb',
 'DB_0911_ex.ipynb',
 'DB_0912_cls.ipynb',
 'DB_0912_ex.ipynb',
 'DB_0913_cls.ipynb',
 'DB_0913_ex.ipynb',
 'DB_0915_cls.ipynb',
 'DB_0917_ex.ipynb',
 'DB_0918_cls.ipynb',
 'Desktop',
 'Documents',
 'Downloads',
 'Favorites',
 'function_0905.ipynb',
 'git',
 'hsperfdata_user',
 'jupyter0905.ipynb',
 'Links',
 'Local Settings',
 'Music',
 'My Documents',
 'NetHood',
 'NTUSER.DAT',
 'ntuser.dat.LOG1',
 'ntuser.dat.LOG2',
 'NTUSER.DAT{a2332f18-cdbf-11ec-8680-002248483d79}.TM.blf',
 'NTUSER.DAT{a2332f18-cdbf-11ec-8680-002248483d79}.TMContainer0000000

In [62]:
######
# Cookies / Sessions

In [63]:
url ='https://pythonscraping.com/pages/cookies/login.html'
resp = download(url)
dom = BeautifulSoup(resp.text, 'lxml')

In [64]:
dom

<html><body><h2>Log In Here!</h2>
<form action="welcome.php" method="post">
Username (use anything!): <input name="username" type="text"/><br/>
Password (try "password"): <input name="password" type="password"/><br/>
<input type="submit" value="Login"/>
</form></body></html>

In [65]:
dom.select('form')

[<form action="welcome.php" method="post">
 Username (use anything!): <input name="username" type="text"/><br/>
 Password (try "password"): <input name="password" type="password"/><br/>
 <input type="submit" value="Login"/>
 </form>]

In [66]:
dom.select('form > input')

[<input name="username" type="text"/>,
 <input name="password" type="password"/>,
 <input type="submit" value="Login"/>]

In [67]:
dom.select_one('form').attrs #이게 없을 수도 있다. 일부러 못알아보라고

{'method': 'post', 'action': 'welcome.php'}

In [69]:
for tag in dom.select('form > input[name]'):
    print(tag.attrs)

{'type': 'text', 'name': 'username'}
{'type': 'password', 'name': 'password'}


In [83]:
#상대방 서버는 아마도 아파치일거다.
newurl = urljoin(resp.request.url, dom.select_one('form').attrs['action'])

In [71]:
params = list()
for tag in dom.select('form > input[name]'):
    params.append(tag.attrs['name']+'='+'')
'&'.join(params)

'username=&password='

In [72]:
dom.select_one('form').attrs['method']

'post'

In [73]:
dom

<html><body><h2>Log In Here!</h2>
<form action="welcome.php" method="post">
Username (use anything!): <input name="username" type="text"/><br/>
Password (try "password"): <input name="password" type="password"/><br/>
<input type="submit" value="Login"/>
</form></body></html>

In [97]:
download(newurl, {'username':'111', 'password':'password'}, 'POST')

<Response [200]>

In [98]:
resp.status_code

200

In [99]:
resp.headers
# header에서 set-cookies 부분 잘 봐줘야해!

{'Server': 'nginx', 'Date': 'Mon, 25 Sep 2023 02:47:53 GMT', 'Content-Type': 'text/html; charset=UTF-8', 'Content-Length': '73', 'Connection': 'keep-alive', 'X-Powered-By': 'PHP/7.4.33, PleskLin', 'Vary': 'Accept-Encoding', 'Content-Encoding': 'gzip'}

In [100]:
resp.text

"Hey 111! Looks like you're still logged into the site!"

In [101]:
resp.request.headers, resp.request.method, resp.request.url, \
resp.request.body

({'User-Agent': 'python-requests/2.31.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive', 'Cookie': 'loggedin=1; username=111'},
 'GET',
 'https://pythonscraping.com/pages/cookies/profile.php',
 None)

In [102]:
resp = download(urljoin(newurl, 'profile.php'))
resp.text

'You\'re not logged into the site!<br>Visit <a href="login.html">the login page</a> to log in'

In [103]:
from requests.cookies import cookiejar_from_dict

In [104]:
cookiejar_from_dict({'username':'어쩌고'})

<RequestsCookieJar[Cookie(version=0, name='username', value='어쩌고', port=None, port_specified=False, domain='', domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)]>

In [105]:
# loggedin=1; username=111
cookie = {
    'loggedin' : '1'
    , 'username' : '111'
}

In [106]:
resp = request('GET', urljoin(newurl, 'profile.php'), cookies=cookie)
resp.text

"Hey 111! Looks like you're still logged into the site!"

In [107]:
cookie['loggedin'] = '111'
resp = request('GET', urljoin(newurl, 'profile.php'), cookies=cookie)
resp.text

"Hey 111! Looks like you're still logged into the site!"

In [108]:
#이렇게 하면, 쿠키를 내가 매번 관리해줘야한다고 한다. 

In [114]:
#이번에는 세션으로 해보려고 한다.
from requests.sessions import Session
sess = Session()
sess.cookies.set('loggedin', '1')

Cookie(version=0, name='loggedin', value='1', port=None, port_specified=False, domain='', domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)

In [None]:
sess.cookies.set

In [115]:
from requests.compat import quote

In [116]:
from requests.compat import urlencode
sess.cookies.set('username', quote('한글'))

Cookie(version=0, name='username', value='%ED%95%9C%EA%B8%80', port=None, port_specified=False, domain='', domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)

In [117]:
sess.get(urljoin(newurl, 'profile.php')).text

"Hey 한글! Looks like you're still logged into the site!"

In [120]:
resp = sess.get(urljoin(newurl, 'profile.php'))

In [121]:
resp.request.headers

{'User-Agent': 'python-requests/2.31.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive', 'Cookie': 'loggedin=1; username=%ED%95%9C%EA%B8%80'}

In [122]:
sess.cookies

<RequestsCookieJar[Cookie(version=0, name='loggedin', value='1', port=None, port_specified=False, domain='', domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False), Cookie(version=0, name='username', value='%ED%95%9C%EA%B8%80', port=None, port_specified=False, domain='', domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)]>

In [123]:
newurl = 'https://pythonscraping.com/pages/cookies/welcome.php'
resp = sess.post(newurl, {'username':'111', 'password':'password'})
resp.headers

{'Server': 'nginx', 'Date': 'Mon, 25 Sep 2023 02:58:06 GMT', 'Content-Type': 'text/html; charset=UTF-8', 'Content-Length': '123', 'Connection': 'keep-alive', 'X-Powered-By': 'PHP/7.4.33, PleskLin', 'Set-Cookie': 'loggedin=1, username=111', 'Vary': 'Accept-Encoding', 'Content-Encoding': 'gzip'}

In [125]:
resp = sess.get(urljoin(newurl, 'profile.php'))
resp.text, resp.request.headers

("Hey 111! Looks like you're still logged into the site!",
 {'User-Agent': 'python-requests/2.31.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive', 'Cookie': 'loggedin=1; username=111; loggedin=1; username=%ED%95%9C%EA%B8%80'})

In [126]:
#쿠키 지우기
sess.cookies.clear()

In [166]:
#lms 예제

c = '''

'''

cookies = dict()

for l in c.splitlines():
    if len(l) > 0:
        cookies[l.split('\t')[0]] = l.split('\t')[1]

In [167]:
for k, v in cookies.items():
    sess.cookies.set(k,v)

In [168]:
resp = download('https://lms.sunde41.net')
resp.text

'<!DOCTYPE html>\n<html lang="ko">\n<head>\n    <meta charset="UTF-8">\n    <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">\n    <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1, shrink-to-fit=no">\n    <meta name="description" content="">\n    <meta name="author" content="">\n    <title> 로그인 : 고려대학교 지능정보SW아카데미</title>\n    <link href="/static/vendors/login.css" rel="stylesheet">\n    <link href="/static/vendors/bootstrap.css" rel="stylesheet">\n    <link href="/static/vendors/style.css?v=1.7" rel="stylesheet">\n    <link rel="shortcut icon" href="/static/favicon/favicon.ico" type="image/x-icon">\n    <link rel="icon" href="/static/favicon/favicon.ico" type="image/x-icon">\n    <style>\n        .m-form .m-form__group {\n        padding-top: 0;\n        padding-bottom: 0;\n        }\n    </style>\n</head>\n<body>\n    <div class="form-body without-side">\n        <div class="row">\n            <div class="img-holder">\n                <d

In [169]:
from requests import get
resp = get('https://lms.sunde41.net')
resp.request.headers

{'User-Agent': 'python-requests/2.31.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}

In [170]:
resp = sess.get('https://lms.sunde41.net')
resp.request.headers

{'User-Agent': 'python-requests/2.31.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}

In [138]:
resp = get('https://lms.sunde41.net', cookies=sess.cookies)
resp.text

'<!DOCTYPE html>\n<html lang="ko">\n<head>\n    <meta charset="UTF-8">\n    <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">\n    <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1, shrink-to-fit=no">\n    <meta name="description" content="">\n    <meta name="author" content="">\n    <title> 로그인 : 고려대학교 지능정보SW아카데미</title>\n    <link href="/static/vendors/login.css" rel="stylesheet">\n    <link href="/static/vendors/bootstrap.css" rel="stylesheet">\n    <link href="/static/vendors/style.css?v=1.7" rel="stylesheet">\n    <link rel="shortcut icon" href="/static/favicon/favicon.ico" type="image/x-icon">\n    <link rel="icon" href="/static/favicon/favicon.ico" type="image/x-icon">\n    <style>\n        .m-form .m-form__group {\n        padding-top: 0;\n        padding-bottom: 0;\n        }\n    </style>\n</head>\n<body>\n    <div class="form-body without-side">\n        <div class="row">\n            <div class="img-holder">\n                <d

In [171]:
# 원래 로그인 절차
resp = get('https://lms.sunde41.net')
dom = BeautifulSoup(resp.text, 'lxml')

In [180]:
form = dom.select_one('form')

In [181]:
#### 하 이거 뭐야!!!!!
inputs = dom.select('form input[name]')
inputs

[<input id="next" name="next" type="hidden" value="/"/>,
 <input autocomplete="email" class="form-control m-input" id="email" name="email" placeholder="이메일을 입력하세요" type="email" value=""/>,
 <input autocomplete="current-password" class="form-control m-input" id="password" name="password" placeholder="비밀번호를 입력하세요" required="" type="password" value=""/>,
 <input checked="" class="form-control" id="remember" name="remember" type="checkbox"/>]

In [182]:
form.attrs, urljoin(resp.request.url, form.attrs['action'])

({'action': '/auth/login',
  'method': 'POST',
  'name': 'login_user_form',
  'class': ['m-form']},
 'https://lms.sunde41.net/auth/login')

In [183]:
for tag in inputs:
    print(tag.attrs['name']+'='+(tag.attrs['value']
                                 if tag.has_attr('value')
                                 else ''))

next=/
email=
password=
remember=


In [165]:
# LMS 로그인 -> 쿠키 활용
# 수업게시판
# 첨부자료가 있는 수업 목록만 추출
# 첨부자료 링크로 추출