In [3]:
from selenium import webdriver
import pandas as pd
import base64
import re

In [78]:
def link_source(link):
    """Classify a product link by the site it points to.

    :param link: product URL as a string
    :return: ``'tb'`` when the text contains ``'taobao'``, otherwise ``'tm'``
    """
    # A substring test is used instead of ``find(...) > 0``: ``find`` returns 0
    # for a match at the very start of the string, which the old comparison
    # treated as "not found".
    return 'tb' if 'taobao' in link else 'tm'
# Read the spreadsheet of links to crawl and add a 'web' column holding the
# site tag that link_source assigns to each value of the 'links' column.
links_df = pd.read_excel('爬取链接.xlsx').assign(
    web=lambda frame: frame['links'].apply(link_source)
)

In [109]:
# Preview the first rows of the link table, including the derived 'web' column.
links_df.head()

Unnamed: 0,links,web
0,https://item.taobao.com/item.htm?spm=a230r.1.1...,tb
1,https://item.taobao.com/item.htm?spm=a230r.1.1...,tb
2,https://detail.tmall.com/item.htm?spm=a230r.1....,tm
3,https://detail.tmall.com/item.htm?spm=a1z10.3-...,tm
4,https://item.taobao.com/item.htm?spm=a1z10.3-c...,tb


In [153]:
import re
import os
import json
import requests


# Module-level HTTP session shared by the login class and the later scraping cells.
s = requests.Session()
# File the login cookies are serialized to
COOKIES_FILE_PATH = 'taobao_login_cookies.txt'


class UsernameLogin:
    """Log in to Taobao by replaying parameters captured from a browser login.

    :meth:`login` first tries the cookies cached in ``COOKIES_FILE_PATH``. If
    that fails it calls the account-check endpoint, verifies the
    username/password to obtain an st-code application URL, fetches the st
    code, exchanges it on the vst endpoint and finally writes the session
    cookies to ``COOKIES_FILE_PATH``.

    Every request goes through the module-level ``requests`` session ``s`` and
    uses ``self.timeout`` so that no call can block indefinitely.
    """

    # Browser User-Agent sent with every request that sets explicit headers.
    _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'

    def __init__(self, loginId, umidToken, ua, password2):
        """
        Create a login object.

        :param loginId: Taobao user name
        :param umidToken: extra parameter required by the new login flow
        :param ua: Taobao ``ua`` parameter
        :param password2: the already-encrypted password
        """
        # URL that reports whether a captcha is required
        self.user_check_url = 'https://login.taobao.com/newlogin/account/check.do?appName=taobao&fromSite=0'
        # URL that verifies the user name and password
        self.verify_password_url = "https://login.taobao.com/newlogin/login.do?appName=taobao&fromSite=0"
        # URL template that consumes the st code
        self.vst_url = 'https://login.taobao.com/member/vst.htm?st={}'
        # Personal home page; replaced by the redirect target after a fresh login
        self.my_taobao_url = 'http://i.taobao.com/my_taobao.htm'

        # Taobao user name
        self.loginId = loginId
        # umidToken parameter of the new login flow
        self.umidToken = umidToken
        # Key Taobao parameter describing the browser; copied from the browser
        # or a packet capture tool and reusable
        self.ua = ua
        # Encrypted password; copied from the browser or a packet capture tool
        # and reusable
        self.password2 = password2

        # Request timeout in seconds, applied to every request of this class
        self.timeout = 3

    def _user_check(self):
        """
        Ask the account-check endpoint whether a captcha is required.

        :return: True when the response data contains ``isCheckCodeShowed``
        :raises Exception: re-raises whatever the request raised
        """
        data = {
            'loginId': self.loginId,
            'ua': self.ua,
        }
        try:
            response = s.post(self.user_check_url, data=data, timeout=self.timeout)
            response.raise_for_status()
        except Exception:
            print('检测是否需要验证码请求失败，原因：')
            raise
        check_resp_data = response.json()['content']['data']
        # The slider captcha is flagged by the presence of this key.
        needcode = 'isCheckCodeShowed' in check_resp_data
        print('是否需要滑块验证：{}'.format(needcode))
        return needcode

    def _get_umidToken(self):
        """
        Extract the umidToken embedded in the login page.

        :return: the umidToken string
        :raises RuntimeError: when the login page does not contain a umidToken
        """
        response = s.get('https://login.taobao.com/member/login.jhtml', timeout=self.timeout)
        st_match = re.search(r'"umidToken":"(.*?)"', response.text)
        if st_match is None:
            # Previously this fell through to an AttributeError on ``None``.
            raise RuntimeError('umidToken not found in login page')
        print(st_match.group(1))
        return st_match.group(1)

    @property
    def _verify_password(self):
        """
        Verify the user name and password and obtain the st-code application URL.

        Kept as a property for backward compatibility; note that reading it
        performs a network POST.

        :return: the st-code application URL
        :raises RuntimeError: when the response carries no application URL
        """
        verify_password_headers = {
            'Origin': 'https://login.taobao.com',
            'User-Agent': self._USER_AGENT,
            'Content-Type': 'application/x-www-form-urlencoded',
            'Referer': 'https://login.taobao.com/member/login.jhtml?spm=a21bo.2017.754894437.1.5af911d9HjW9WC&f=top&redirectURL=https%3A%2F%2Fwww.taobao.com%2F',
        }
        # Form fields of the verification request
        verify_password_data = {
            'ua': self.ua,
            'loginId': self.loginId,
            'password2': self.password2,
            'umidToken': self.umidToken,
            'appEntrance': 'taobao_pc',
            'isMobile': 'false',
            'returnUrl': 'https://www.taobao.com/',
            'navPlatform': 'MacIntel',
        }
        try:
            response = s.post(self.verify_password_url, headers=verify_password_headers, data=verify_password_data,
                              timeout=self.timeout)
            response.raise_for_status()
        except Exception:
            print('验证用户名和密码请求失败，原因：')
            raise
        # A rejected login has no usable 'asyncUrls' entry. Treat a missing key,
        # an empty list or a non-JSON body as "no URL" so the RuntimeError below
        # (with the full response text) is raised instead of a bare lookup error.
        try:
            apply_st_url_match = response.json()['content']['data']['asyncUrls'][0]
        except (ValueError, LookupError, TypeError):
            apply_st_url_match = None
        if apply_st_url_match:
            print('验证用户名密码成功，st码申请地址：{}'.format(apply_st_url_match))
            return apply_st_url_match
        raise RuntimeError('用户名密码验证失败！response：{}'.format(response.text))

    def _apply_st(self):
        """
        Request the st code from the application URL.

        :return: the st code
        :raises RuntimeError: when the response does not contain an st code
        """
        apply_st_url = self._verify_password
        try:
            response = s.get(apply_st_url, timeout=self.timeout)
            response.raise_for_status()
        except Exception:
            print('申请st码请求失败，原因：')
            raise
        st_match = re.search(r'"data":{"st":"(.*?)"}', response.text)
        if st_match:
            print('获取st码成功，st码：{}'.format(st_match.group(1)))
            return st_match.group(1)
        raise RuntimeError('获取st码失败！response：{}'.format(response.text))

    def login(self):
        """
        Log in, preferring cached cookies and falling back to the st-code flow.

        :return: True on success
        :raises RuntimeError: when the vst response contains no redirect URL
        """
        # Reuse cached cookies when they are still valid
        if self._load_cookies():
            return True
        # Reports whether a slider captcha is required; the result is only
        # printed, it does not alter the flow.
        self._user_check()
        st = self._apply_st()
        headers = {
            'Host': 'login.taobao.com',
            'Connection': 'Keep-Alive',
            'User-Agent': self._USER_AGENT,
        }
        try:
            response = s.get(self.vst_url.format(st), headers=headers, timeout=self.timeout)
            response.raise_for_status()
        except Exception:
            print('st码登录请求，原因：')
            raise
        # On success the page redirects to the user's home page
        my_taobao_match = re.search(r'top.location.href = "(.*?)"', response.text)
        if my_taobao_match:
            print('登录淘宝成功，跳转链接：{}'.format(my_taobao_match.group(1)))
            self.my_taobao_url = my_taobao_match.group(1)
            self._serialization_cookies()
            return True
        raise RuntimeError('登录失败！response：{}'.format(response.text))

    def _load_cookies(self):
        """
        Install the cached cookies on the session and check that they still work.

        :return: True when the cached cookies give access to the home page,
                 False when there is no cache or it could not be used (the
                 cache file is deleted in that case)
        """
        # 1. No cache file: nothing to load
        if not os.path.exists(COOKIES_FILE_PATH):
            return False
        try:
            # 2. Load the cookies. Done inside the try so that an unreadable or
            #    corrupt cache file is discarded like an expired one.
            s.cookies = self._deserialization_cookies()
            # 3. Validate them by fetching the nickname
            self.get_taobao_nick_name()
        except Exception:
            os.remove(COOKIES_FILE_PATH)
            print('cookies过期，删除cookies文件！')
            return False
        print('加载淘宝cookies登录成功!!!')
        return True

    def _serialization_cookies(self):
        """
        Write the session cookies to ``COOKIES_FILE_PATH`` as JSON.

        :return: None
        """
        cookies_dict = requests.utils.dict_from_cookiejar(s.cookies)
        with open(COOKIES_FILE_PATH, 'w', encoding='utf-8') as file:
            json.dump(cookies_dict, file)
        print('保存cookies文件成功！')

    def _deserialization_cookies(self):
        """
        Read the cookies stored in ``COOKIES_FILE_PATH``.

        :return: a cookie jar built from the stored dictionary
        """
        # Plain read mode: the file is never written here, so write permission
        # must not be required.
        with open(COOKIES_FILE_PATH, 'r', encoding='utf-8') as file:
            cookies_dict = json.load(file)
        return requests.utils.cookiejar_from_dict(cookies_dict)

    def get_taobao_nick_name(self):
        """
        Fetch the home page and extract the Taobao nickname.

        :return: the nickname
        :raises RuntimeError: when the page does not contain the nickname field
        """
        headers = {
            'User-Agent': self._USER_AGENT,
        }
        try:
            response = s.get(self.my_taobao_url, headers=headers, timeout=self.timeout)
            response.raise_for_status()
        except Exception:
            print('获取淘宝主页请求失败！原因：')
            raise
        # The nickname lives in a hidden input of the home page
        nick_name_match = re.search(r'<input id="mtb-nickname" type="hidden" value="(.*?)"/>', response.text)
        if nick_name_match:
            print('登录淘宝成功，你的用户名是：{}'.format(nick_name_match.group(1)))
            return nick_name_match.group(1)
        raise RuntimeError('获取淘宝昵称失败！response：{}'.format(response.text))


if __name__ == '__main__':
    # loginId, umidToken, ua and password2 are all copied from a real browser login:
    #   1. Open https://login.taobao.com/member/login.jhtml in a browser.
    #   2. Open the developer tools (F12) and tick "Preserve log" so the request
    #      log survives the page redirect.
    #   3. Log in with user name and password and locate the newlogin/login.do
    #      request - that is the login request.
    #   4. Copy the four parameters from that request.
    #
    # They are account credentials, so they are read from environment variables
    # rather than being committed with the notebook.
    def _require_env(name):
        """Return the non-empty value of environment variable ``name`` or raise."""
        value = os.environ.get(name, '').strip()
        if not value:
            raise RuntimeError('Missing required environment variable: {}'.format(name))
        return value

    # Taobao user name: phone number or user name both work
    loginId = _require_env('TAOBAO_LOGIN_ID')
    # Parameter added by the redesigned login. It must be a plain string; the
    # previous literal ended with a stray comma, which made it a 1-tuple.
    umidToken = _require_env('TAOBAO_UMID_TOKEN')
    # Important Taobao parameter, copied from the browser or a packet capture
    # tool; reusable
    ua = _require_env('TAOBAO_UA')
    # Encrypted password, copied from the browser or a packet capture tool;
    # reusable
    password2 = _require_env('TAOBAO_PASSWORD2')

    ul = UsernameLogin(loginId, umidToken, ua, password2)
    ul.login()
    ul.get_taobao_nick_name()

登录淘宝成功，你的用户名是：张小磨莫
加载淘宝cookies登录成功!!!
登录淘宝成功，你的用户名是：张小磨莫


In [154]:
# Fetch one Tmall product page through the shared, logged-in session `s`.
# The timeout keeps the cell from hanging forever if the server never answers;
# certificate verification stays enabled.
a = s.get(
    'https://detail.tmall.com/item.htm?spm=a230r.1.14.9.464f3c5a9uwmF1&id=560560153011&cm_id=140105335569ed55e27b&abbucket=10',
    verify=True,
    timeout=10,
)

In [155]:
# Decode the body with the encoding detected from its content rather than the
# one taken from the response headers.
a.encoding = a.apparent_encoding
from bs4 import BeautifulSoup
# Parse the page HTML with the lxml parser.
soup= BeautifulSoup(a.text,'lxml')
# Last expression of the cell: renders the whole parsed document as output.
soup

<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<meta content="webkit" name="renderer"/>
<meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
<script>
    (function(w,d){
    try{
        var l,url='//mdskip.taobao.com/core/initItemDetail.htm?isUseInventoryCenter=false&cartEnable=true&service3C=false&isApparel=false&isSecKill=false&tmallBuySupport=true&isAreaSell=true&tryBeforeBuy=false&offlineShop=false&itemId=560560153011&showShopProm=false&isPurchaseMallPage=false&itemGmtModified=1600755582000&isRegionLevel=true&household=false&sellerPreview=false&queryMemberRight=true&addressLevel=3&isForbidBuyItem=false',isg=document.cookie.match('(^|;) ?l=([^;]*)(;|$)'),isg2 =document.cookie.match('(^|;) ?isg=([^;]*)(;|$)');
        if(!url){return;}
        var arr=["callback=setMdskip","timestamp="+(+new Date()),"isg="+(isg&&isg[2]),"isg2="+(isg2&&isg2[2])],reg=/[?&^](ip|campaignId|key|abt|cat_id|q|u_channel|areaId|sdShopId)=([^&]+)/g,params=w.location.search;
        while(r=reg.exec(p

In [144]:
# Pull the product fields out of the parsed page. The [0] / [1] indexing raises
# IndexError when a selector matches fewer elements than expected.
title = soup.select('.tb-title > h3')[0].text# title
under_line_price = soup.select('#J_StrPrice > em')[1].text# strikethrough (list) price
promote_price = soup.select('.tb-promo-item-bd')# promotional price (list of matched elements, not text)
# activity_price = tree.xpath('//*[@id="J_ActivityPrice"]/text()')# activity price

<!DOCTYPE html>
<html><!-- cph -->
<head>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta charset="utf-8"/>
<meta content="telephone=no, address=no" name="format-detection"/>
<link href="//g.alicdn.com" rel="dns-prefetch"/>
<link href="//gtms01.alicdn.com" rel="dns-prefetch"/>
<link href="//gtms02.alicdn.com" rel="dns-prefetch"/>
<link href="//gtms03.alicdn.com" rel="dns-prefetch"/>
<link href="//gtms04.alicdn.com" rel="dns-prefetch"/>
<link href="//gd1.alicdn.com" rel="dns-prefetch"/>
<link href="//gd2.alicdn.com" rel="dns-prefetch"/>
<link href="//gd3.alicdn.com" rel="dns-prefetch"/>
<link href="//gd4.alicdn.com" rel="dns-prefetch"/>
<link href="https://item.taobao.com/item.htm?id=595151148187" rel="canonical"/>
<link href="https://www.taobao.com/list/item-amp/595151148187.htm" hreflang="zh-Hans" rel="amphtml"/>
<link href="https://world.taobao.com/item/595151148187.htm" hreflang="zh-Hant" rel="alternate"/>
<meta content="webkit" name="renderer"/>
<meta content="always" 

In [105]:
# NOTE(review): `tree` is not defined anywhere in the visible notebook -
# presumably an lxml document built in a cell that was since deleted; confirm
# and restore that cell so this runs on a fresh kernel.
# The dangling `[0].text` expression that used to open this cell was removed:
# it accessed `.text` on a list literal and always raised AttributeError.
# Text nodes under #J_OtherDiscount (the original comment said "promotional
# price", which looks copy-pasted - TODO confirm what this element holds).
copoun_info = tree.xpath('//*[@id="J_OtherDiscount"]/div/div/div/text()')

In [106]:
# Probe the second <em> under #J_PromoPriceNum; the recorded output is an empty list.
# NOTE(review): relies on `tree`, which is not defined in the visible notebook.
tree.xpath('//*[@id="J_PromoPriceNum"]/em[2]/text()')

[]

In [107]:
# Probe the second <em> under #J_StrPrice; the recorded output is ['128.00'].
# NOTE(review): relies on `tree`, which is not defined in the visible notebook.
tree.xpath('//*[@id="J_StrPrice"]/em[2]/text()')

['128.00']

In [108]:
# Display the current value of `title`.
# NOTE(review): the recorded output is a list, so it presumably comes from an
# earlier xpath-based assignment rather than the `.text` assignment above - confirm.
title

['\n            老宋的微醺23点 英国原装进口孟买蓝宝石金酒金汤力鸡尾洋酒750ml\n        ']

In [113]:
# Display the whole parsed document (very large output; prefer a narrower
# selection when sharing the notebook).
soup

<!DOCTYPE html>
<html><!-- cph -->
<head>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta charset="utf-8"/>
<meta content="telephone=no, address=no" name="format-detection"/>
<link href="//g.alicdn.com" rel="dns-prefetch"/>
<link href="//gtms01.alicdn.com" rel="dns-prefetch"/>
<link href="//gtms02.alicdn.com" rel="dns-prefetch"/>
<link href="//gtms03.alicdn.com" rel="dns-prefetch"/>
<link href="//gtms04.alicdn.com" rel="dns-prefetch"/>
<link href="//gd1.alicdn.com" rel="dns-prefetch"/>
<link href="//gd2.alicdn.com" rel="dns-prefetch"/>
<link href="//gd3.alicdn.com" rel="dns-prefetch"/>
<link href="//gd4.alicdn.com" rel="dns-prefetch"/>
<link href="https://item.taobao.com/item.htm?id=595151148187" rel="canonical"/>
<link href="https://www.taobao.com/list/item-amp/595151148187.htm" hreflang="zh-Hans" rel="amphtml"/>
<link href="https://world.taobao.com/item/595151148187.htm" hreflang="zh-Hant" rel="alternate"/>
<meta content="webkit" name="renderer"/>
<meta content="always" 

<Element html at 0x230ce41e488>