## Get Industry & Sector List

Credit : https://www.dataquest.io/blog/web-scraping-tutorial-python/

In [37]:
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

ณ วันที่พัฒนานี้รองรับ URL "https://www.settrade.com/C13_MarketSummary.jsp?detail=INDUSTRY" ได้

In [72]:
def getIndustryAndSector(url):
    page = requests.get(url) # ดูดหน้าเว็บมา
    
    if page.status_code == 200 :
        # All HTML of Page
        soup = BeautifulSoup(page.content, "html5lib")
        # ซอยข้อมูลทั้ง page เป็นทีละส่วนๆ แล้วลงลึกไปเอาข้อมูลทีละจุดๆ
        html = list(soup.children)[1]
        head = list(html.children)[0]
        body = list(html.children)[2]
        table_body = list(body.find(class_="table-hover"))[3]
        
        # Industry and Sector
        IAS_rows = table_body.find_all('tr')
        IAS_name_list = np.array([ IAS_row.find(class_="link-stt").text for IAS_row in IAS_rows ])
        IAS_url_list = np.array([ IAS_row.find(class_="link-stt").get('href') for IAS_row in IAS_rows ])
        
        # Only Industry
        Ind_rows = table_body.find_all(style="background: #cccccc;")
        Ind_name_list = np.array([ Ind_row.find(class_="link-stt").text for Ind_row in Ind_rows ])
        
        # Label Sector by Industry
        Ind_index = -1
        labels = np.array([])
        for IAS_name in IAS_name_list:
            if IAS_name in Ind_name_list:
                Ind_index+=1
            Ind_name = Ind_name_list[Ind_index]
            labels = np.append(labels, Ind_name)

        # Convert to DataFrame         
        df = pd.DataFrame({'Industry': labels, 'Sector': IAS_name_list, 'URL': IAS_url_list})
        industry = df[df['Industry'] == df['Sector']]
        del industry['Sector']
        sector = df[df['Industry'] != df['Sector']]
        return [industry,sector]
    else:
        print("Can't get content from this URL!!!")

In [73]:
[industry_df,sector_df] = getIndustryAndSector("https://www.settrade.com/C13_MarketSummary.jsp?detail=INDUSTRY")

In [74]:
industry_df

Unnamed: 0,Industry,URL
0,AGRO,/C13_MarketSummary.jsp?detail=INDUSTRY&industr...
3,CONSUMP,/C13_MarketSummary.jsp?detail=INDUSTRY&industr...
7,FINCIAL,/C13_MarketSummary.jsp?detail=INDUSTRY&industr...
11,INDUS,/C13_MarketSummary.jsp?detail=INDUSTRY&industr...
18,PROPCON,/C13_MarketSummary.jsp?detail=INDUSTRY&industr...
23,RESOURC,/C13_MarketSummary.jsp?detail=INDUSTRY&industr...
26,SERVICE,/C13_MarketSummary.jsp?detail=INDUSTRY&industr...
33,TECH,/C13_MarketSummary.jsp?detail=INDUSTRY&industr...


In [75]:
sector_df[0:10]

Unnamed: 0,Industry,Sector,URL
1,AGRO,AGRI,/C13_MarketSummary.jsp?detail=INDUSTRY&sector=...
2,AGRO,FOOD,/C13_MarketSummary.jsp?detail=INDUSTRY&sector=...
4,CONSUMP,FASHION,/C13_MarketSummary.jsp?detail=INDUSTRY&sector=...
5,CONSUMP,HOME,/C13_MarketSummary.jsp?detail=INDUSTRY&sector=...
6,CONSUMP,PERSON,/C13_MarketSummary.jsp?detail=INDUSTRY&sector=...
8,FINCIAL,BANK,/C13_MarketSummary.jsp?detail=INDUSTRY&sector=...
9,FINCIAL,FIN,/C13_MarketSummary.jsp?detail=INDUSTRY&sector=...
10,FINCIAL,INSUR,/C13_MarketSummary.jsp?detail=INDUSTRY&sector=...
12,INDUS,AUTO,/C13_MarketSummary.jsp?detail=INDUSTRY&sector=...
13,INDUS,IMM,/C13_MarketSummary.jsp?detail=INDUSTRY&sector=...


In [76]:
industry_df.to_csv('../../data/explore/industry_list.csv', index=False)

In [77]:
sector_df.to_csv('../../data/explore/sector_list.csv', index=False)

In [78]:
len(industry_df)

8

In [79]:
len(sector_df)

28

In [80]:
str(datetime.now().strftime('%Y-%m-%d'))

'2018-04-26'

## Get Stock List

In [222]:
sector_table = pd.read_csv('../../data/explore/sector_list.csv')
sector_table.head()

Unnamed: 0,Industry,Sector,URL
0,AGRO,AGRI,/C13_MarketSummary.jsp?detail=INDUSTRY&sector=...
1,AGRO,FOOD,/C13_MarketSummary.jsp?detail=INDUSTRY&sector=...
2,CONSUMP,FASHION,/C13_MarketSummary.jsp?detail=INDUSTRY&sector=...
3,CONSUMP,HOME,/C13_MarketSummary.jsp?detail=INDUSTRY&sector=...
4,CONSUMP,PERSON,/C13_MarketSummary.jsp?detail=INDUSTRY&sector=...


สร้าง function Get หุ้นจาก แต่ละ Sector

In [223]:
def getStockList(url,sector):
    page = requests.get(url) # ดูดหน้าเว็บมา
    
    if page.status_code == 200 :
        # All HTML of Page
        soup = BeautifulSoup(page.content, "html5lib")
        # ซอยข้อมูลทั้ง page เป็นทีละส่วนๆ แล้วลงลึกไปเอาข้อมูลทีละจุดๆ
        html = list(soup.children)[1]
        head = list(html.children)[0]
        body = list(html.children)[2]
        # ***
        target_table = list(body.find_all(class_="table-hover"))[1]
        target_table_body = list(target_table.children)[3]

        # Stocks
        stock_rows = target_table_body.find_all('tr')
        stock_name_list = np.array([ stock_row.find(class_="link-stt").text for stock_row in stock_rows ])
        stock_url_list = np.array([ stock_row.find(class_="link-stt").get('href') for stock_row in stock_rows ])

        # Convert to DataFrame         
        stock_df = pd.DataFrame({'Stock': stock_name_list, 'ราคาล่าสุด(Home)': stock_url_list})
        stock_df['Sector'] = sector
        return stock_df
    else:
        print("Can't get content from this URL!!!")

In [224]:
getStockList('https://www.settrade.com/C13_MarketSummary.jsp?detail=INDUSTRY&sector=FOOD&market=SET','FOOD').head()

Unnamed: 0,Stock,ราคาล่าสุด(Home),Sector
0,APURE,/C13_FastQuote_Main.jsp?txtSymbol=APURE,FOOD
1,ASIAN,/C13_FastQuote_Main.jsp?txtSymbol=ASIAN,FOOD
2,BR,/C13_FastQuote_Main.jsp?txtSymbol=BR,FOOD
3,BRR,/C13_FastQuote_Main.jsp?txtSymbol=BRR,FOOD
4,CBG,/C13_FastQuote_Main.jsp?txtSymbol=CBG,FOOD


สร้าง Loop มาไล่วนดึง List หุ้นจากแต่ละ sector

In [225]:
prefix_url = 'https://www.settrade.com'
stock_frames = []

for index,sector in sector_table.iterrows():
    suffix_url = sector['URL']
    settrade_url = prefix_url + suffix_url
    stock_df = getStockList(settrade_url,sector['Sector'])
    stock_frames.append(stock_df)

stock_lists = pd.concat(stock_frames)

In [226]:
stock_lists.head()

Unnamed: 0,Stock,ราคาล่าสุด(Home),Sector
0,CHOTI,/C13_FastQuote_Main.jsp?txtSymbol=CHOTI,AGRI
1,EE,/C13_FastQuote_Main.jsp?txtSymbol=EE,AGRI
2,GFPT,/C13_FastQuote_Main.jsp?txtSymbol=GFPT,AGRI
3,LEE,/C13_FastQuote_Main.jsp?txtSymbol=LEE,AGRI
4,STA,/C13_FastQuote_Main.jsp?txtSymbol=STA,AGRI


In [227]:
stock_lists.to_csv('../../data/explore/stock_list.csv', index=False)

*Note : ตรงส่วนนี้ข้อมูลไม่ตรง จาก settrade ดึงหุ้นมาได้ 597 ตัว แต่ของกวีมีหุ้น 629 ตัว และข้อมูลเก่าที่เคยหาไว้มี 646 ตัว* ?? 

In [228]:
len(stock_lists)

597

# Get URL of Stock

In [229]:
stock_table = pd.read_csv('../../data/explore/stock_list.csv')
print(len(stock_table))
stock_table.head()

597


Unnamed: 0,Stock,ราคาล่าสุด(Home),Sector
0,CHOTI,/C13_FastQuote_Main.jsp?txtSymbol=CHOTI,AGRI
1,EE,/C13_FastQuote_Main.jsp?txtSymbol=EE,AGRI
2,GFPT,/C13_FastQuote_Main.jsp?txtSymbol=GFPT,AGRI
3,LEE,/C13_FastQuote_Main.jsp?txtSymbol=LEE,AGRI
4,STA,/C13_FastQuote_Main.jsp?txtSymbol=STA,AGRI


ข้อมูลจากตารางข้างบน URL จะชี้ไปหน้า **ราคาล่าสุด** แต่เราอยากไปหน้า **งบการเงิน** ดังนั้นต้องสร้างฟังก์ชั่น GET Financial Report URL ของหุ้น ก่อน

In [233]:
def getStockLink(url,stock):
    page = requests.get(url) # ดูดหน้าเว็บมา
    
    if page.status_code == 200 :
        # All HTML of Page
        soup = BeautifulSoup(page.content, "html5lib")
        # ซอยข้อมูลทั้ง page เป็นทีละส่วนๆ แล้วลงลึกไปเอาข้อมูลทีละจุดๆ
        html = list(soup.children)[1]
        head = list(html.children)[0]
        body = list(html.children)[2]
        
        # ***
        nav_menu = list(body.find(class_="nav-tabs-stt"))
        
        previous_price_name = nav_menu[3].find('a').text #ราคาย้อนหลัง
        previous_price_url = nav_menu[3].find('a').get('href')
        company_stock_name = nav_menu[5].find('a').text #บริษัท/หลักทรัพย์
        company_stock_url = nav_menu[5].find('a').get('href') 
        board_of_director_name = nav_menu[7].find('a').text #กรรมการบริษัท
        board_of_director_url = nav_menu[7].find('a').get('href') 
        major_share_holder_name = nav_menu[9].find('a').text #ผู้ถือหุ้นรายใหญ่
        major_share_holder_url = nav_menu[9].find('a').get('href')  
        financial_name = nav_menu[11].find('a').text #งบการเงิน
        financial_url = nav_menu[11].find('a').get('href')

        # Convert to DataFrame         
        stock_df = pd.DataFrame({previous_price_name: [previous_price_url], 
                                 company_stock_name: [company_stock_url],
                                 board_of_director_name: [board_of_director_url],
                                 major_share_holder_name: [major_share_holder_url],
                                 financial_name: [financial_url]
                                })
        stock_df['Stock'] = stock
        return stock_df
    else:
        print("Can't get content from this URL!!!")

In [234]:
other_url = getStockLink('https://www.settrade.com//C13_FastQuote_Main.jsp?txtSymbol=CHOTI','CHOTI')
other_url

Unnamed: 0,กรรมการบริษัท,งบการเงิน,บริษัท/หลักทรัพย์,ผู้ถือหุ้นรายใหญ่,ราคาย้อนหลัง,Stock
0,/C13_FastQuote_Main.jsp?txtSymbol=CHOTI&ssoPag...,/C13_FastQuote_Main.jsp?txtSymbol=CHOTI&ssoPag...,/C13_FastQuote_Main.jsp?txtSymbol=CHOTI&ssoPag...,/C13_FastQuote_Main.jsp?txtSymbol=CHOTI&ssoPag...,/C13_FastQuote_Main.jsp?txtSymbol=CHOTI&ssoPag...,CHOTI


สร้าง Loop ดึงข้อมูลหุ้นทุกตัว

In [235]:
prefix_url = 'https://www.settrade.com'
stock_url_frames = []

for stock_index,stock in stock_table.iterrows():
    suffix_url = stock['ราคาล่าสุด(Home)']
    settrade_url = prefix_url + suffix_url
    stock_url_df = getStockLink(settrade_url,stock['Stock'])
    stock_url_frames.append(stock_url_df)

stock_url_lists = pd.concat(stock_url_frames)

In [237]:
stock_url_lists.head()

Unnamed: 0,กรรมการบริษัท,งบการเงิน,บริษัท/หลักทรัพย์,ผู้ถือหุ้นรายใหญ่,ราคาย้อนหลัง,Stock
0,/C13_FastQuote_Main.jsp?txtSymbol=CHOTI&ssoPag...,/C13_FastQuote_Main.jsp?txtSymbol=CHOTI&ssoPag...,/C13_FastQuote_Main.jsp?txtSymbol=CHOTI&ssoPag...,/C13_FastQuote_Main.jsp?txtSymbol=CHOTI&ssoPag...,/C13_FastQuote_Main.jsp?txtSymbol=CHOTI&ssoPag...,CHOTI
0,/C13_FastQuote_Main.jsp?txtSymbol=EE&ssoPageId...,/C13_FastQuote_Main.jsp?txtSymbol=EE&ssoPageId...,/C13_FastQuote_Main.jsp?txtSymbol=EE&ssoPageId...,/C13_FastQuote_Main.jsp?txtSymbol=EE&ssoPageId...,/C13_FastQuote_Main.jsp?txtSymbol=EE&ssoPageId...,EE
0,/C13_FastQuote_Main.jsp?txtSymbol=GFPT&ssoPage...,/C13_FastQuote_Main.jsp?txtSymbol=GFPT&ssoPage...,/C13_FastQuote_Main.jsp?txtSymbol=GFPT&ssoPage...,/C13_FastQuote_Main.jsp?txtSymbol=GFPT&ssoPage...,/C13_FastQuote_Main.jsp?txtSymbol=GFPT&ssoPage...,GFPT
0,/C13_FastQuote_Main.jsp?txtSymbol=LEE&ssoPageI...,/C13_FastQuote_Main.jsp?txtSymbol=LEE&ssoPageI...,/C13_FastQuote_Main.jsp?txtSymbol=LEE&ssoPageI...,/C13_FastQuote_Main.jsp?txtSymbol=LEE&ssoPageI...,/C13_FastQuote_Main.jsp?txtSymbol=LEE&ssoPageI...,LEE
0,/C13_FastQuote_Main.jsp?txtSymbol=STA&ssoPageI...,/C13_FastQuote_Main.jsp?txtSymbol=STA&ssoPageI...,/C13_FastQuote_Main.jsp?txtSymbol=STA&ssoPageI...,/C13_FastQuote_Main.jsp?txtSymbol=STA&ssoPageI...,/C13_FastQuote_Main.jsp?txtSymbol=STA&ssoPageI...,STA


merge เข้ากับตารางเดิม

In [241]:
stock_url_table = pd.merge(stock_table,stock_url_lists, on=['Stock'], how='left')
print(len(stock_url_table))
stock_url_table.head()

597


Unnamed: 0,Stock,ราคาล่าสุด(Home),Sector,กรรมการบริษัท,งบการเงิน,บริษัท/หลักทรัพย์,ผู้ถือหุ้นรายใหญ่,ราคาย้อนหลัง
0,CHOTI,/C13_FastQuote_Main.jsp?txtSymbol=CHOTI,AGRI,/C13_FastQuote_Main.jsp?txtSymbol=CHOTI&ssoPag...,/C13_FastQuote_Main.jsp?txtSymbol=CHOTI&ssoPag...,/C13_FastQuote_Main.jsp?txtSymbol=CHOTI&ssoPag...,/C13_FastQuote_Main.jsp?txtSymbol=CHOTI&ssoPag...,/C13_FastQuote_Main.jsp?txtSymbol=CHOTI&ssoPag...
1,EE,/C13_FastQuote_Main.jsp?txtSymbol=EE,AGRI,/C13_FastQuote_Main.jsp?txtSymbol=EE&ssoPageId...,/C13_FastQuote_Main.jsp?txtSymbol=EE&ssoPageId...,/C13_FastQuote_Main.jsp?txtSymbol=EE&ssoPageId...,/C13_FastQuote_Main.jsp?txtSymbol=EE&ssoPageId...,/C13_FastQuote_Main.jsp?txtSymbol=EE&ssoPageId...
2,GFPT,/C13_FastQuote_Main.jsp?txtSymbol=GFPT,AGRI,/C13_FastQuote_Main.jsp?txtSymbol=GFPT&ssoPage...,/C13_FastQuote_Main.jsp?txtSymbol=GFPT&ssoPage...,/C13_FastQuote_Main.jsp?txtSymbol=GFPT&ssoPage...,/C13_FastQuote_Main.jsp?txtSymbol=GFPT&ssoPage...,/C13_FastQuote_Main.jsp?txtSymbol=GFPT&ssoPage...
3,LEE,/C13_FastQuote_Main.jsp?txtSymbol=LEE,AGRI,/C13_FastQuote_Main.jsp?txtSymbol=LEE&ssoPageI...,/C13_FastQuote_Main.jsp?txtSymbol=LEE&ssoPageI...,/C13_FastQuote_Main.jsp?txtSymbol=LEE&ssoPageI...,/C13_FastQuote_Main.jsp?txtSymbol=LEE&ssoPageI...,/C13_FastQuote_Main.jsp?txtSymbol=LEE&ssoPageI...
4,STA,/C13_FastQuote_Main.jsp?txtSymbol=STA,AGRI,/C13_FastQuote_Main.jsp?txtSymbol=STA&ssoPageI...,/C13_FastQuote_Main.jsp?txtSymbol=STA&ssoPageI...,/C13_FastQuote_Main.jsp?txtSymbol=STA&ssoPageI...,/C13_FastQuote_Main.jsp?txtSymbol=STA&ssoPageI...,/C13_FastQuote_Main.jsp?txtSymbol=STA&ssoPageI...


In [243]:
stock_url_table.to_csv('../../data/explore/stock_url_list.csv', index=False)