# In [23]:
import numpy as np
import pandas as pd
import time
import datetime
import urllib.parse
from urllib.error import HTTPError

# In [31]:
class j_league_data_site():
    """Scraper for tables published on data.j-league.or.jp.

    Every ``_get_*`` method downloads HTML tables with ``pandas.read_html``
    and writes the combined result to a CSV file under
    ``data/j_league_data_site/`` (the directory must already exist).
    """

    def scraping(self, only_new_data_flg):
        """Run all scraping steps in order and return ``"完了"``.

        Args:
            only_new_data_flg: Forwarded to the rank-table and player-list
                steps. The match schedule step takes no flag and always
                downloads every team.

        Returns:
            The string ``"完了"`` once every step has finished.
        """
        # match_schedule.csv
        self._get_match_schedule()
        # j_rank_table.csv / j_rank_table_has_st.csv
        self._get_j_rank_table(only_new_data_flg)
        # player_list.csv
        self._get_player_list(only_new_data_flg)
        # TODO: starting_member.csv is not produced by this method yet.

        return "完了"

    def _get_player_list(self, only_new_data_flg):
        """Download the per-kana player list pages into ``player_list.csv``.

        One page is requested per entry of the kana index. Tables whose
        first column is not ``全てチェック`` are ignored, rows that merely
        repeat the header text are dropped, and that column is written to
        the CSV under the name ``選手名``.

        Args:
            only_new_data_flg: When truthy, nothing is downloaded or written
                and the method returns immediately.

        Returns:
            None.
        """
        if only_new_data_flg:
            return None

        head = "https://data.j-league.or.jp/SFIX03/createPlayerListInfoByFirstAlphabetList?player_name_first_alphabet="
        idx_list = ['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', 'す', 'せ', 'そ', 'た', 'ち', 'つ', 'て', 'と', 'な', 'に', 'ぬ', 'ね', 'の', 'は', 'ひ', 'ふ', 'へ', 'ほ', 'ま', 'み', 'む', 'め', 'も', 'や', 'ゆ', 'よ', 'ら', 'り', 'る', 'れ', 'ろ', 'わ', 'を', 'ん']
        columns = ['全てチェック', '選手名（英語）', '最終所属', 'ポジション', '生年月日', '身長/体重', 'url']
        # The empty seed frame fixes the column order and keeps the filter
        # below valid even when no page yields a usable table.
        frames = [pd.DataFrame(columns=columns)]

        for idx in idx_list:
            url = head + urllib.parse.quote(idx)
            print('処理中 : ' + idx)

            time.sleep(1)  # pause between requests
            data = pd.read_html(url)
            df_tmp = pd.DataFrame(data[0])
            df_tmp['url'] = url
            # Keep only tables that start with the expected first column.
            if df_tmp.columns.values[0] == columns[0]:
                frames.append(df_tmp)

        # Concatenate once instead of re-copying a growing frame per page.
        df_result = pd.concat(frames)
        # Drop rows that only repeat the header text.
        df_result = df_result[df_result['全てチェック'] != '全てチェック']
        # rename() returns a new frame; the result must be assigned for the
        # new column name to reach the CSV.
        df_result = df_result.rename(columns={'全てチェック': '選手名'})

        df_result.to_csv("data/j_league_data_site/player_list.csv")

    def _get_match_schedule(self):
        """Download the match table of every listed team into ``match_schedule.csv``.

        The first table of each team's search page is taken, a ``Team``
        column holding the team's display name is added, and all tables are
        concatenated (original row indexes are kept).

        Returns:
            None.
        """
        # team_id_list[i] is the site's id for the team named team_name_list[i].
        team_id_list = ["14", "362", "269", "54", "270", "29", "271", "1", "94", "40", "35", "3", "27", "2", "11", "22", "4", "45", "21", "5", "34", "272", "6", "12", "273", "28", "46", "274", "78", "41", "275", "7", "13", "276", "347", "8", "39", "24", "9", "20", "18", "44", "42", "10", "330", "48", "36", "37", "369", "23", "43", "33", "47", "38", "31", "371", "338", "277", "339", "340", "341", "278"]
        team_name_list = ["札幌", "八戸", "岩手", "仙台", "秋田", "山形", "福島", "鹿島", "水戸", "栃木", "群馬", "浦和", "大宮", "千葉", "柏", "FC東京", "東京Ｖ", "町田", "川崎Ｆ", "横浜FM", "横浜FC", "YS横浜", "横浜Ｆ", "湘南", "相模原", "甲府", "松本", "長野", "新潟", "富山", "金沢", "清水", "磐田", "藤枝", "沼津", "名古屋", "岐阜", "京都", "Ｇ大阪", "Ｃ大阪", "神戸", "鳥取", "岡山", "広島", "山口", "讃岐", "徳島", "愛媛", "今治", "福岡", "北九州", "鳥栖", "長崎", "熊本", "大分", "宮崎", "鹿児島", "琉球", "Ｆ東23", "Ｇ大23", "Ｃ大23", "J-22"]
        url_head = "https://data.j-league.or.jp/SFMS01/search?team_ids="
        url_foot = "&home_away_select=0&tv_relay_station_name="

        frames = []
        for team_id, team_name in zip(team_id_list, team_name_list):
            url = url_head + team_id + url_foot
            time.sleep(1)  # pause between requests
            j_data_list = pd.read_html(url)
            df_j_data = pd.DataFrame(j_data_list[0])  # convert to DataFrame

            # Add the team-name column.
            df_j_data["Team"] = team_name
            frames.append(df_j_data)

        pd.concat(frames).to_csv("data/j_league_data_site/match_schedule.csv")

    def _build_rank_url(self, year, section, label, competition_id):
        """Return the SFRT01 search URL for one year/section/competition.

        Args:
            year: Season year as an int.
            section: Section number as an int.
            label: Competition label sent as ``competitionIdLabel``.
            competition_id: Competition id string sent as ``competitionId``.
        """
        # safe="" percent-encodes every reserved character in the label
        # (ASCII space -> %20, ideographic space -> %E3%80%80).
        return (
            "https://data.j-league.or.jp/SFRT01/?competitionSectionIdLabel="
            + urllib.parse.quote("第" + str(section) + "節")
            + "&competitionIdLabel=" + urllib.parse.quote(label, safe="")
            + "&yearIdLabel=" + urllib.parse.quote(str(year) + "年")
            + "&yearId=" + str(year)
            + "&competitionId=" + competition_id
            + "&competitionSectionId=" + str(section)
            + "&search=search"
        )

    def _fetch_rank_table(self, year, section, label, competition_id, category):
        """Fetch one table and tag it; return None when the request fails.

        The first table of the page is returned with ``year``, ``カテゴリ``
        and ``節`` columns added. An ``HTTPError`` from the download is
        treated as "no data for this combination" and yields ``None``.
        """
        url = self._build_rank_url(year, section, label, competition_id)
        time.sleep(0.5)  # pause between requests
        try:
            data_list = pd.read_html(url)
        except HTTPError:
            return None
        df_data = pd.DataFrame(data_list[0])  # convert to DataFrame
        df_data["year"] = year
        df_data["カテゴリ"] = category
        df_data["節"] = section
        return df_data

    def _get_j_rank_table(self, only_new_data_flg):
        """Download J1/J2/J3 tables into ``j_rank_table.csv`` and ``j_rank_table_has_st.csv``.

        For every year from 2012 to the current year and every section from
        1 to 49, each listed competition id is requested in the order J1,
        J2, J3. J1 tables whose label ends in a stage marker (１ｓｔ/２ｎｄ)
        get a ``stage`` column and go to ``j_rank_table_has_st.csv``;
        everything else goes to ``j_rank_table.csv``.

        Args:
            only_new_data_flg: When truthy, both CSV files are loaded first,
                their rows for the current year are discarded, and only the
                current year is downloaded again and appended.

        Returns:
            None.
        """
        # Labels and ids are parallel: competitionId_jN[i] belongs to
        # competitionIdLabel_jN[i].
        competitionIdLabel_j1 = ["Ｊリーグ　ディビジョン１", "Ｊリーグ　ディビジョン１", "Ｊリーグ　ディビジョン１", "明治安田生命Ｊ１リーグ １ｓｔ", "明治安田生命Ｊ１リーグ ２ｎｄ", "明治安田生命Ｊ１リーグ １ｓｔ", "明治安田生命Ｊ１リーグ ２ｎｄ", "明治安田生命Ｊ１リーグ", "明治安田生命Ｊ１リーグ", "明治安田生命Ｊ１リーグ", "明治安田生命Ｊ１リーグ", "明治安田生命Ｊ１リーグ"]
        competitionIdLabel_j2 = ["Ｊリーグ　ディビジョン２", "Ｊリーグ　ディビジョン２", "Ｊリーグ　ディビジョン２", "明治安田生命Ｊ２リーグ", "明治安田生命Ｊ２リーグ", "明治安田生命Ｊ２リーグ", "明治安田生命Ｊ２リーグ", "明治安田生命Ｊ２リーグ", "明治安田生命Ｊ２リーグ", "明治安田生命Ｊ２リーグ"]
        competitionIdLabel_j3 = ["明治安田生命Ｊ３リーグ", "明治安田生命Ｊ３リーグ", "明治安田生命Ｊ３リーグ", "明治安田生命Ｊ３リーグ", "明治安田生命Ｊ３リーグ", "明治安田生命Ｊ３リーグ", "明治安田生命Ｊ３リーグ", "明治安田生命Ｊ３リーグ"]
        competitionId_j1 = ["322", "347", "372", "397", "398", "411", "412", "428", "444", "460", "477", "492"]
        competitionId_j2 = ["323", "348", "373", "400", "413", "429", "445", "467", "478", "493"]
        competitionId_j3 = ["380", "399", "414", "430", "446", "468", "479", "494"]
        path_result = "data/j_league_data_site/j_rank_table.csv"
        path_has_stage = "data/j_league_data_site/j_rank_table_has_st.csv"
        now_year = datetime.datetime.now().year

        # Frames are collected and concatenated once at the end instead of
        # re-copying a growing frame after every request.
        frames = []
        frames_has_stage = []

        if only_new_data_flg:
            # Keep previously saved rows except those of the current year,
            # which are downloaded again below.
            df_saved = pd.read_csv(path_result, index_col=0)
            frames.append(df_saved[df_saved["year"] != now_year].copy())
            df_saved_stage = pd.read_csv(path_has_stage, index_col=0)
            frames_has_stage.append(df_saved_stage[df_saved_stage["year"] != now_year].copy())

        for year in range(2012, now_year + 1):
            if only_new_data_flg and now_year != year:
                continue

            for section in range(1, 50):
                print("処理中 :", year, section)

                # J1
                for label, competition_id in zip(competitionIdLabel_j1, competitionId_j1):
                    df_data = self._fetch_rank_table(year, section, label, competition_id, "J1")
                    if df_data is None:
                        continue
                    if label == "明治安田生命Ｊ１リーグ １ｓｔ":
                        df_data["stage"] = "１ｓｔ"
                        frames_has_stage.append(df_data)
                    elif label == "明治安田生命Ｊ１リーグ ２ｎｄ":
                        # NOTE(review): "2ｓｔ" looks like a typo for "２ｎｄ".
                        # Kept byte-for-byte because incremental runs reload
                        # rows from j_rank_table_has_st.csv that already carry
                        # this value — confirm with CSV consumers before
                        # changing it.
                        df_data["stage"] = "2ｓｔ"
                        frames_has_stage.append(df_data)
                    else:
                        frames.append(df_data)

                # J2, then J3
                for category, labels, competition_ids in (
                    ("J2", competitionIdLabel_j2, competitionId_j2),
                    ("J3", competitionIdLabel_j3, competitionId_j3),
                ):
                    for label, competition_id in zip(labels, competition_ids):
                        df_data = self._fetch_rank_table(year, section, label, competition_id, category)
                        if df_data is not None:
                            frames.append(df_data)

        # Write the CSV files. A file is skipped when nothing was collected
        # for it, instead of failing on a missing frame.
        for collected, path in ((frames, path_result), (frames_has_stage, path_has_stage)):
            if collected:
                pd.concat(collected).to_csv(path)
            else:
                print("スキップ（データなし） :", path)