# Transition Matrices: Fortune Global 500 Ranking web scraper

## 1. Web Scraping

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import os
import json
from time import time, sleep
from random import randint
import re

def get_json_script(link, timeout=30):
    """
    To extract a json script from the Fortune website

    Downloads the page at `link` and decodes the text of its
    <script id="__NEXT_DATA__"> element as JSON.

    Args:
        link : str
            URL of the page to download.
        timeout : float, optional
            Seconds to wait for the server before giving up (default 30).

    Returns:
        dict

    Raises:
        requests.HTTPError
            If the server answers with an error status code.
        ValueError
            If the page has no '__NEXT_DATA__' script tag with text content.
    """
    # Without a timeout a stalled connection would block the run forever.
    page = requests.get(link, timeout=timeout)
    # Fail on 4xx/5xx here instead of trying to parse an error page.
    page.raise_for_status()

    soup = BeautifulSoup(page.content, 'html.parser')
    script_tag = soup.find('script', {'id': '__NEXT_DATA__'})
    if script_tag is None or script_tag.string is None:
        raise ValueError(f"No '__NEXT_DATA__' script found at {link}")
    return json.loads(script_tag.string)

def parse_norm_data(json_script):
    """
    To flatten the list of items held in a page's json script into a table

    Args:
        json_script : dict
            Must contain the nested keys
            'props' -> 'pageProps' -> 'franchiseList' -> 'items'.

    Returns:
        DataFrame
    """
    # Walk down the nested keys one level at a time to reach the items.
    items = json_script
    for key in ('props', 'pageProps', 'franchiseList', 'items'):
        items = items[key]
    # Nested dictionaries become dot-separated columns (e.g. 'data.Rank').
    return pd.json_normalize(items)

def text_cleaning(text):
    """
    To clean text

    Removes every character that is not an ASCII letter, a digit or
    whitespace, lower-cases the result and replaces each run of whitespace
    (spaces, tabs, newlines) with a single underscore.

    Args:
        text : str

    Returns:
        str
    """
    txt_temp = re.sub(r'[^a-zA-Z0-9\s]+', '', text).lower()
    # Collapse whole whitespace runs in one pass; chained str.replace calls
    # only merge one pair of spaces and leave tabs/newlines untouched.
    return re.sub(r'\s+', '_', txt_temp)

print("The script is running...")
t_start = time()

# Output folder: <parent of the current working directory>/data/web_scraper.
# os.path.join is used instead of hard-coded backslashes so the path works on
# every OS and does not rely on the invalid escape sequences '\d' and '\w'.
path_parent_dir = os.path.dirname(os.getcwd())
path_data_web_scraper = os.path.join(path_parent_dir, 'data', 'web_scraper')
# Create the folder up front so the CSV exports cannot fail on a missing
# directory after all pages have already been downloaded.
os.makedirs(path_data_web_scraper, exist_ok=True)

website = 'https://fortune.com/ranking/global500/search/'
# Drop the trailing '/' and the last path segment to get the ranking root URL.
website_parent = os.path.dirname(website[0:-1])

# The landing page reports its own year in 'initialYear'.
json_script = get_json_script(website)
start_year = int(json_script['props']['pageProps']['initialYear'])
# The three years preceding start_year, newest first.
year_range = list(range(start_year - 1, start_year - 4, -1))

df_fortune_rank = parse_norm_data(json_script)
df_fortune_rank.insert(0, "year", start_year)

# One link per earlier year: <root>/<year>/search/ (website[-8:] is '/search/').
dict_links = {i: f'{website_parent}/{str(i)}{website[-8:]}' for i in year_range}

# Collect the yearly frames and concatenate once at the end.
frames = [df_fortune_rank]
for k, v in dict_links.items():
    json_script = get_json_script(v)
    # Random 2-5 second pause between requests.
    sleep(randint(2,5))
    df_temp = parse_norm_data(json_script)
    df_temp.insert(0, "year", k)
    frames.append(df_temp)

df_fortune_rank = pd.concat(frames)

## 2. Data cleaning and transformation

In [None]:
## RANKING GLOBAL PREPARATION
# Drop 'data.Rank', then order the rows by year (newest first) and by rank.
df_fortune_rank.drop(columns=['data.Rank'], inplace = True)
df_fortune_rank = df_fortune_rank.sort_values(['year','rank'], ascending = [False, True]).reset_index(drop=True)
# Strip the 'data.' prefix produced by json_normalize and clean the remainder;
# columns without that prefix keep their name.
df_fortune_rank.columns = [text_cleaning(col[5:len(col)]) if col.startswith('data.') \
                           else col for col in  df_fortune_rank.columns]
    
# Exclude one specific row: Shandong Energy Group, year 2020, rank 295.
# NOTE(review): presumably a duplicate entry in the source data - confirm.
df_fortune_rank = df_fortune_rank.loc[~(
                                       (df_fortune_rank["name"]=="Shandong Energy Group")
                                       &    
                                       (df_fortune_rank["year"]==2020)
                                       & 
                                       (df_fortune_rank["rank"]==295)
                                       )]

df_fortune_rank.to_csv(f'{path_data_web_scraper}/fortune_ranking_global_500_raw.csv', header=True, index=False, encoding='utf-8-sig',sep=';')

## COMPANY MASTER DATA
# Flag, per company name, the rows belonging to its most recent year.
df_fortune_rank['max_year'] = np.where(df_fortune_rank.groupby(['name'])['year'].rank("dense", ascending=False)==1, 1, 0)
# Keep the flagged rows only; the columns are selected by position.
# NOTE(review): the positional indices depend on the column order delivered by
# the website - verify them if the site layout changes.
df_company_masterdata = df_fortune_rank.iloc[:,np.r_[1,4:21,-4:-1]][(df_fortune_rank['max_year']==1)].reset_index(drop=True).copy()
df_company_masterdata.fillna('N/A', inplace = True)
df_company_masterdata = df_company_masterdata.replace('','N/A',regex = True)

## RANKING GLOBAL
# Reduce the ranking frame to a positional subset of columns (see note above).
df_fortune_rank = df_fortune_rank.iloc[:,np.r_[0:2,21,23,25:27]].copy()

# Strip '$', '%' and ',' from the values. The replacement runs over every
# column of the frame, including the id columns used by melt below.
# r'\$' is a raw string: in a plain literal '\$' is an invalid escape sequence.
# NOTE(review): the '' -> '0' entry is presumably meant to turn empty cells
# into '0'; confirm how the installed pandas treats an empty pattern with
# regex=True.
for k, v in {r'\$': '','%': '', ',': '', '': '0'}.items():
    df_fortune_rank = df_fortune_rank.replace(k, v, regex=True)

df_fortune_rank.fillna('0', inplace = True)
# Wide to long: one row per (year, name, measure) with its value.
df_fortune_rank = df_fortune_rank.melt(id_vars=['year', 'name'], var_name = 'measure')
df_fortune_rank.to_csv(f'{path_data_web_scraper}/fortune_ranking_global_500.csv', header=True, index=False, encoding='utf-8-sig',sep=';')
df_company_masterdata.to_csv(f'{path_data_web_scraper}/fortune_company_master_data.csv', header=True, index=False, encoding='utf-8-sig',sep=';')
print("...it has been successfully executed in %0.1fs." % (time() - t_start))