In [1]:
import pandas as pd 
import numpy as np
import requests 
from bs4 import BeautifulSoup
import pickle
import gc
import multiprocessing
from multiprocessing import Pool
from time import sleep
import requests
import re

In [2]:
# Count the CPUs reported for this machine; the bare name on the last line
# makes the notebook display the value.
cores = multiprocessing.cpu_count()
cores

8

### Get the list of websites

In [None]:
# First page of the exhibitor directory (this URL carries no `page` parameter),
# filtered by product_categories=20821 and countryfield=US
websites = ['https://mrolinks.mro-network.com/exhibitordirectory?product_categories=20821&countryfield=US']

In [None]:
# Get the rest of the web pages (pages 1-86 use an explicit `page` parameter).
# The list is rebuilt from its first-page entry instead of being appended to
# in place, so re-running this cell cannot add duplicate URLs.
websites = websites[:1] + [
    'https://mrolinks.mro-network.com/exhibitordirectory?page=' + str(i)
    + '&product_categories=20821&countryfield=US'
    for i in range(1, 87)
]

In [None]:
# Sanity check: expect 87 URLs (the first page plus pages 1-86) when the
# cells above have each been run once from a fresh kernel
len(websites)

### Get the link to each company page 

#### Define function 

In [None]:
def unique_list(links):
    '''
    Return the items of ``links`` with duplicates removed, keeping first-seen order.

    Parameters
    ----------
    links : iterable
        Items to de-duplicate.

    Returns
    -------
    list
        A new list holding the first occurrence of each distinct item.
    '''
    # Materialise once so the fallback below can re-iterate even if the
    # caller passed a one-shot iterator.
    links = list(links)
    try:
        # dict keys are unique and keep insertion order, so this removes
        # duplicates in one linear pass instead of rescanning a list per item.
        return list(dict.fromkeys(links))
    except TypeError:
        # Unhashable items cannot be dict keys; fall back to an equality scan.
        checked = []
        for link in links:
            if link not in checked:
                checked.append(link)
        return checked

In [None]:
# Return the company links found on one directory page

def get_list(web):
    '''
    Return the unique company-page links found on one directory page.

    Parameters
    ----------
    web : str
        URL of a directory listing page.

    Returns
    -------
    list of str
        De-duplicated ``href`` values matching "/company/", in first-seen
        order, excluding any href that contains "https".

    Raises
    ------
    requests.exceptions.RequestException
        If the request times out, fails, or returns an HTTP error status.
    '''
    # Without a timeout a stalled server would hang the whole crawl.
    page = requests.get(str(web), timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page into an empty list.
    page.raise_for_status()

    soup = BeautifulSoup(page.content, 'html.parser')  # Create a BeautifulSoup object

    # All <a> tags that have no class attribute, an href matching "/company/",
    # and a string. `string=` is the current name of the filter that older
    # Beautiful Soup releases called `text=`.
    company_list = soup.find_all("a", class_=False, href=re.compile("/company/"), string=True)

    company_links = [a['href'].strip() for a in company_list]  # Get all href values

    company_links = unique_list(company_links)  # Drop repeats, keep order

    # Drop any href containing "https" (treated as irrelevant links).
    return [link for link in company_links if 'https' not in link]

    

##### TODO: add multiprocessing to the for loop if I have more time

In [None]:
# Define an empty list to collect the company links from every page
MRO = []

In [None]:
%%time 
# Rebuild MRO from scratch on every run, so re-executing this cell cannot
# append a second copy of every link to an already-filled list.
MRO = [link for web in websites for link in get_list(web)]

In [None]:
# Number of company links collected (get_list de-duplicates within a page
# only, so a link repeated on different pages is counted more than once)
len(MRO)

In [None]:
# Peek at the first ten collected links
MRO[:10]

In [None]:
%%time

# Fetch the directory pages with a small worker pool.
# - `map` hands each URL to get_list as its single argument and returns the
#   per-page results in input order. `apply(get_list, args=web)` would unpack
#   the URL string into one argument per character, and it blocks on every
#   call, so nothing would run in parallel.
# - The `with` block shuts the pool down once the work is finished.
# NOTE(review): worker processes must be able to import/pickle get_list; with
# the "spawn" start method this may fail for functions defined inside a
# notebook - confirm on the target OS.
with Pool(processes=3) as p:
    MRO1 = p.map(get_list, websites)

### Get Company Name and Zip Code

In [None]:
# Return the company name and zip code from one company page

def get_company_info(web):
    '''
    Return the company name and zip code shown on a company page.

    Parameters
    ----------
    web : str
        Link to the company page; it is appended to the site root
        'https://mrolinks.mro-network.com'.

    Returns
    -------
    tuple of (str, str)
        ``(company_name, zipcode)``, each stripped of surrounding whitespace.

    Raises
    ------
    requests.exceptions.RequestException
        If the request times out, fails, or returns an HTTP error status.
    IndexError
        If the page has no matching ``<h1>`` or ``postal-code`` span.
    '''
    web = 'https://mrolinks.mro-network.com' + str(web)
    # Without a timeout a single stalled page would hang the whole crawl.
    page = requests.get(web, timeout=30)
    # Surface HTTP errors directly instead of failing later on a missing tag.
    page.raise_for_status()

    soup = BeautifulSoup(page.content, 'html.parser')  # Create a BeautifulSoup object

    # First <h1> that has a string, and first <span class="postal-code"> that
    # has a string. `string=` is the current name of the filter that older
    # Beautiful Soup releases called `text=`.
    name_tag = soup.find("h1", string=True)
    zip_tag = soup.find("span", {'class': 'postal-code'}, string=True)
    if name_tag is None or zip_tag is None:
        # Same exception type as indexing an empty result list, with a
        # message that says which page was incomplete.
        raise IndexError('company name or postal code not found on ' + web)

    return name_tag.text.strip(), zip_tag.text.strip()

In [None]:
%%time 
company_names = []
zipcodes = []

# NOTE(review): this loop previously iterated over `MRO_website`, a name that
# is never defined in this notebook. MRO (the company links collected above)
# is assumed to be the intended input - confirm.
for web in MRO:
    try:
        company_name, zipcode = get_company_info(web)
    except Exception as exc:
        # Best effort: report the failing link and carry on. Catching
        # Exception rather than using a bare `except:` keeps Ctrl-C / kernel
        # interrupt able to stop this long-running loop.
        print(web, repr(exc))
    else:
        # Append both values together so the two lists stay aligned.
        company_names.append(company_name)
        zipcodes.append(zipcode)

In [None]:
# A cell displays only its last expression, so return both lengths as a tuple
len(company_names), len(zipcodes)

In [None]:
# Hard-coded list of 5-digit ZIP codes stored as ints.
# NOTE(review): the name suggests "ZIP codes within 30 miles" of some
# reference point - confirm the origin and source of this list.
# NOTE(review): not referenced by any other visible cell. The scraped
# `zipcodes` are strings (from `.text.strip()`), so convert one side before
# comparing against these ints.
miles30 = [98134,98106,98126,98161,98104,98174,98144,98108,98164,98154,98111,98114,98124,98129,98138,98145,
           98170,98181,98185,98190,98191,98113,98127,98139,98141,98165,98175,98194,98101,98116,98121,98122,
           98136,98118,98109,98102,98112,98040,98119,98146,98039,98195,98168,98199,98178,98105,98004,98107,
           98103,98009,98015,98062,98131,98056,98115,98005,98117,98158,98166,98386,98148,98057,98006,98188,
           98033,98007,98083,98353,98384,98061,98125,98055,98110,98008,98366,98034,98059,98133,98070,98177,
           98378,98073,98052,98031,98032,98155,98198,98028,98310,98082,98013,98011,98160,98337,98058,98064,
           98035,98089,98075,98314,98074,98041,98311,98029,98359,98393,98392,98043,98342,98030,98345,98053,
           98367,98036,98020,98021,98072,98027,98322,98046,98042,98077,98370,98003,98026,98001,98023,98037,
           98063,98071,98093,98332,98002,98383,98346,98038,98395,98050,98312,98422,98012,98315,98407,98087,
           98014,98421,98417,98403,98047,98402,98092,98354,98329,98296,98416,98413,98401,98411,98412,98415,
           98431,98464,98471,98481,98493,98497,98448,98025,98204,98406,98364,98424,98405,98335,98275,98208,
           98010,98465,98419,98418,98051,98372,98404,98390,98371,98443,98466,98409,98352,98340,98024,98490,
           98065,98291,98333,98203,98408,98380,98394,98236,98467,98019,98349,98365,98391,98496,98524,98207,
           98528,98213,98499,98376,98373,98272,98201,98290,98447,98444,98498,98206,98398,98445,98388,98446,
           98374,98438,98439,98385,98351,98375,98260,98588,98546,98249,98430,98303,98270,98293,98325,98396,
           98258,98387,98339,98271,98327,98433,98516,98358,98045,98253,98592,98338,98251,98555,98368]

In [None]:
# Assemble the scraped names and zip codes into a two-column frame
df = pd.DataFrame(dict(MRO=company_names, zipcode=zipcodes))

In [None]:
# Preview the first rows of the assembled table
df.head()

In [None]:
# Write the table to MRO_List.csv (path relative to the working directory).
# With pandas' default index=True the row index is written as an unnamed
# first column.
df.to_csv('MRO_List.csv')