In [67]:
import pandas as pd
import os
import pymongo  
import csv
import JLParser as ps
import JLFileMgr as fm
import JLScraper as sc  


### Janie's Data Science Job Analysis for Tennessee  2017
----
* Scrape the website, "https://www.indeed.com", for current jobs and save each webpage as html file
* Read each html file and store formatted data into MongoDB job database
* Download excel file from "https://www.bls.gov" for employment by occupation and state
* Read the excel files into dataframe, munge the dataset and merge with US population by state
* Load the formatted job employment data to MongoDB job database
* Select the desired columns from MongoDB and read back to dataframe for analysis
* Save result dataset into excel sheet in Output directory

#### Scrape data from website & save as html files

In [68]:
# Scraping 2 states, 2 job sites, and job title as data scientist, data analyst, data engineer, software developer   
# Inititalize all the variables for scraping
state_list = ["NJ", "TN"] 
scrape_list = []
scrape_dict = {}
scrape_dict["Web_Url"] = "https://www.indeed.com/jobs?q=Data+Analyst%2C+Data+Scientist%2C+Data+Engineer%2C+Software+Development&l="
scrape_dict["Dir_Path"] = "Resources/Job_Search/indeed"
scrape_dict["Base_Url"] = "https://www.indeed.com"
scrape_list.append(scrape_dict)  
scrape_dict = {}
scrape_dict["Web_Url"] = "https://www.itsmycareer.com/results?q=Data-Scientist&l="
scrape_dict["Dir_Path"] = "Resources/Job_Search/itsmycareer"
scrape_dict["Base_Url"] = "https://www.itsmycareer.com"
scrape_list.append(scrape_dict)
print(scrape_list)
# Local Chrome drive path:  os.getcwd() + "/chromedriver"
# test url = 'http://quotes.toscrape.com/' "https://www.itsmycareer.com/results?q=Data-Scientist&l=
driver_path = "C:/Users/Janie228/SCHOOL/Browser_Drivers/chromedriver" 
file_name = "job_list" # partial file name string
timer = 5  # second(s)
user = "Janie"
ttl_pgs = 1


[{'Web_Url': 'https://www.indeed.com/jobs?q=Data+Analyst%2C+Data+Scientist%2C+Data+Engineer%2C+Software+Development&l=', 'Dir_Path': 'Resources/Job_Search/indeed', 'Base_Url': 'https://www.indeed.com'}, {'Web_Url': 'https://www.itsmycareer.com/results?q=Data-Scientist&l=', 'Dir_Path': 'Resources/Job_Search/itsmycareer', 'Base_Url': 'https://www.itsmycareer.com'}]


In [None]:
# Scrape website by looping thru state list and scrape site list
for state in state_list:
    # Loop thru each website
    for record in scrape_list:
        #print(record["Web_Url"])
        # Initialize scraper by setting all required parameters
        # __init__(self, web_url, file_name, ttl_pgs=1, dir_path=os.getcwd(), sleep_time=0, driver_path=os.getcwd()+"/chromedriver"):
        msg = sc.Scraper(str(record["Web_Url"] + state), file_name, ttl_pgs, record["Dir_Path"], timer, driver_path).scrape()

        # Execute scraping function
        print(msg)
 

#### Initialize Mongo database 

In [69]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Define database and collection     
db = client.Job_DB

####  Read html files from directory, parse, return recods as dictionary list, and upsert each to MongoDB

In [70]:
# Loop thru each directory path and read all files
for record in scrape_list:
    # Initialize reading file class for parsing 
    #__init__(self, source_path=os.getcwd(), action_type=None, current_user=None): 
    parse = fm.FileMgr(record["Dir_Path"], "html", "Janie")
    # Return parsed data dictionary list
    result_data = parse.read_all_files(record["Base_Url"])

    #------------------------------------------------------------------------------
    # upload all the parsed records to MongoDB
    # Set record to unique before upsert else no record will upload to database
    [db.jobs.update_one({"title": record["title"], "desc": record["desc"]}, {'$set': record}, upsert=True) for record in result_data]


    print(result_data)


[{'title': 'Software Engineer', 'desc': 'Experience with modern software development practices. The Scientific Data Group in the Computer Science and Mathematics Division at the Oak Ridge National...', 'salary': '', 'web_url': 'https://www.indeed.com', 'job_link': 'https://www.indeed.com/viewjob?jk=5a03dbb55fd0f526&fccid=64e028df9b3fd2c4&vjs=3', 'company': 'Oak Ridge National Laboratory', 'city': 'Oak Ridge', 'state': 'TN', 'zipcode': '37831', 'created_date': '02/26/2019', 'created_by': 'Janie'}, {'title': 'Performance Analyst', 'desc': 'Professional development opportunities and incentives. Weekly Career Development Meetings for your first 60 days....', 'salary': '', 'web_url': 'https://www.indeed.com', 'job_link': 'https://www.indeed.com/viewjob?jk=78465edf350f8279&fccid=2f00ae4e43e5dac3&vjs=3', 'company': 'TechnologyAdvice', 'city': 'Nashville', 'state': 'TN', 'zipcode': '37210', 'created_date': '02/26/2019', 'created_by': 'Janie'}, {'title': 'Data Analyst II - Integration', 'desc':

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



#### Read both excel files, employment statistic and population, from directory path, clean and format, and upload to MongoDB


In [79]:
# Get employment job data from path, read to dataframe, and display
job_filepath = os.path.join(".", "Resources/Employment_Statistic", "state_M2017_dl.xlsx")
job_data = pd.read_excel(job_filepath, index_col=0)
job_df = pd.DataFrame(job_data)
job_df.columns


Index(['ST', 'STATE', 'OCC_CODE', 'OCC_TITLE', 'OCC_GROUP', 'TOT_EMP',
       'EMP_PRSE', 'JOBS_1000', 'LOC_Q', 'H_MEAN', 'A_MEAN', 'MEAN_PRSE',
       'H_PCT10', 'H_PCT25', 'H_MEDIAN', 'H_PCT75', 'H_PCT90', 'A_PCT10',
       'A_PCT25', 'A_MEDIAN', 'A_PCT75', 'A_PCT90', 'ANNUAL', 'HOURLY'],
      dtype='object')

In [80]:
job_df.head()

Unnamed: 0_level_0,ST,STATE,OCC_CODE,OCC_TITLE,OCC_GROUP,TOT_EMP,EMP_PRSE,JOBS_1000,LOC_Q,H_MEAN,...,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY
AREA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,AL,Alabama,00-0000,All Occupations,total,1922570,0.4,1000.0,1.0,20.76,...,15.77,25.01,37.83,17770,21740,32800,52020,78690,,
1,AL,Alabama,11-0000,Management Occupations,major,69950,1.0,36.385,0.71,53.44,...,46.63,64.12,90.8,52130,70130,96980,133360,188860,,
1,AL,Alabama,11-1011,Chief Executives,detailed,1120,6.0,0.585,0.4,99.56,...,94.45,#,#,84520,125290,196460,#,#,,
1,AL,Alabama,11-1021,General and Operations Managers,detailed,27150,1.6,14.123,0.91,58.04,...,49.19,71.09,#,54040,72610,102320,147860,#,,
1,AL,Alabama,11-1031,Legislators,detailed,1100,12.0,0.572,1.59,*,...,*,*,*,16310,17280,18910,24630,47510,True,


In [82]:
# Clean data: replacing all nan, *, #
# 101.1 & 208000.1: indicates a wage that is equal to or greater than $100.00 per hour or $208,000 per year
# 0 indicates that a wage estimate is not available
# 0 indicates that an employment estimate is not available
job_df.replace('#', '101.1')
job_df = job_df.replace("*", 0).replace("**", 0) #astype(str)
job_df.fillna(False,inplace=True)
job_df["HOURLY"].fillna(False,inplace=True)
job_df["H_PCT10"] = job_df["H_PCT10"].astype(str).replace("#", 101.1).astype(float)
job_df["H_PCT25"] = job_df["H_PCT25"].astype(str).replace("#", 101.1).astype(float)
job_df["H_MEAN"] = job_df["H_MEAN"].astype(str).replace("#", 101.1).astype(float)
job_df["H_MEDIAN"] = job_df["H_MEDIAN"].astype(str).replace("#", 101.1).astype(float)
job_df["H_PCT75"] = job_df["H_PCT75"].astype(str).replace("#", 101.1).astype(float)
job_df["H_PCT90"] = job_df["H_PCT90"].astype(str).replace("#", 101.1).astype(float)
job_df["A_PCT10"] = job_df["A_PCT10"].astype(str).replace("#", 208000.1).astype(float)
job_df["A_PCT25"] = job_df["A_PCT25"].astype(str).replace("#", 208000.1).astype(float)
job_df["A_PCT75"] = job_df["A_PCT75"].astype(str).replace("#", 208000.1).astype(float)
job_df["A_PCT90"] = job_df["A_PCT90"].astype(str).replace("#", 208000.1).astype(float)
job_df["A_MEAN"] = job_df["A_MEAN"].astype(str).replace("#", 208000.1).astype(float)
job_df["A_MEDIAN"] = job_df["A_MEDIAN"].astype(str).replace("#", 208000.1).astype(float)
# Rename
job_rt = job_df.rename(columns={"EMP_PRSE": "EMP_STNDERROR", "MEAN_PRSE": "MEAN_STNDERROR"})
job_rt.head()

Unnamed: 0_level_0,ST,STATE,OCC_CODE,OCC_TITLE,OCC_GROUP,TOT_EMP,EMP_STNDERROR,JOBS_1000,LOC_Q,H_MEAN,...,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY
AREA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,AL,Alabama,00-0000,All Occupations,total,1922570,0.4,1000.0,1.0,20.76,...,15.77,25.01,37.83,17770.0,21740.0,32800.0,52020.0,78690.0,False,False
1,AL,Alabama,11-0000,Management Occupations,major,69950,1.0,36.385,0.71,53.44,...,46.63,64.12,90.8,52130.0,70130.0,96980.0,133360.0,188860.0,False,False
1,AL,Alabama,11-1011,Chief Executives,detailed,1120,6.0,0.585,0.4,99.56,...,94.45,101.1,101.1,84520.0,125290.0,196460.0,208000.1,208000.1,False,False
1,AL,Alabama,11-1021,General and Operations Managers,detailed,27150,1.6,14.123,0.91,58.04,...,49.19,71.09,101.1,54040.0,72610.0,102320.0,147860.0,208000.1,False,False
1,AL,Alabama,11-1031,Legislators,detailed,1100,12.0,0.572,1.59,0.0,...,0.0,0.0,0.0,16310.0,17280.0,18910.0,24630.0,47510.0,True,False


In [83]:
# Numbers of rows & columns in dataset
job_rt.shape

(36992, 24)

In [84]:
# Columns datatype
job_rt.dtypes

ST                 object
STATE              object
OCC_CODE           object
OCC_TITLE          object
OCC_GROUP          object
TOT_EMP             int64
EMP_STNDERROR     float64
JOBS_1000         float64
LOC_Q             float64
H_MEAN            float64
A_MEAN            float64
MEAN_STNDERROR    float64
H_PCT10           float64
H_PCT25           float64
H_MEDIAN          float64
H_PCT75           float64
H_PCT90           float64
A_PCT10           float64
A_PCT25           float64
A_MEDIAN          float64
A_PCT75           float64
A_PCT90           float64
ANNUAL               bool
HOURLY               bool
dtype: object

In [85]:
# Count all the rows for all coloumns
job_rt.count()


ST                36992
STATE             36992
OCC_CODE          36992
OCC_TITLE         36992
OCC_GROUP         36992
TOT_EMP           36992
EMP_STNDERROR     36992
JOBS_1000         36992
LOC_Q             36992
H_MEAN            36992
A_MEAN            36992
MEAN_STNDERROR    36992
H_PCT10           36992
H_PCT25           36992
H_MEDIAN          36992
H_PCT75           36992
H_PCT90           36992
A_PCT10           36992
A_PCT25           36992
A_MEDIAN          36992
A_PCT75           36992
A_PCT90           36992
ANNUAL            36992
HOURLY            36992
dtype: int64

In [86]:
# All unique occupations
job_rt['OCC_TITLE'].unique()


array(['All Occupations', 'Management Occupations', 'Chief Executives',
       'General and Operations Managers', 'Legislators',
       'Advertising and Promotions Managers', 'Marketing Managers',
       'Sales Managers', 'Public Relations and Fundraising Managers',
       'Administrative Services Managers',
       'Computer and Information Systems Managers', 'Financial Managers',
       'Industrial Production Managers', 'Purchasing Managers',
       'Transportation, Storage, and Distribution Managers',
       'Compensation and Benefits Managers', 'Human Resources Managers',
       'Training and Development Managers', 'Construction Managers',
       'Education Administrators, Preschool and Childcare Center/Program',
       'Education Administrators, Elementary and Secondary School',
       'Education Administrators, Postsecondary',
       'Education Administrators, All Other',
       'Architectural and Engineering Managers', 'Food Service Managers',
       'Funeral Service Managers', '

In [87]:
# Number of uniqe occupations
len(job_rt['OCC_TITLE'].unique())

831

In [88]:
# Unique states
job_rt['STATE'].unique()

array(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Connecticut', 'Delaware', 'District of Columbia',
       'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana',
       'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
       'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
       'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
       'New Jersey', 'New Mexico', 'New York', 'North Carolina',
       'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania',
       'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee',
       'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
       'West Virginia', 'Wisconsin', 'Wyoming', 'Guam', 'Puerto Rico',
       'Virgin Islands'], dtype=object)

In [89]:
# Number of unique states
len(job_rt['STATE'].unique())

54

In [91]:
# Upload job statistic to employment table   
db.employment.insert_many(job_rt.to_dict('records'))


<pymongo.results.InsertManyResult at 0x1a10af6b6c8>

In [90]:
# Get population data from path, read to dataframe, and display
population_filepath = os.path.join(".", "Resources/Employment_Statistic", "state_pop_2010-17.csv")
population_data = pd.read_csv(population_filepath, index_col=0)
population_df = pd.DataFrame(population_data)
population_df.head()


Unnamed: 0,State,Census,Estimates Base,2010,2011,2012,2013,2014,2015,2016,2017
0,Alabama,4779736,4780135,4785579,4798649,4813946,4827660,4840037,4850858,4860545,4874747
1,Alaska,710231,710249,714015,722259,730825,736760,736759,737979,741522,739795
2,Arizona,6392017,6392309,6407002,6465488,6544211,6616124,6706435,6802262,6908642,7016270
3,Arkansas,2915918,2916031,2921737,2938640,2949208,2956780,2964800,2975626,2988231,3004279
4,California,37253956,37254518,37327690,37672654,38019006,38347383,38701278,39032444,39296476,39536653


In [63]:
# Filter only state & 2017 population
pop_df = population_df[["State", "2017"]].rename(columns={"2017": "Population"})
pop_df["Population"] = pop_df["Population"].str.replace(",", "").astype(int)
pop_df.head()

Unnamed: 0,State,Population
0,Alabama,4874747
1,Alaska,739795
2,Arizona,7016270
3,Arkansas,3004279
4,California,39536653


In [92]:
# Numbers of states
len(pop_df['State'].unique())                                       

51

In [93]:
# Columns datatype
pop_df.dtypes

State         object
Population     int32
dtype: object

In [94]:
# Upload state population to population table   
db.population.insert_many(pop_df.to_dict('records'))

<pymongo.results.InsertManyResult at 0x1a101368a48>

In [96]:

documents = db.employment.find({'OCC_GROUP': 'total'})
df = pd.DataFrame(list(documents))

len(df)

54

In [None]:
# Wealthy_rt.drop(Wealthy_rt.columns[[3,4]],inplace=True, axis=1)