In [292]:
import pandas as pd
import os
import pymongo  
import csv
import JLParser as ps
import JLFileMgr as fm
import JLScraper as sc 
import datetime as dt
import numpy as np


### Janie's Data Science Job Analysis for Tennessee  2017
----
* Select the desired columns from MongoDB and read back to dataframe for analysis
* Save result dataset into excel sheet in Output directory

#### Get selected data back from MongoDB, merge, and output to csv.

In [377]:
# Get employment total by state
employ_data = pd.DataFrame.from_records(db.employment.find({'Occ_Group': 'total'}, {'State': 1, 'Tot_Emp': 1,
                                                        'Yr_Mean_Wage': 1, 'Hr_Mean_Wage': 1, '_id': 0 }))
employ_data = employ_data.rename(columns={"Tot_Emp": "Employment"})
employ_data.shape


(54, 4)

In [378]:
# Get employment total by state
pop_data = pd.DataFrame.from_records(db.population.find({}, {'State': 1, 'Population': 1, '_id': 0 }))

pop_data.shape

(51, 2)

In [379]:
employ_df = pd.merge(employ_data, pop_data, how="left", on="State")
employ_df = employ_df.fillna(0)
employ_df["Population"] = employ_df["Population"].astype(int)
employ_df["Yr_Mean_Wage"] = employ_df["Yr_Mean_Wage"].astype(int)
employ_rt = employ_df[["State", "Population", "Employment", "Yr_Mean_Wage", "Hr_Mean_Wage"]]
employ_rt.head()


Unnamed: 0,State,Population,Employment,Yr_Mean_Wage,Hr_Mean_Wage
0,Alabama,4874747,1922570,43170,20.76
1,Alaska,739795,318170,57750,27.77
2,Arizona,7016270,2704050,48160,23.15
3,Arkansas,3004279,1200130,40530,19.49
4,California,39536653,16695010,57190,27.5


In [380]:
# Employment and average wage by state output excel file
Employ_rt_path = os.path.join(".", "Output", "Employment_State_2017.csv")
employ_rt.to_csv(Employ_rt_path)

# Sort by annual wage
employ_rt = employ_rt.sort_values("Yr_Mean_Wage", ascending=False)
employ_rt.head()

Unnamed: 0,State,Population,Employment,Yr_Mean_Wage,Hr_Mean_Wage
8,District of Columbia,693972,708220,85720,41.21
21,Massachusetts,6859819,3528070,62110,29.86
32,New York,19849399,9207870,60100,28.9
6,Connecticut,3588184,1654420,59410,28.56
1,Alaska,739795,318170,57750,27.77


In [381]:
# US Employment and average wage by occupation category
employ_data = pd.DataFrame.from_records(db.employment.find({'Occ_Group': 'major'}, {'State': 1, 'Occupation': 1, 'Tot_Emp': 1,
                                                        'Yr_Mean_Wage': 1, 'Hr_Mean_Wage': 1, 'Jobs_1000': 1, '_id': 0}))

employ_data["Occupation"] = employ_data["Occupation"].str.replace(" Occupations", "")
occ_grp = employ_data.groupby("Occupation")
avg_yr_wage = occ_grp["Yr_Mean_Wage"].mean().map("${:,.2f}".format)

employ_summary = pd.DataFrame({"Avg Wage/Yr": avg_yr_wage})
employ_summary["Avg Wage/Hr"] = occ_grp["Hr_Mean_Wage"].mean().map("${:,.2f}".format)
employ_summary["Employment"] = occ_grp["Tot_Emp"].sum().map("{:,}".format)

employ_summary

Unnamed: 0_level_0,Avg Wage/Yr,Avg Wage/Hr,Employment
Occupation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Architecture and Engineering,"$80,754.44",$38.83,2531630
"Arts, Design, Entertainment, Sports, and Media","$50,777.78",$24.41,1933340
Building and Grounds Cleaning and Maintenance,"$28,180.37",$13.55,4470290
Business and Financial Operations,"$70,007.59",$33.66,7518560
Community and Social Service,"$45,864.26",$22.05,2113720
Computer and Mathematical,"$79,656.30",$38.30,4273060
Construction and Extraction,"$48,161.48",$23.15,5762060
"Education, Training, and Library","$52,036.48",$25.02,8800500
"Farming, Fishing, and Forestry","$32,433.15",$15.59,472900
Food Preparation and Serving Related,"$24,333.33",$11.70,13275170


In [382]:
# Employment and average wage by state output excel file
occ_path = os.path.join(".", "Output", "Employment_Occupation_2017.csv")
employ_data.to_csv(occ_path)


In [383]:
tn_data = employ_data.loc[(employ_data["Occupation"] == "Computer and Mathematical") & ((employ_data["State"] == "New Jersey") | (employ_data["State"] == "Tennessee"))]  
tn_data


Unnamed: 0,Hr_Mean_Wage,Jobs_1000,Occupation,State,Tot_Emp,Yr_Mean_Wage
662,48.34,34.846,Computer and Mathematical,New Jersey,139640,100540.0
926,35.6,18.77,Computer and Mathematical,Tennessee,54730,74050.0


In [384]:
# US Employment and average wage by specific occupation
occ_data = pd.DataFrame.from_records(db.employment.find({'Occ_Group': 'detailed'}, {'State': 1, 'Occupation': 1, 'Tot_Emp': 1,
                                                        'Yr_Mean_Wage': 1, 'Hr_Mean_Wage': 1, 'Jobs_1000': 1, '_id': 0}))

occ_data["Occupation"].unique()

array(['Chief Executives', 'General and Operations Managers',
       'Legislators', 'Advertising and Promotions Managers',
       'Marketing Managers', 'Sales Managers',
       'Public Relations and Fundraising Managers',
       'Administrative Services Managers',
       'Computer and Information Systems Managers', 'Financial Managers',
       'Industrial Production Managers', 'Purchasing Managers',
       'Transportation, Storage, and Distribution Managers',
       'Compensation and Benefits Managers', 'Human Resources Managers',
       'Training and Development Managers', 'Construction Managers',
       'Education Administrators, Preschool and Childcare Center/Program',
       'Education Administrators, Elementary and Secondary School',
       'Education Administrators, Postsecondary',
       'Education Administrators, All Other',
       'Architectural and Engineering Managers', 'Food Service Managers',
       'Funeral Service Managers', 'Lodging Managers',
       'Medical and Health

In [385]:
# Filter by specific occupations
occ_rt = occ_data.loc[ (occ_data["Occupation"].str.contains("Computer|Software|Developer"))]  
occ_rt.shape


(1007, 6)

In [386]:
# Summary of specific occupation wage
occ_grp = occ_rt.groupby("Occupation")
avg_yr_wage = occ_grp["Yr_Mean_Wage"].mean().map("${:,.2f}".format)

occ_summary = pd.DataFrame({"Avg Wage/Yr": avg_yr_wage})
occ_summary["Avg Wage/Hr"] = occ_grp["Hr_Mean_Wage"].mean().map("${:,.2f}".format)
occ_summary["Employment"] = occ_grp["Tot_Emp"].sum().map("{:,}".format)

occ_summary

Unnamed: 0_level_0,Avg Wage/Yr,Avg Wage/Hr,Employment
Occupation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Computer Hardware Engineers,"$91,900.00",$44.18,59650
Computer Network Architects,"$94,698.27",$45.53,158460
Computer Network Support Specialists,"$60,674.63",$29.17,186870
"Computer Numerically Controlled Machine Tool Programmers, Metal and Plastic","$50,574.52",$24.32,23720
"Computer Occupations, All Other","$83,320.38",$40.06,307640
Computer Operators,"$44,043.65",$21.17,39650
Computer Programmers,"$80,909.81",$38.90,242140
"Computer Science Teachers, Postsecondary","$84,824.40",$0.00,32270
Computer Systems Analysts,"$83,373.52",$40.08,583230
Computer User Support Specialists,"$50,695.56",$24.37,617350


In [387]:
# Employment and average wage by specific occupation output excel file
occ_path = os.path.join(".", "Output", "Specific_Occupation_2017.csv")
occ_rt.to_csv(occ_path)


In [388]:
tn_data = occ_rt.loc[(occ_rt["State"] == "New Jersey") | (occ_rt["State"] == "Tennessee")]  
tn_data = tn_data.sort_values(["Occupation", "State"])

tn_data

Unnamed: 0,Hr_Mean_Wage,Jobs_1000,Occupation,State,Tot_Emp,Yr_Mean_Wage
20475,51.82,0.2,Computer Hardware Engineers,New Jersey,800,107790.0
28742,41.8,0.248,Computer Hardware Engineers,Tennessee,720,86950.0
20458,61.72,1.312,Computer Network Architects,New Jersey,5260,128370.0
28728,47.7,0.753,Computer Network Architects,Tennessee,2200,99210.0
20460,38.94,1.546,Computer Network Support Specialists,New Jersey,6200,81000.0
28730,28.42,1.226,Computer Network Support Specialists,Tennessee,3570,59110.0
21003,30.4,0.13,Computer Numerically Controlled Machine Tool P...,New Jersey,520,63230.0
29275,25.89,0.132,Computer Numerically Controlled Machine Tool P...,Tennessee,380,53850.0
20461,46.97,2.364,"Computer Occupations, All Other",New Jersey,9470,97690.0
28731,36.65,1.242,"Computer Occupations, All Other",Tennessee,3620,76240.0


In [389]:
# Get Jobs 
job_data = pd.DataFrame.from_records(db.jobs.find({}, {"state": 1, "city": 1, "zipcode": 1, "title": 1, "salary": 1, "_id": 0 }))
job_data = job_data.replace(np.nan, '', regex=True)
job_data.shape


(2906, 5)

In [390]:
# Filter by specific occupations
search_str = 'Software Developer|Software|Computer Scientist|Analytical|Data Scientist|Artificial Intelligence|AI|BI|Business Intelligence|Developer|Information Technology|Data Analyst|Programmer|Business Analyst|Web Services|Web Development|Software Development|Data Engineer|Software Engineering|Data Science|Application|Machine Learning|eCommerce Analyst|Web Analyst|VB.NET|C#|SQL'
job_rt = job_data.loc[(job_data["title"].str.contains(search_str))] 
job_rt

Unnamed: 0,city,salary,state,title,zipcode
0,Oak Ridge,,TN,Software Engineer,37831
2,Chattanooga,,TN,Data Analyst II - Integration,37402
5,Pleasant View,,TN,Software Support Engineer,
6,Nashville,,TN,Data Analyst III,
7,Tennessee,,,Software Engineer,
10,Nashville,$40 - $45 an hour,TN,Data Analyst (Spotfire/Tableau),
12,Knoxville,"$90,000 - $120,000 a year",TN,Senior Software Engineer,
13,Nashville,,TN,Data Analyst,
15,Elmwood Park,,NJ,SQL Developer/Data Engineer,07407
16,Jersey City,"$63,000 - $68,000 a year",NJ,Junior Business Analyst,


In [391]:
job_rt.shape

(1181, 5)

In [392]:
job_rt

Unnamed: 0,city,salary,state,title,zipcode
0,Oak Ridge,,TN,Software Engineer,37831
2,Chattanooga,,TN,Data Analyst II - Integration,37402
5,Pleasant View,,TN,Software Support Engineer,
6,Nashville,,TN,Data Analyst III,
7,Tennessee,,,Software Engineer,
10,Nashville,$40 - $45 an hour,TN,Data Analyst (Spotfire/Tableau),
12,Knoxville,"$90,000 - $120,000 a year",TN,Senior Software Engineer,
13,Nashville,,TN,Data Analyst,
15,Elmwood Park,,NJ,SQL Developer/Data Engineer,07407
16,Jersey City,"$63,000 - $68,000 a year",NJ,Junior Business Analyst,


In [393]:
# Jobs output excel file
job_path = os.path.join(".", "Output", "Jobs_2017.csv")
job_rt.to_csv(job_path)


In [394]:
# Jobs Count within 2 states by job title
job_grp = job_rt.groupby("title")
job_ct = job_grp["state"].count()
job_summary = pd.DataFrame({"Job_Count": job_ct})
job_summary = job_summary.reset_index()
job_summary = job_summary.sort_values("Job_Count", ascending=False)

job_summary 

Unnamed: 0,title,Job_Count
611,Software Engineer,56
101,Business Analyst,47
159,Data Analyst,41
203,Data Scientist,39
567,Software Developer,30
544,Senior Software Engineer,26
194,Data Engineer,19
442,Programmer Analyst,14
513,Senior Data Scientist,8
67,Associate Data Scientist,6


In [395]:
# Sort by salary
job_st = job_rt.sort_values(["salary", "state"], ascending=False)
job_st.head(100) 

Unnamed: 0,city,salary,state,title,zipcode
300,Knoxville,"$95,000 - $100,000 a year",TN,Technical Data Analyst,37923
12,Knoxville,"$90,000 - $120,000 a year",TN,Senior Software Engineer,
698,Nashville,"$90,000 - $120,000 a year",TN,Full-Stack Software Developer,
1736,Nashville,"$90,000 - $110,000 a year",TN,Senior Software Developer,
160,Knoxville,"$90,000 - $105,000 a year",TN,Software Developer,
757,Cedar Knolls,"$90,000 - $100,000 a year",NJ,Software Programmer - Crestron Systems Exp Req...,
155,Kendall Park,"$88,546 a year",NJ,Business Analyst,08824
1598,Township of Warren,"$85,000 - $128,000 a year (Indeed est.)",NJ,Data Analytics and Developer,07059
806,Nashville,"$80,000 a year",TN,Assembler Programmer IBM Mainframe,
1993,Nashville,"$80,000 a year",TN,Software Product Marketing Manager,


In [400]:
# Jobs Count within 2 states by job title
job_grp = job_rt.groupby(["state", "city"])
job_ct = job_grp["title"].count()
job_summary = pd.DataFrame({"Job_Count": job_ct})
job_summary = job_summary.sort_values(["state", "city"], ascending=False)

job_summary 

Unnamed: 0_level_0,Unnamed: 1_level_0,Job_Count
state,city,Unnamed: 2_level_1
TN,Tullahoma,3
TN,Springfield,1
TN,Spring Hill,1
TN,Sevierville,1
TN,Pleasant View,1
TN,Piney Flats,3
TN,Oak Ridge,20
TN,Nashville,146
TN,Mount Juliet,1
TN,Millington,1
