In [None]:
# Install the required libraries
!pip install prophet

In [None]:
# Dependencies
# NOTE: We might not use all of these. I just imported everything I could think of for now. We'll delete the ones we don't need later
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import json
#import path

In [None]:
# Read data into the notebook
# LinkedIn postings: any row containing a missing value is dropped right after loading
linkedin_postings_df = pd.read_csv('./data_sets/postings.csv')
linkedin_postings_df = linkedin_postings_df.dropna()

# Machine learning job data, read as line-delimited JSON (one record per line)
machine_learning_jobs_df = pd.read_json(
    './data_sets/job_data.json',
    lines=True,
)

***Introduction*** 
The goal of this exploratory data analysis is to characterize and investigate the growth of machine learning as a job skill. We are interested in looking at this topic along a number of angles. TBC....

QUESTION US Census Predictions

### Additional assets required

Importing Prophet and the previously compiled data from the US Census API ACS 5-Year rolling estimates

In [None]:
# Additional dependencies
from prophet import Prophet

In [None]:
# Additional data reading
# Previously compiled US Census ACS 5-year estimates
combined_acs5 = pd.read_csv(
    './data_sets/combined_acs5_12-22_data.csv'
)

### Preparing the Data

Once imported, the data from the US Census API needs to be manipulated for EDA purposes. This is accomplished by slicing necessary columns from the source DataFrame and calculating other potentially necessary fields from there.

The final working DataFrame will be structured as follows:

**Name:**
> State name

**Year:**
> Year of 5-year estimate from US Census

**population:**
> Total population (by count)

**employment_employed:**
> Total employed population (by count)

**employment_unemployed:**
> Total unemployed population (by count)

**education_none:**
> Total population with no formal education (by count)

**education_high_school:**
> Total population with a high school education (by count)

**education_ged:**
> Total population with a GED (by count)

**education_associates:**
> Total population with an associates degree (by count)

**education_bachelors:**
> Total population with a bachelors degree (by count)

**education_masters:**
> Total population with a masters degree (by count)

**education_professional:**
> Total population with on-the-job training (by count)

**education_doctorate:**
> Total population with a doctorate (by count)

**Percent Employed:**
> Total employed population (by percentage)

**Percent Unemployed:**
> Total unemployed population (by percentage)

**Total Pop in Tech Fields:**
> Total population working in directly tech-related sectors (by count)

**Percent in Tech Fields:**
> Total population working in directly tech-related sectors (by percentage)

**Percent No Education:**
> Total population with no formal education (by percentage)

**Percent High School:**
> Total population with a high school education (by percentage)

**Percent GED:**
> Total population with a GED (by percentage)

**Percent Associates:**
> Total population with an associates degree (by percentage)

**Percent Bachelors:**
> Total population with a bachelors degree (by percentage)

**Percent Masters:**
> Total population with a masters degree (by percentage)

**Percent Professional Education:**
> Total population with on-the-job training (by percentage)

**Percent Doctorate:**
> Total population with a doctorate (by percentage)

**Note:** While not all columns were used in the final analysis, each played a role during the EDA process.

In [None]:
# Creating a working DF for population calculations
# All calculations below are column-wise (vectorised), so no loop over the source
# DataFrame is needed; everything is computed exactly once.

# Slicing columns from the US Census data source DataFrame
# (`.copy()` so the derived columns are added to an independent DataFrame)
base_columns = [
    'Name',
    'Year',
    'population',
    'employment_employed',
    'employment_unemployed',
    'education_none',
    'education_high_school',
    'education_ged',
    'education_associates',
    'education_bachelors',
    'education_masters',
    'education_professional',
    'education_doctorate'
]
pop_data = combined_acs5.loc[:, base_columns].copy()

# Denominator shared by every percentage calculated below
total_population = combined_acs5.loc[:, 'population']

# Calculating the percentage of employed and unemployed populations
pop_data['Percent Employed'] = (combined_acs5.loc[:, 'employment_employed'] / total_population) * 100
pop_data['Percent Unemployed'] = (combined_acs5.loc[:, 'employment_unemployed'] / total_population) * 100

# Calculating the total and percentage of populations working in tech-related fields
pop_data['Total Pop in Tech Fields'] = (
    combined_acs5.loc[:, 'employment_male_business_and_financial_operations_occupations'] +
    combined_acs5.loc[:, 'employment_male_computer_engineering_and_science_occupations'] +
    combined_acs5.loc[:, 'employment_male_computer_and_mathematical_occupations'] +
    combined_acs5.loc[:, 'employment_female_business_and_financial_operations_occupations'] +
    combined_acs5.loc[:, 'employment_female_computer_engineering_and_science_occupations'] +
    combined_acs5.loc[:, 'employment_female_computer_and_mathematical_occupations']
)
pop_data['Percent in Tech Fields'] = (pop_data.loc[:, 'Total Pop in Tech Fields'] / total_population) * 100

# Calculating the percentage of populations with a given education level
# (new percentage column -> source count column; dict order fixes the column order)
education_pct_sources = {
    'Percent No Education': 'education_none',
    'Percent High School': 'education_high_school',
    'Percent GED': 'education_ged',
    'Percent Associates': 'education_associates',
    'Percent Bachelors': 'education_bachelors',
    'Percent Masters': 'education_masters',
    'Percent Professional Education': 'education_professional',
    'Percent Doctorate': 'education_doctorate',
}
for pct_column, count_column in education_pct_sources.items():
    pop_data[pct_column] = (combined_acs5.loc[:, count_column] / total_population) * 100

# Confirming working DF populated correctly
display(pop_data.head(3))
display(pop_data.tail(3))

In [None]:
# Converting `Year` to Datetime format for future use with Prophet
# (`YYYY-12-31 11:59:59` chosen for Datetime format since data only came with `YYYY`)
# The day is the last day of the year so the history lines up with the year-end
# (`freq='YE'`) periods requested from Prophet further down. With a Dec 30 timestamp,
# the first generated "future" period would be Dec 31 of the last observed year.
# The guard keeps this cell re-runnable: once `Year` is already a datetime column,
# it is left untouched instead of being fed back in as the `year` component.
if not pd.api.types.is_datetime64_any_dtype(pop_data['Year']):
    pop_data['Year'] = pd.to_datetime({'year': pop_data['Year'],
                                       'month': 12,
                                       'day': 31,
                                       'hour': 11,
                                       'minute': 59,
                                       'second': 59
                                       })

# Verifying applied correctly
# Confirming working DF populated correctly
display(pop_data.head(3))
display(pop_data.tail(3))

### Further Data Exploration

Examining values and data types to prepare for more direct analysis

In [None]:
# Confirming unique States: number of rows recorded for each `Name` value
state_names = pop_data.loc[:, 'Name']
state_names.value_counts()

In [None]:
# Confirming data types for `pop_data`
# (one dtype per column; shown as the cell output)
pop_data.dtypes

### Slicing Data

Preparing slices of data for use with Prophet

In [None]:
# Slicing data into working DFs for future use with Prophet
# Every slice keeps the `Name` and `Year` identifier columns plus exactly one metric column
id_columns = ['Name', 'Year']

# Population and employment (totals and percentages)
slice_pop = pop_data.loc[:, id_columns + ['population']]
slice_emp_tot = pop_data.loc[:, id_columns + ['employment_employed']]
slice_emp_pct = pop_data.loc[:, id_columns + ['Percent Employed']]
slice_unemp_tot = pop_data.loc[:, id_columns + ['employment_unemployed']]
slice_unemp_pct = pop_data.loc[:, id_columns + ['Percent Unemployed']]

# No education (total / percentage)
slice_edu_non_tot = pop_data.loc[:, id_columns + ['education_none']]
slice_edu_non_pct = pop_data.loc[:, id_columns + ['Percent No Education']]

# High school diploma (total / percentage)
slice_edu_hs_tot = pop_data.loc[:, id_columns + ['education_high_school']]
slice_edu_hs_pct = pop_data.loc[:, id_columns + ['Percent High School']]

# GED (total / percentage)
slice_edu_ged_tot = pop_data.loc[:, id_columns + ['education_ged']]
slice_edu_ged_pct = pop_data.loc[:, id_columns + ['Percent GED']]

# Associates degree (total / percentage)
slice_edu_asc_tot = pop_data.loc[:, id_columns + ['education_associates']]
slice_edu_asc_pct = pop_data.loc[:, id_columns + ['Percent Associates']]

# Bachelors degree (total / percentage)
slice_edu_bch_tot = pop_data.loc[:, id_columns + ['education_bachelors']]
slice_edu_bch_pct = pop_data.loc[:, id_columns + ['Percent Bachelors']]

# Masters degree (total / percentage)
slice_edu_mst_tot = pop_data.loc[:, id_columns + ['education_masters']]
slice_edu_mst_pct = pop_data.loc[:, id_columns + ['Percent Masters']]

# Professional education (total / percentage)
slice_edu_prf_tot = pop_data.loc[:, id_columns + ['education_professional']]
slice_edu_prf_pct = pop_data.loc[:, id_columns + ['Percent Professional Education']]

# Doctorate (total / percentage)
slice_edu_doc_tot = pop_data.loc[:, id_columns + ['education_doctorate']]
slice_edu_doc_pct = pop_data.loc[:, id_columns + ['Percent Doctorate']]

# Working in tech-related fields (total / percentage)
slice_tech_tot = pop_data.loc[:, id_columns + ['Total Pop in Tech Fields']]
slice_tech_pct = pop_data.loc[:, id_columns + ['Percent in Tech Fields']]

### National Trends

Applying slices to a National scale to explore trends and correlations

In [None]:
# Grouping data from slices for National analysis

def _national_yearly_mean(metric_slice, metric_column):
    """Average one metric per year and shape the result for Prophet.

    Takes the `Year` and `metric_column` columns of `metric_slice`, computes the
    mean of the metric for each `Year`, moves `Year` back from the index into a
    regular column, and renames the columns to the `ds` / `y` names used with
    Prophet.

    NOTE(review): this is a plain (unweighted) mean over all rows that share a
    year, so every `Name` counts equally regardless of its population - confirm
    this is the intended definition of the national figure.
    """
    yearly_mean = metric_slice[['Year', metric_column]].groupby('Year').mean()
    return yearly_mean.reset_index().rename(columns={
        'Year': 'ds',
        metric_column: 'y'
    })


# Percentage of unemployed population by year
national_unemp = _national_yearly_mean(slice_unemp_pct, 'Percent Unemployed')

# Percentage of population employed in tech-related fields by year
national_tech = _national_yearly_mean(slice_tech_pct, 'Percent in Tech Fields')

# Percentage of population with no education by year
national_edu_non = _national_yearly_mean(slice_edu_non_pct, 'Percent No Education')

# Percentage of population with a high school education by year
national_edu_hs = _national_yearly_mean(slice_edu_hs_pct, 'Percent High School')

# Percentage of population with a GED by year
national_edu_ged = _national_yearly_mean(slice_edu_ged_pct, 'Percent GED')

# Percentage of population with an associates degree by year
national_edu_asc = _national_yearly_mean(slice_edu_asc_pct, 'Percent Associates')

# Percentage of population with a bachelors degree by year
national_edu_bch = _national_yearly_mean(slice_edu_bch_pct, 'Percent Bachelors')

# Percentage of population with a professional education by year
national_edu_prf = _national_yearly_mean(slice_edu_prf_pct, 'Percent Professional Education')

# Below

Testing Prophet functions, nothing committed as of yet.

In [None]:
# Tech-field percentage for the rows named 'Washington', with Prophet's `ds` / `y` column names
is_washington = slice_tech_pct['Name'] == 'Washington'
test_slice = (
    slice_tech_pct
    .loc[is_washington, ['Year', 'Percent in Tech Fields']]
    .rename(columns={
        'Year': 'ds',
        'Percent in Tech Fields': 'y'
    })
)

test_slice.head(11)

In [None]:
# Mean of each percentage metric per `Year` (the result is indexed by `Year`)
pct_metric_columns = [
    'Percent Unemployed',
    'Percent in Tech Fields',
    'Percent No Education',
    'Percent High School',
    'Percent GED',
    'Percent Associates',
    'Percent Bachelors',
    'Percent Professional Education'
]
yearly_groups = pop_data[['Year'] + pct_metric_columns].groupby('Year')
national_avg_pct = yearly_groups.mean()

In [None]:
# Pairwise correlation between the yearly averages of the selected metrics
corr_columns = [
    'Percent in Tech Fields',
    'Percent Unemployed',
    'Percent No Education',
    'Percent High School',
    'Percent GED',
    'Percent Professional Education'
]
national_avg_pct.loc[:, corr_columns].corr()

In [None]:
def forecast_national_trend(history, title, periods=5, freq='YE'):
    """Fit a default Prophet model to one national series, forecast it, and plot it.

    Parameters
    ----------
    history : pandas.DataFrame
        Series to model, with a `ds` (date) column and a `y` (value) column.
    title : str
        Title placed on the forecast plot so the figures can be told apart.
    periods : int, default 5
        Number of future periods appended after the last date in `history`.
    freq : str, default 'YE'
        Frequency of the future periods, passed to `make_future_dataframe`.

    Returns
    -------
    tuple
        `(model, future, forecast)`: the fitted `Prophet` model, the dataframe
        of dates that was predicted on, and the dataframe returned by `predict`.
    """
    model = Prophet()
    model.fit(history)
    future = model.make_future_dataframe(periods=periods, freq=freq)
    forecast = model.predict(future)

    # Label the figure; every series plotted in this section is a percentage
    figure = model.plot(forecast, xlabel='Year', ylabel='Percent of population')
    figure.axes[0].set_title(title)

    return model, future, forecast


# Percentage of population employed in tech-related fields
m_n_t, n_tech_future, n_tech_forecast = forecast_national_trend(
    national_tech, 'National average: Percent in Tech Fields')

# Percentage of population with a professional education
m_n_e_p, n_prf_future, n_prf_forecast = forecast_national_trend(
    national_edu_prf, 'National average: Percent Professional Education')

# Percentage of population with an associates degree
m_n_e_a, n_asc_future, n_asc_forecast = forecast_national_trend(
    national_edu_asc, 'National average: Percent Associates')

# Percentage of population with a high school education
m_n_e_h, n_hs_future, n_hs_forecast = forecast_national_trend(
    national_edu_hs, 'National average: Percent High School')

# Percentage of population with a GED
m_n_e_g, n_ged_future, n_ged_forecast = forecast_national_trend(
    national_edu_ged, 'National average: Percent GED')

# Percentage of unemployed population
m_n_u, n_unemp_future, n_unemp_forecast = forecast_national_trend(
    national_unemp, 'National average: Percent Unemployed')

In [None]:
# Insert Question 1 analysis and visualizations here. Insert new cells if necessary 

Q1 Summary \[INSERT SUMMARY HERE] ... write a little about what the findings above seem to indicate about question 1

Question 2 Relative proportions ML to not-ML

In [None]:
# Insert question 2 analysis etc here. Insert new cells if necessary

Q2 Summary \[INSERT SUMMARY HERE]

Question Job Skills

In [None]:
# etc 

Q3 Summary \[INSERT SUMMARY HERE]

Question Seniority/Job level

In [None]:
# etc 

Question 5 Industry demand 