# Part 1: extracting text from online job postings

## Data extraction and cleaning

In [175]:
import pandas as pd
import os

%matplotlib inline

In [177]:
directory = r'C:\Manning\resume-job-posting-nlp-project\data\html_job_postings'
os.chdir(directory)

data = []
for file in os.listdir(directory):
    file_name = os.path.basename(file)
    with open(file, mode='r', encoding='utf-8') as f:
        file_content = f.read()
        data.append([file_name, file_content])

df = pd.DataFrame(data, columns=['file_name', 'html'])

In [178]:
df.head()

Unnamed: 0,file_name,html
0,001b92395ed0fb62_fccid.html,"<html><head><title>Data Engineer - Columbus, G..."
1,00321a48d04fe754_fccid.html,"<html><head><title>Data Analyst - St. Louis, M..."
2,0079c11b2611349f_fccid.html,"<html><head><title>Data Scientist - Newark, CA..."
3,007d9d7b5c09d820_fccid.html,<html><head><title>Patient Care Assistant / PC...
4,0125eabc844281c9_fccid.html,<html><head><title>Scientific Programmer - Ber...


In [179]:
df.describe

<bound method NDFrame.describe of                         file_name  \
0     001b92395ed0fb62_fccid.html   
1     00321a48d04fe754_fccid.html   
2     0079c11b2611349f_fccid.html   
3     007d9d7b5c09d820_fccid.html   
4     0125eabc844281c9_fccid.html   
...                           ...   
1332  fee42538d4a7bb9a_fccid.html   
1333  fefb9642ac28b85e_fccid.html   
1334  ff0eb6d0b4da2289_fccid.html   
1335  ff81a90403a3f37e_fccid.html   
1336                      _p.html   

                                                   html  
0     <html><head><title>Data Engineer - Columbus, G...  
1     <html><head><title>Data Analyst - St. Louis, M...  
2     <html><head><title>Data Scientist - Newark, CA...  
3     <html><head><title>Patient Care Assistant / PC...  
4     <html><head><title>Scientific Programmer - Ber...  
...                                                 ...  
1332  <html><head><title>Data Scientist - Glen Mills...  
1333  <html><head><title>Data Analyst (Part-Time) - ...  

In [180]:
df['duplicate'] = df.duplicated(subset='html', keep=False)

In [182]:
df.loc[df['duplicate'] == True]

Unnamed: 0,file_name,html,duplicate
10,0203a50423c1dff3_fccid.html,<html><head><title>Deep Learning Engineer - We...,True
190,22ec5abff932581e_fccid.html,<html><head><title>Machine Learning Engineer -...,True
726,85a24b62479fad81_fccid.html,<html><head><title>Staff Data Scientist - NLP ...,True
774,905ed810a40c9bb1_fccid.html,<html><head><title>Machine Learning Engineer -...,True
781,92b3776c33af25d7_fccid.html,<html><head><title>Page Not Found - Indeed Mob...,True
789,94ae6fa4bc90abbf_fccid.html,<html><head><title>Personal Care Assistant - Y...,True
859,a1d949a71fe0ab4b_fccid.html,<html><head><title>Principal Data Scientist - ...,True
860,a1d949a71fe0ab4b_from.html,<html><head><title>Principal Data Scientist - ...,True
894,a74cbd9c94a1536e_fccid.html,<html><head><title>Data Scientist - Customer A...,True
939,af7a8336b016a707_fccid.html,<html><head><title>Deep Learning Engineer - We...,True


In [183]:
df.drop_duplicates(subset='html', keep='first', inplace=True)

In [184]:
df.describe

<bound method NDFrame.describe of                         file_name  \
0     001b92395ed0fb62_fccid.html   
1     00321a48d04fe754_fccid.html   
2     0079c11b2611349f_fccid.html   
3     007d9d7b5c09d820_fccid.html   
4     0125eabc844281c9_fccid.html   
...                           ...   
1331  fe385aa11883568f_fccid.html   
1332  fee42538d4a7bb9a_fccid.html   
1333  fefb9642ac28b85e_fccid.html   
1334  ff0eb6d0b4da2289_fccid.html   
1335  ff81a90403a3f37e_fccid.html   

                                                   html  duplicate  
0     <html><head><title>Data Engineer - Columbus, G...      False  
1     <html><head><title>Data Analyst - St. Louis, M...      False  
2     <html><head><title>Data Scientist - Newark, CA...      False  
3     <html><head><title>Patient Care Assistant / PC...      False  
4     <html><head><title>Scientific Programmer - Ber...      False  
...                                                 ...        ...  
1331  <html><head><title>Research Stat

In [185]:
df.drop(columns='duplicate', inplace=True)

In [186]:
df.head()

Unnamed: 0,file_name,html
0,001b92395ed0fb62_fccid.html,"<html><head><title>Data Engineer - Columbus, G..."
1,00321a48d04fe754_fccid.html,"<html><head><title>Data Analyst - St. Louis, M..."
2,0079c11b2611349f_fccid.html,"<html><head><title>Data Scientist - Newark, CA..."
3,007d9d7b5c09d820_fccid.html,<html><head><title>Patient Care Assistant / PC...
4,0125eabc844281c9_fccid.html,<html><head><title>Scientific Programmer - Ber...


## Parse title, body, and bullet points

In [191]:
from bs4 import BeautifulSoup as bs

def parse_title(html):
    soup = bs(html, 'lxml')
    title = soup.find('title').text
    
    return title

def parse_body(html):
    soup = bs(html, 'lxml')
    body = soup.body.text
    
    return body

def parse_bullets(html):
    soup = bs(html, 'lxml')
    bullets = [bullet.text for bullet in soup.body.find_all('li')]
    
    return bullets


In [192]:
df2 = pd.DataFrame()
df['title'] = df['html'].apply(parse_title)
df['body'] = df['html'].apply(parse_body)
df['bullets'] = df['html'].apply(parse_bullets, result_type=)


In [193]:
df.head()

Unnamed: 0,file_name,html,title,body,bullets
0,001b92395ed0fb62_fccid.html,"<html><head><title>Data Engineer - Columbus, G...","Data Engineer - Columbus, GA 31909","Data Engineer - Columbus, GA 31909\nCelebratin...","[Bachelor’s or Master’s degree in statistics, ..."
1,00321a48d04fe754_fccid.html,"<html><head><title>Data Analyst - St. Louis, M...","Data Analyst - St. Louis, MO","Data Analyst - St. Louis, MO\nDuties\nSummary\...",[Job family (Series)\n1501 General Mathematics...
2,0079c11b2611349f_fccid.html,"<html><head><title>Data Scientist - Newark, CA...","Data Scientist - Newark, CA","Data Scientist - Newark, CA\nData Scientist\n\...","[ Design, develop, document and maintain machi..."
3,007d9d7b5c09d820_fccid.html,<html><head><title>Patient Care Assistant / PC...,Patient Care Assistant / PCA - Med/Surg (Fayet...,Patient Care Assistant / PCA - Med/Surg (Fayet...,[Provides all personal care services in accord...
4,0125eabc844281c9_fccid.html,<html><head><title>Scientific Programmer - Ber...,"Scientific Programmer - Berkeley, CA","Scientific Programmer - Berkeley, CA\nCaribou ...","[Demonstrated proficiency with Python, JavaScr..."


## Save dataframe to disk

In [194]:
df.to_csv(r'..\job_postings.csv')