# Cleaning Scraped Data

## Importing Libraries

In [1]:
import pandas as pd 
import time
import json
import numpy as np

### Reading the scraped JSON file

In [2]:
# Build today's scrape file name (indeed_DD-MM-YYYY.json) and load it into a DataFrame.
scrape_stamp = time.strftime("%d-%m-%Y")
filename = 'indeed_' + scrape_stamp + '.json'
df = pd.read_json(filename, orient='columns')

In [3]:
# df.sort_values('job_title',ascending=False)

In [4]:
# Number of postings per job_title value in the freshly loaded data.
df['job_title'].value_counts()

data scientist       57
technical writer     57
software engineer    57
Name: job_title, dtype: int64

### Dropping duplicate jobs

In [5]:
# Remove exact duplicate rows, then renumber the index from 0 so it has no gaps.
df = df.drop_duplicates().reset_index(drop=True)

In [6]:
# Number of postings per job_title value once duplicate rows are gone.
df['job_title'].value_counts()

software engineer    42
technical writer     39
data scientist       38
Name: job_title, dtype: int64

In [7]:
# df['job_posting_salary'].value_counts()

### Cleaning Salary column:
Generated 4 columns: 
- salary_period : the time period of salary
- salary_lower : the lower bound of salary
- salary_upper : the upper bound of salary
- salary_average : the average salary - calculated from the lower and upper bounds

In [8]:
# Strip formatting characters from the raw salary text so that only the
# figures, the range separator and the period wording remain.
# regex=False makes every pattern a literal: read as a regular expression, '$'
# is the end-of-string anchor, so relying on the pandas default could leave the
# dollar signs in place depending on the installed version.
for unwanted in ('\n', ',', '$'):
    df['job_posting_salary'] = df['job_posting_salary'].str.replace(unwanted, '', regex=False)

In [9]:
# Preview the first rows after the salary text clean-up.
df.head()

Unnamed: 0,company,job_posting_desc,job_posting_salary,job_posting_title,job_posting_url,job_title,location,scrap_date
0,inXile entertainment,Location: Remote\n\nThe Wasteland 3 team is lo...,,Senior Writer - Wasteland 3,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019
1,oneZero Financial Systems,Come join oneZero Financial Systems! An exciti...,,Junior Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019
2,Allwyn Corporation,"$50,000 - $70,000 a yearContractThe role is in...",50000 - 70000 a year,Technical Writer (Healthcare),https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019
3,"Gables Engineering, Inc.","Since 1946, our goal at Gables Engineering has...",,Associate Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019
4,"GISbiz, Inc.","ContractData Scientist:Databricks with Spark, ...",,Data Scientist,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,data scientist,"Boston, MA",15-04-2019


In [10]:
# Label each posting with the pay period mentioned in its salary text.
# Postings without a salary keep the placeholder string 'None'.
df['salary_period'] = 'None'
df['job_posting_salary'] = df['job_posting_salary'].fillna('None')
# Keywords are applied in this order; a later match overwrites an earlier one.
for period_keyword in ("year", "month", "week", "day", "hour"):
    mentions_period = df['job_posting_salary'].str.contains(period_keyword)
    df.loc[mentions_period, 'salary_period'] = period_keyword

In [11]:
# Preview the first rows, now including the salary_period column.
df.head()

Unnamed: 0,company,job_posting_desc,job_posting_salary,job_posting_title,job_posting_url,job_title,location,scrap_date,salary_period
0,inXile entertainment,Location: Remote\n\nThe Wasteland 3 team is lo...,,Senior Writer - Wasteland 3,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,
1,oneZero Financial Systems,Come join oneZero Financial Systems! An exciti...,,Junior Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,
2,Allwyn Corporation,"$50,000 - $70,000 a yearContractThe role is in...",50000 - 70000 a year,Technical Writer (Healthcare),https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,year
3,"Gables Engineering, Inc.","Since 1946, our goal at Gables Engineering has...",,Associate Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,
4,"GISbiz, Inc.","ContractData Scientist:Databricks with Spark, ...",,Data Scientist,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,data scientist,"Boston, MA",15-04-2019,


In [12]:
# Number of postings per salary period ('None' = no period keyword matched).
df['salary_period'].value_counts()

None    92
year    23
hour     4
Name: salary_period, dtype: int64

In [13]:
# Remove the period wording from the salary text so that only the figures remain.
for period_suffix in (" a year", " a month", " a week", " a day", " an hour"):
    df["job_posting_salary"] = df["job_posting_salary"].str.replace(period_suffix, '')

In [14]:
# Display the frame after the period wording was stripped from job_posting_salary.
df

Unnamed: 0,company,job_posting_desc,job_posting_salary,job_posting_title,job_posting_url,job_title,location,scrap_date,salary_period
0,inXile entertainment,Location: Remote\n\nThe Wasteland 3 team is lo...,,Senior Writer - Wasteland 3,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,
1,oneZero Financial Systems,Come join oneZero Financial Systems! An exciti...,,Junior Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,
2,Allwyn Corporation,"$50,000 - $70,000 a yearContractThe role is in...",50000 - 70000,Technical Writer (Healthcare),https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,year
3,"Gables Engineering, Inc.","Since 1946, our goal at Gables Engineering has...",,Associate Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,
4,"GISbiz, Inc.","ContractData Scientist:Databricks with Spark, ...",,Data Scientist,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,data scientist,"Boston, MA",15-04-2019,
5,ISMIE Mutual Insurance Company,Job Purpose: Analyze statistical data effectin...,,Data Scientist,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,data scientist,"Boston, MA",15-04-2019,
6,"Rippe & Kingston Systems, Inc.","Job Summary: Rippe & Kingston, a Cincinnati-ba...",,Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,
7,Bloomberg,The Bloomberg Financial Products Documentation...,,Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,
8,Adaptive Biotechnologies,Every immune system has a story to tell— the k...,,Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,
9,Watts Water Technologies,"$55,000 - $60,000 a yearRESPONSIBILITIES AND D...",55000 - 60000,Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,year


In [15]:
# Split "lower - upper" once and pick the two parts by position.
# .str[1] yields NaN for entries without a second part, and unlike
# expand=True followed by [1] it does not raise KeyError when no row in the
# scrape contains a " - " range at all.
salary_parts = df["job_posting_salary"].str.split(" - ", n=1)
df['salary_lower'] = salary_parts.str[0]
df['salary_upper'] = salary_parts.str[1]

In [16]:
# Use 0 as the "no value" marker: the 'None' placeholder in the lower bound and
# a missing upper bound both become 0.
df['salary_lower'] = df['salary_lower'].replace({'None': 0})
df['salary_upper'] = df['salary_upper'].fillna(value=0)
df

Unnamed: 0,company,job_posting_desc,job_posting_salary,job_posting_title,job_posting_url,job_title,location,scrap_date,salary_period,salary_lower,salary_upper
0,inXile entertainment,Location: Remote\n\nThe Wasteland 3 team is lo...,,Senior Writer - Wasteland 3,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0,0
1,oneZero Financial Systems,Come join oneZero Financial Systems! An exciti...,,Junior Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0,0
2,Allwyn Corporation,"$50,000 - $70,000 a yearContractThe role is in...",50000 - 70000,Technical Writer (Healthcare),https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,year,50000,70000
3,"Gables Engineering, Inc.","Since 1946, our goal at Gables Engineering has...",,Associate Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0,0
4,"GISbiz, Inc.","ContractData Scientist:Databricks with Spark, ...",,Data Scientist,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,data scientist,"Boston, MA",15-04-2019,,0,0
5,ISMIE Mutual Insurance Company,Job Purpose: Analyze statistical data effectin...,,Data Scientist,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,data scientist,"Boston, MA",15-04-2019,,0,0
6,"Rippe & Kingston Systems, Inc.","Job Summary: Rippe & Kingston, a Cincinnati-ba...",,Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0,0
7,Bloomberg,The Bloomberg Financial Products Documentation...,,Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0,0
8,Adaptive Biotechnologies,Every immune system has a story to tell— the k...,,Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0,0
9,Watts Water Technologies,"$55,000 - $60,000 a yearRESPONSIBILITIES AND D...",55000 - 60000,Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,year,55000,60000


In [17]:
# Convert both salary bounds from text to numbers.
for bound_column in ('salary_lower', 'salary_upper'):
    df[bound_column] = pd.to_numeric(df[bound_column])
df

Unnamed: 0,company,job_posting_desc,job_posting_salary,job_posting_title,job_posting_url,job_title,location,scrap_date,salary_period,salary_lower,salary_upper
0,inXile entertainment,Location: Remote\n\nThe Wasteland 3 team is lo...,,Senior Writer - Wasteland 3,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0
1,oneZero Financial Systems,Come join oneZero Financial Systems! An exciti...,,Junior Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0
2,Allwyn Corporation,"$50,000 - $70,000 a yearContractThe role is in...",50000 - 70000,Technical Writer (Healthcare),https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,year,50000.0,70000.0
3,"Gables Engineering, Inc.","Since 1946, our goal at Gables Engineering has...",,Associate Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0
4,"GISbiz, Inc.","ContractData Scientist:Databricks with Spark, ...",,Data Scientist,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,data scientist,"Boston, MA",15-04-2019,,0.0,0.0
5,ISMIE Mutual Insurance Company,Job Purpose: Analyze statistical data effectin...,,Data Scientist,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,data scientist,"Boston, MA",15-04-2019,,0.0,0.0
6,"Rippe & Kingston Systems, Inc.","Job Summary: Rippe & Kingston, a Cincinnati-ba...",,Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0
7,Bloomberg,The Bloomberg Financial Products Documentation...,,Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0
8,Adaptive Biotechnologies,Every immune system has a story to tell— the k...,,Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0
9,Watts Water Technologies,"$55,000 - $60,000 a yearRESPONSIBILITIES AND D...",55000 - 60000,Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,year,55000.0,60000.0


In [18]:
# Scale every non-yearly salary by the number of such periods in a year.
# 260 and 2080 presumably stand for 52 x 5 working days and 52 x 40 working hours.
# NOTE: re-running this cell multiplies the already scaled rows again.
periods_per_year = {'month': 12, 'week': 52, 'day': 260, 'hour': 2080}
for period_name, multiplier in periods_per_year.items():
    in_period = df['salary_period'] == period_name
    for bound_column in ('salary_lower', 'salary_upper'):
        df.loc[in_period, bound_column] = df[bound_column] * multiplier

In [19]:
# Average of the two bounds; when one bound is 0 the other bound is used as-is.
lower_bound = df['salary_lower']
upper_bound = df['salary_upper']
midpoint = (lower_bound + upper_bound) / 2
df['salary_average'] = np.where(
    lower_bound == 0,
    upper_bound,
    np.where(upper_bound == 0.0, lower_bound, midpoint),
)

In [20]:
# Display the frame including the salary_average column.
df

Unnamed: 0,company,job_posting_desc,job_posting_salary,job_posting_title,job_posting_url,job_title,location,scrap_date,salary_period,salary_lower,salary_upper,salary_average
0,inXile entertainment,Location: Remote\n\nThe Wasteland 3 team is lo...,,Senior Writer - Wasteland 3,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0,0.0
1,oneZero Financial Systems,Come join oneZero Financial Systems! An exciti...,,Junior Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0,0.0
2,Allwyn Corporation,"$50,000 - $70,000 a yearContractThe role is in...",50000 - 70000,Technical Writer (Healthcare),https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,year,50000.0,70000.0,60000.0
3,"Gables Engineering, Inc.","Since 1946, our goal at Gables Engineering has...",,Associate Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0,0.0
4,"GISbiz, Inc.","ContractData Scientist:Databricks with Spark, ...",,Data Scientist,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,data scientist,"Boston, MA",15-04-2019,,0.0,0.0,0.0
5,ISMIE Mutual Insurance Company,Job Purpose: Analyze statistical data effectin...,,Data Scientist,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,data scientist,"Boston, MA",15-04-2019,,0.0,0.0,0.0
6,"Rippe & Kingston Systems, Inc.","Job Summary: Rippe & Kingston, a Cincinnati-ba...",,Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0,0.0
7,Bloomberg,The Bloomberg Financial Products Documentation...,,Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0,0.0
8,Adaptive Biotechnologies,Every immune system has a story to tell— the k...,,Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0,0.0
9,Watts Water Technologies,"$55,000 - $60,000 a yearRESPONSIBILITIES AND D...",55000 - 60000,Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,year,55000.0,60000.0,57500.0


In [21]:
# Snapshot the frame as a JSON array of records in df_json.json.
# to_json() already returns serialized JSON text, so it is written as-is:
# passing it through json.dump() would encode it a second time and store one
# big quoted string instead of an array. Mode 'w' replaces the file; appending
# on every run would concatenate several documents into a file that no JSON
# parser accepts.
df_json = df.to_json(orient='records')
print(type(df_json))
with open('df_json.json', 'w') as file:
    file.write(df_json)

<class 'str'>


### Removing punctuation

In [22]:
def data_clean(df, column):
    """Replace punctuation characters and newlines in ``df[column]`` with spaces.

    The column is cast to ``str`` and overwritten in place on ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column; it is modified in place.
    column : hashable
        Label of the column to clean.

    Returns
    -------
    None
    """
    cleaning_list = ['+','$','/',',','?','.',';','-','@','!','&','%','^','*',')',':','(','\n']
    cleaned = df[column].astype(str)
    for item in cleaning_list:
        # regex=False: most of these characters are regex metacharacters
        # ('.', '$', '+', '(' ...) and must be matched literally.
        cleaned = cleaned.str.replace(item, " ", regex=False)
    df[column] = cleaned

In [23]:
# Clean the description column, addressed by name: selecting it by position
# (df.columns[1:2]) would silently hit a different column whenever the column
# order of the input changes.
data_clean(df, 'job_posting_desc')

In [24]:
# Display the frame after data_clean was applied.
df

Unnamed: 0,company,job_posting_desc,job_posting_salary,job_posting_title,job_posting_url,job_title,location,scrap_date,salary_period,salary_lower,salary_upper,salary_average
0,inXile entertainment,Location Remote The Wasteland 3 team is look...,,Senior Writer - Wasteland 3,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0,0.0
1,oneZero Financial Systems,Come join oneZero Financial Systems An exciti...,,Junior Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0,0.0
2,Allwyn Corporation,50 000 70 000 a yearContractThe role is in...,50000 - 70000,Technical Writer (Healthcare),https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,year,50000.0,70000.0,60000.0
3,"Gables Engineering, Inc.",Since 1946 our goal at Gables Engineering has...,,Associate Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0,0.0
4,"GISbiz, Inc.",ContractData Scientist Databricks with Spark ...,,Data Scientist,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,data scientist,"Boston, MA",15-04-2019,,0.0,0.0,0.0
5,ISMIE Mutual Insurance Company,Job Purpose Analyze statistical data effectin...,,Data Scientist,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,data scientist,"Boston, MA",15-04-2019,,0.0,0.0,0.0
6,"Rippe & Kingston Systems, Inc.",Job Summary Rippe Kingston a Cincinnati ba...,,Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0,0.0
7,Bloomberg,The Bloomberg Financial Products Documentation...,,Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0,0.0
8,Adaptive Biotechnologies,Every immune system has a story to tell— the k...,,Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0,0.0
9,Watts Water Technologies,55 000 60 000 a yearRESPONSIBILITIES AND D...,55000 - 60000,Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,year,55000.0,60000.0,57500.0


### Cleaning Job description column

In [26]:
# Frame holding only the description column, for a quick look at its first rows.
data = df.loc[:, ['job_posting_desc']]
data.head()

Unnamed: 0,job_posting_desc
0,Location Remote The Wasteland 3 team is look...
1,Come join oneZero Financial Systems An exciti...
2,50 000 70 000 a yearContractThe role is in...
3,Since 1946 our goal at Gables Engineering has...
4,ContractData Scientist Databricks with Spark ...


### Tokenizing the job_posting_desc column using NLTK library

In [27]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import string
from nltk.corpus import stopwords

In [28]:
# nltk.download()

In [29]:
# Remove newlines (real and escaped) and ASCII punctuation from the descriptions.
# The result is assigned back to the column: a plain `for desc in df[...]` loop
# only rebinds its loop variable and leaves the frame untouched.
punctuation_table = str.maketrans('', '', string.punctuation)
df['job_posting_desc'] = (
    df['job_posting_desc']
    .str.replace('\n', '', regex=False)
    .str.replace('\\n', '', regex=False)
    .str.translate(punctuation_table)
)
df.head()

Unnamed: 0,company,job_posting_desc,job_posting_salary,job_posting_title,job_posting_url,job_title,location,scrap_date,salary_period,salary_lower,salary_upper,salary_average
0,inXile entertainment,Location Remote The Wasteland 3 team is look...,,Senior Writer - Wasteland 3,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0,0.0
1,oneZero Financial Systems,Come join oneZero Financial Systems An exciti...,,Junior Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0,0.0
2,Allwyn Corporation,50 000 70 000 a yearContractThe role is in...,50000 - 70000,Technical Writer (Healthcare),https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,year,50000.0,70000.0,60000.0
3,"Gables Engineering, Inc.",Since 1946 our goal at Gables Engineering has...,,Associate Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0,0.0
4,"GISbiz, Inc.",ContractData Scientist Databricks with Spark ...,,Data Scientist,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,data scientist,"Boston, MA",15-04-2019,,0.0,0.0,0.0


### Removing the stop words from job_posting_desc column

In [30]:
# Lower-case and whitespace-split each description, then discard English stop words.
stopWords = set(stopwords.words('english'))
lowercase_tokens = df['job_posting_desc'].str.lower().str.split()
df['job_posting_desc'] = lowercase_tokens.apply(
    lambda tokens: [token for token in tokens if token not in stopWords]
)
df

Unnamed: 0,company,job_posting_desc,job_posting_salary,job_posting_title,job_posting_url,job_title,location,scrap_date,salary_period,salary_lower,salary_upper,salary_average
0,inXile entertainment,"[location, remote, wasteland, 3, team, looking...",,Senior Writer - Wasteland 3,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0,0.0
1,oneZero Financial Systems,"[come, join, onezero, financial, systems, exci...",,Junior Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0,0.0
2,Allwyn Corporation,"[50, 000, 70, 000, yearcontractthe, role, salt...",50000 - 70000,Technical Writer (Healthcare),https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,year,50000.0,70000.0,60000.0
3,"Gables Engineering, Inc.","[since, 1946, goal, gables, engineering, alway...",,Associate Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0,0.0
4,"GISbiz, Inc.","[contractdata, scientist, databricks, spark, s...",,Data Scientist,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,data scientist,"Boston, MA",15-04-2019,,0.0,0.0,0.0
5,ISMIE Mutual Insurance Company,"[job, purpose, analyze, statistical, data, eff...",,Data Scientist,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,data scientist,"Boston, MA",15-04-2019,,0.0,0.0,0.0
6,"Rippe & Kingston Systems, Inc.","[job, summary, rippe, kingston, cincinnati, ba...",,Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0,0.0
7,Bloomberg,"[bloomberg, financial, products, documentation...",,Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0,0.0
8,Adaptive Biotechnologies,"[every, immune, system, story, tell—, key, kno...",,Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0,0.0
9,Watts Water Technologies,"[55, 000, 60, 000, yearresponsibilities, dutie...",55000 - 60000,Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,year,55000.0,60000.0,57500.0


### Detokenizing the filtered job_posting_desc column

In [31]:
# Re-join the filtered tokens into one space-separated string per posting,
# then delete every run of digits.
# - nltk.tokenize.moses (MosesDetokenizer) is no longer shipped with recent
#   NLTK releases (it lives in the separate sacremoses package), so importing
#   it fails there; the tokens come from a whitespace split, so a plain space
#   join is used instead and needs no extra package.
# - The digit pattern is a raw string applied with regex=True: pandas >= 2.0
#   treats str.replace patterns as literals by default, which would leave the
#   digits in place.
df['job_posting_desc'] = df['job_posting_desc'].apply(' '.join)
df['job_posting_desc'] = df['job_posting_desc'].str.replace(r'\d+', '', regex=True)

In [32]:
# Display the frame with the descriptions turned back into plain strings.
df

Unnamed: 0,company,job_posting_desc,job_posting_salary,job_posting_title,job_posting_url,job_title,location,scrap_date,salary_period,salary_lower,salary_upper,salary_average
0,inXile entertainment,location remote wasteland team looking talent...,,Senior Writer - Wasteland 3,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0,0.0
1,oneZero Financial Systems,come join onezero financial systems exciting f...,,Junior Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0,0.0
2,Allwyn Corporation,yearcontractthe role salt lake city utah p...,50000 - 70000,Technical Writer (Healthcare),https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,year,50000.0,70000.0,60000.0
3,"Gables Engineering, Inc.",since goal gables engineering always simple o...,,Associate Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0,0.0
4,"GISbiz, Inc.",contractdata scientist databricks spark scala ...,,Data Scientist,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,data scientist,"Boston, MA",15-04-2019,,0.0,0.0,0.0
5,ISMIE Mutual Insurance Company,job purpose analyze statistical data effecting...,,Data Scientist,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,data scientist,"Boston, MA",15-04-2019,,0.0,0.0,0.0
6,"Rippe & Kingston Systems, Inc.",job summary rippe kingston cincinnati based pr...,,Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0,0.0
7,Bloomberg,bloomberg financial products documentation tea...,,Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0,0.0
8,Adaptive Biotechnologies,every immune system story tell— key knowing li...,,Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,,0.0,0.0,0.0
9,Watts Water Technologies,yearresponsibilities duties plan coordinat...,55000 - 60000,Technical Writer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,technical writer,"San Francisco, CA",15-04-2019,year,55000.0,60000.0,57500.0


### Dumping the contents of cleaned dataframe into a json file

In [33]:
# Write the cleaned frame as JSON records to indeed_DD-MM-YYYY_cleaned.json (today's date).
export_stamp = time.strftime("%d-%m-%Y")
cleaned_filename = 'indeed_' + export_stamp + '_cleaned.json'
df.to_json(cleaned_filename, orient='records')