In [1]:
import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# changing text below to markdown as not in current use

#importing raw data
jobs_original = pd.read_csv('./data/alljobs_unclean.csv')
jobs_original.head()

import pandas_profiling
pandas_profiling.ProfileReport(jobs_original)

#looks like we have a lot of duplicate jobs
#not surprising from general scraping
for column in jobs_original.columns:
    print(column, jobs_original.duplicated(subset=column).sum() )


#starting by dropping all fully duplicate columns
jobs_original.drop_duplicates(keep='first').shape

#looks like this loses us over half our data!
jobs_nona = jobs_original.drop_duplicates(keep='first')

pandas_profiling.ProfileReport(jobs_nona)

#fully duplicated description text is likely to be a true duplicate
duplicate_jds = jobs_nona[jobs_nona.duplicated(subset='job_description_all_text')]

pandas_profiling.ProfileReport(duplicate_jds)

#does look like some may be the same job advertised with various titles
#why are only 37 of these considered duplicated, when all of them were before?
duplicate_jds.duplicated('job_description_all_text').sum()

import missingno

missingno.matrix(jobs_nona)

#company rating and salary are expected to be sparse
#company is not really essential, so not concerned about missing values there
#can't do much if jd text is missing, so will drop those

jobs_nona = jobs_nona.dropna(subset=['job_description_all_text'])
jobs_nona.isna().sum()

# Given that only 16 are missing in company, let's look at those
jobs_nona[jobs_nona.company.isna()]

duplicate_jds[duplicate_jds.duplicated('job_description_all_text')]

#may still be some duplicates, but many appear to be using similar text to hire for multiple roles 
#or hiring for the same jobs in multiple locations or functions, so leaving remaining in for now

#need to convert rating to float
jobs_nona.company_rating.unique()

float_ratings = []
for rating in jobs_nona.company_rating:
    try:
        float_ratings.append(float(rating))
    except:
        new_rating = rating.replace(' ', '.')
        new_rating = new_rating.replace('o', '0')
        float_ratings.append(float(new_rating))

jobs_nona.company_rating = float_ratings
jobs_nona.company_rating.unique()

#saving a csv file to have a shorter list of things to fix in the future
jobs_nona.to_csv('./data/alljobs_nodupes.csv', index=False)

In [84]:
#format salary data
#$$ amount
#per time (day, week, month, year, hour)

# columns: base_dollars
# pay_period:
# full_time_annual_dollars

test_strings = {'job':['A','B','C','D','E','F','H'],
    'salary':[np.nan, '30 - 80 per hour', '$45 per hour', '65000 per year', '65000-80000 per year', '7000 per month','$500 per week']}
test_df = pd.DataFrame(test_strings)
test_df

Unnamed: 0,job,salary
0,A,
1,B,30 - 80 per hour
2,C,$45 per hour
3,D,65000 per year
4,E,65000-80000 per year
5,F,7000 per month
6,H,$500 per week


In [68]:
def find_salary_means(salary_column):
    find_numbers = [re.findall('\d*', salary) for salary in salary_column]
    salary_values = [[int(number) for number in listing if number != ''] for listing in find_numbers]
    salary_means = [np.mean(salary) for salary in salary_values]
    return salary_means


In [85]:
check = find_salary_means(test_df.salary)
check

TypeError: expected string or bytes-like object

In [76]:
def find_salary_time(salary_column, pay_lengths = ['hour','day','week','month','year']):
    test_times = []
    for item in salary_column:
        for time in pay_lengths:
            if time in item:
                test_times.append(time)
    
    return(test_times)

In [80]:
my_items = find_salary_time(test_df.salary)
my_items

['hour', 'hour', 'year', 'year', 'month', 'week']

In [78]:
#trying salary conversion on full df
salary_means = find_salary_means(jobs_nona.salary_data_text)
len(salary_means)

TypeError: expected string or bytes-like object