In [55]:
import numpy as np
import pandas as pd
import re
from numpy.random import default_rng
import matplotlib.pyplot as plt
import warnings
# NOTE(review): this silences every warning for the whole notebook, including
# pandas FutureWarning/DeprecationWarning messages about changed defaults.
warnings.filterwarnings('ignore')
# Use the fully qualified option names: the short forms 'max_columns' and
# 'max_rows' match more than one option on pandas versions that also define
# styler.render.max_columns / styler.render.max_rows, which raises OptionError.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 500)
# Input CSV files; clean_job() derives the job title from each file name.
File_Path = ['DataAnalyst.csv', 'DataEngineer.csv', 'DataScientist.csv']
# Destination of the merged, cleaned data set.
Save_Path = 'AllData.csv'
# Multipliers used to turn revenue units into plain numbers.
r_dic = {'million': 1000000, 'billion': 1000000000}

In [56]:
def read_data(File_Path):
    """Load one CSV source into a DataFrame.

    Parameters
    ----------
    File_Path : anything accepted by ``pandas.read_csv`` (here: a file name).

    Returns
    -------
    pandas.DataFrame
        The parsed contents, read with pandas' default settings.
    """
    return pd.read_csv(File_Path)

In [57]:
def clean_job(path, df):
    """Overwrite 'Job Title' with a label derived from the source file name.

    Parameters
    ----------
    path : str
        File name of the source; its last four characters are dropped
        (the '.csv' suffix for the configured inputs).
    df : pandas.DataFrame
        Frame to label; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with every row's 'Job Title' set to the label.
    """
    label = path[:-4]
    # One copy of the label per row.
    df['Job Title'] = [label] * len(df)
    return df

In [58]:
def clean_salary(df):
    """Extract the lower and upper salary bounds from 'Salary Estimate'.

    Values such as ``'$37K-$66K (Glassdoor est.)'`` yield ``'37000'`` in
    'Lo Salary' and ``'66000'`` in 'Hi Salary'. Rows that do not match the
    ``$<number>K`` pattern get a missing value in the new column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column 'Salary Estimate'; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with string columns 'Lo Salary' and 'Hi Salary' added.
    """
    estimate = df['Salary Estimate']
    # expand=False returns a Series for a single capture group.
    lo = estimate.str.extract(r'^\s*\$(\d+\.*\d*[kK])[^0-9]', expand=False)
    hi = estimate.str.extract(r'\s*-\s*\$(\d+\.*\d*[kK])[^0-9]', expand=False)
    # regex=True must be explicit: Series.str.replace treats the pattern as a
    # literal string by default on pandas 2.x, which would leave the 'K' in place.
    df['Lo Salary'] = lo.str.replace(r'[kK]$', '000', regex=True)
    df['Hi Salary'] = hi.str.replace(r'[kK]$', '000', regex=True)
    return df

In [59]:
def clean_cname(df):
    """Strip the rating suffix that follows a newline in 'Company Name'.

    A value such as ``'Acme Corp\\n3.2'`` becomes ``'Acme Corp'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column 'Company Name'; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with 'Company Name' cleaned.
    """
    # Raw string avoids the invalid '\d' escape of the original literal, and
    # regex=True is explicit because Series.str.replace defaults to literal
    # matching on pandas 2.x.
    df['Company Name'] = df['Company Name'].str.replace(r'\n\d\.*\d*', '', regex=True)
    return df

In [60]:
def clean_location(df):
    """Split 'Location' at its first comma into 'Area' and 'State'.

    'State' keeps the text after the comma verbatim (including the leading
    space), except that ``' Arapahoe, CO'`` is collapsed to ``' CO'``.
    Locations without a comma get a missing 'State'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column 'Location'; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with columns 'Area' and 'State' added.
    """
    # reindex guarantees both columns exist even when no value contains a comma
    # (str.split(expand=True) would then return a single column).
    parts = df['Location'].str.split(',', n=1, expand=True).reindex(columns=[0, 1])
    df['Area'] = parts[0]
    # Assign the result back instead of calling replace(inplace=True) on
    # df.State: the chained in-place form is not guaranteed to update df.
    df['State'] = parts[1].replace(' Arapahoe, CO', ' CO')
    return df

In [61]:
# Constraint: a value such as the number of employees must be at least 1.
# Placeholder values (0 or a negative integer) and missing values are mapped
# to 'Unknown'.
def RemoveStranger(x):
    """Return ``x`` stripped of surrounding whitespace, or 'Unknown'.

    Parameters
    ----------
    x : str (other scalars are converted with ``str``)
        Raw cell value.

    Returns
    -------
    str
        'Unknown' when ``x`` is missing (``None`` / NaN) or is the text of
        ``0`` or of a negative integer such as ``-1``; otherwise the stripped
        text.
    """
    # NaN is the only float that differs from itself; read_csv uses it for
    # empty cells, which have no .strip().
    if x is None or (isinstance(x, float) and x != x):
        return 'Unknown'
    x = str(x).strip()
    if re.match(r'(^-[1-9]+\d*$)|(^0$)', x):
        return 'Unknown'
    return x

In [62]:
def clean_size(df):
    """Normalise 'Size' and split it into 'Lo Em' / 'Hi Em' bounds.

    ``'51 to 200 employees'`` becomes Size ``'51,200'``, Lo Em ``'51'`` and
    Hi Em ``'200'``. An open-ended value such as ``'10000+ employees'``
    becomes Size ``'10000+'``, Lo Em ``'10000'`` and Hi Em ``'Unknown'``.
    Placeholder values are first mapped to 'Unknown' by ``RemoveStranger``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column 'Size'; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with 'Size' rewritten and string columns 'Lo Em' and
        'Hi Em' added.
    """
    size = df['Size'].apply(RemoveStranger)
    # Both patterns are regular expressions; regex=True must be explicit because
    # Series.str.replace defaults to literal matching on pandas 2.x.
    size = size.str.replace(r' employees$', '', regex=True)
    size = size.str.replace(r'\s*to\s*', ',', regex=True)
    df['Size'] = size

    # reindex guarantees a second column even when no value has an upper bound.
    bounds = size.str.split(',', n=1, expand=True).reindex(columns=[0, 1])
    # '+' is meant literally here (it is not a valid regex on its own).
    df['Lo Em'] = bounds[0].str.replace('+', '', regex=False)
    # Fill missing upper bounds directly rather than round-tripping through the
    # string 'None', which depends on how the missing value is represented.
    df['Hi Em'] = bounds[1].fillna('Unknown').astype(str)
    return df

In [63]:
def clean_headquarters(df):
    """Run every 'Headquarters' value through ``RemoveStranger``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Headquarters' column; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with the column replaced by the cleaned values.
    """
    cleaned = df.Headquarters.apply(RemoveStranger)
    df['Headquarters'] = cleaned
    return df

In [64]:
def clean_industry(df):
    """Run every 'Industry' value through ``RemoveStranger``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'Industry' column; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with the column replaced by the cleaned values.
    """
    cleaned = df.Industry.apply(RemoveStranger)
    df['Industry'] = cleaned
    return df

In [65]:
def clean_sector(df):
    """Run every 'Sector' value through ``RemoveStranger``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Sector' column; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with the column replaced by the cleaned values.
    """
    cleaned = df.Sector.apply(RemoveStranger)
    df['Sector'] = cleaned
    return df

In [66]:
def clean_rev(df):
    """Normalise the free-text 'Revenue' column and derive 'Lo Rev' / 'Hi Rev'.

    'Revenue' is rewritten to a compact string of plain numbers:

    * ``'$100 to $500 million (USD)'``  -> ``'100000000,500000000'``
    * ``'$500 million to $1 billion'``  -> ``'500000000,1000000000'``
    * ``'$10+ billion (USD)'``          -> ``'10000000000'``
    * ``'Less than $1 million (USD)'``  -> ``'Less than1000000'``
    * anything without a number and a million/billion unit -> ``'Unknown'``

    'Lo Rev' and 'Hi Rev' are then filled from that string. A single amount
    is used for both bounds; a "Less than X" amount gives Lo Rev 'Unknown'
    and Hi Rev X.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column 'Revenue'; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with 'Revenue' rewritten and string columns 'Lo Rev'
        and 'Hi Rev' added.
    """
    pat = re.compile(
        r'^(Less than)*[^0-9]*(\d*)[^0-9a-z]*(million|billion)*[^0-9]*(\d+)*[^0-9a-z]*(million|billion)',
        re.I)

    def preliminary_clean(x):
        """Turn one raw revenue text into the compact form described above."""
        m = pat.match(x.strip())
        # Without a leading amount there is nothing to scale; previously this
        # path hit an unbound local instead of reporting 'Unknown'.
        if not m or not m.group(2):
            return 'Unknown'
        less_than, lo_digits, lo_unit, hi_digits, hi_unit = m.groups()
        # The lower amount uses its own unit when present, otherwise it shares
        # the trailing one. Units are lower-cased because the pattern matches
        # case-insensitively while r_dic only has lower-case keys.
        lo_r = int(lo_digits) * int(r_dic[(lo_unit or hi_unit).lower()])
        res = ('Less than' if less_than else '') + str(lo_r)
        if hi_digits:
            res += ',' + str(int(hi_digits) * int(r_dic[hi_unit.lower()]))
        return res

    def split_bounds(cleaned):
        """Return the (lo, hi) pair for one compact revenue string."""
        lo, sep, hi = cleaned.partition(',')
        if sep:
            return lo, hi.strip()
        if lo.startswith('Less than'):
            # "Less than X": X is an upper bound, the lower bound is unknown.
            return 'Unknown', lo[len('Less than'):]
        # Single amount (or 'Unknown'): use it for both bounds.
        return lo, lo

    revenue = df['Revenue'].apply(RemoveStranger).astype(str)
    df['Revenue'] = revenue.apply(preliminary_clean)

    # Built positionally, so the result does not depend on df's index labels.
    bounds = [split_bounds(v) for v in df['Revenue']]
    df['Lo Rev'] = [lo for lo, _ in bounds]
    df['Hi Rev'] = [hi for _, hi in bounds]
    return df

In [67]:
def discretization_bs(df):
    """Convert the bound columns to integers and bin companies by size.

    'Lo Rev', 'Hi Rev', 'Lo Em' and 'Hi Em' are converted from strings to
    int64, with 'Unknown' encoded as -1. Two categorical columns are added:

    * 'BS_E' - size class from 'Hi Em' (employees).
    * 'BS_R' - size class from 'Hi Rev' (revenue); where that class is
      'Other' and 'BS_E' is not, the 'BS_E' value is used instead.

    Values outside the bin ranges get a missing category.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the four string bound columns; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with the bound columns converted and 'BS_R' / 'BS_E'
        added.
    """
    for col in ('Lo Rev', 'Hi Rev', 'Lo Em', 'Hi Em'):
        # 'Unknown' is a plain literal, so match it literally on every pandas version.
        df[col] = df[col].str.replace('Unknown', '-1', regex=False).astype('int64')

    size_labels = ['Other', 'Micro', 'Small', 'Medium', 'Large']
    # The first bin (-2, 0] catches the -1 'Unknown' marker.
    by_revenue = pd.cut(x=df['Hi Rev'],
                        bins=[-2, 0, 2000000, 10000000, 50000000, 100000000000],
                        labels=size_labels)
    # NOTE(review): a -1 'Hi Em' lands in 'Other'; that presumably includes
    # open-ended sizes whose upper bound is unknown - confirm this is intended.
    by_employees = pd.cut(x=df['Hi Em'],
                          bins=[-2, 0, 10, 50, 250, 10000],
                          labels=size_labels)

    # Fall back to the employee-based class when revenue gives no information.
    # Done as a mask so it works for any index labels and for missing categories.
    use_employees = (by_revenue == 'Other') & (by_employees != 'Other')
    df['BS_R'] = by_revenue.where(~use_employees, by_employees)
    df['BS_E'] = by_employees
    return df

In [68]:
def clean(File_Path):
    """Read one source file and run it through the whole cleaning pipeline.

    Parameters
    ----------
    File_Path : str
        Name of the CSV file to load; also passed to ``clean_job``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by the last pipeline step.
    """
    # clean_job is the only step that also needs the file name.
    df = clean_job(File_Path, read_data(File_Path))
    # The remaining steps run in this fixed order; discretization_bs comes
    # last because it consumes columns created by clean_size and clean_rev.
    pipeline = (
        clean_salary,
        clean_cname,
        clean_location,
        clean_size,
        clean_headquarters,
        clean_industry,
        clean_sector,
        clean_rev,
        discretization_bs,
    )
    for step in pipeline:
        df = step(df)
    return df

In [69]:
def data_preprocessing():
    """Clean every file in ``File_Path``, merge the results and save them.

    Each file is processed with ``clean``; the frames are stacked into one
    table with a fresh 0..N-1 index, the leftover helper columns
    'Unnamed: 0' and 'index' are removed when present, and the result is
    written to ``Save_Path`` as CSV (index included).

    Returns
    -------
    None
    """
    frames = [clean(path) for path in File_Path]
    # ignore_index renumbers the rows; the original reset_index() call discarded
    # its result, so the saved index repeated once per input file.
    data = pd.concat(frames, ignore_index=True)
    # errors='ignore': not every input necessarily carries both helper columns.
    data = data.drop(columns=['Unnamed: 0', 'index'], errors='ignore')
    data.to_csv(Save_Path)

In [70]:
data_preprocessing()