### 1. data cleaning

In [161]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
import sklearn.metrics as skm
from sklearn.model_selection import train_test_split
import operator as op

from sklearn.impute import SimpleImputer

import seaborn as sns

In [162]:
# Load the Glassdoor jobs CSV into `data` and preview the first rows.
# NOTE(review): absolute, machine-specific path -- the notebook only runs where this file exists.
raw_csv_path = '/Users/liminzhenscc/Documents/study/python_data_analyze/project/2data_sc_salary/glassdoor_jobs1.csv'
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0,Job Title,Salary Estimate,Average Salary,Job Description,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue
0,Data Science Graduate/Undergraduate - Immed St...,$97K - $141K (Glassdoor Est.),"$116,841 /yr (est.)",Be part of the Award Winning CIMIC Graduate Pr...,3.8,UGL Limited\n3.8,North Shore,5001 to 10000 Employees,1984,Subsidiary or Business Segment,Construction,"Construction, Repair & Maintenance Services",Unknown / Non-Applicable
1,Graduate Data Science - Brisbane (Corporate),$97K - $141K (Glassdoor Est.),"$128,589 /yr (est.)",Job Number:\n82295\nWork type:\nPermanent - Fu...,3.7,Aurizon\n3.7,Brisbane,1001 to 5000 Employees,2009,Company - Public,Taxi & Car Services,Transportation & Logistics,$2 to $5 billion (USD)
2,Consulting - Data & AI - 2022/23 Summer Vacati...,$97K - $141K (Glassdoor Est.),"$78,167 /yr (est.)","Date: 18-Jul-2022\n\nLocation:\nCanberra, ACT,...",4.0,Deloitte\n4.0,Canberra,10000+ Employees,1850,Company - Private,Accounting & Tax,Finance,$10+ billion (USD)
3,Data Scientist,$97K - $141K (Glassdoor Est.),"$125,000 /yr (est.)",Job description\nData Scientist Skills\nProgra...,-1.0,GoTech Solutions Pty Ltd,Sydney,Unknown,-1,Company - Public,-1,-1,Unknown / Non-Applicable
4,Research Associate - Data Science,$97K - $141K (Glassdoor Est.),"$102,500 /yr (est.)",Job no: 510607\nWork type: Full time\nLocation...,4.2,University of New South Wales\n4.2,Sydney,1001 to 5000 Employees,1949,College / University,Colleges & Universities,Education,Unknown / Non-Applicable


In [163]:
# Summarise column names, non-null counts and dtypes of the freshly loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1261 entries, 0 to 1260
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Job Title          1261 non-null   object 
 1   Salary Estimate    1261 non-null   object 
 2   Average Salary     1261 non-null   object 
 3   Job Description    1261 non-null   object 
 4   Rating             1261 non-null   float64
 5   Company Name       1261 non-null   object 
 6   Location           1261 non-null   object 
 7   Size               1261 non-null   object 
 8   Founded            1261 non-null   int64  
 9   Type of ownership  1261 non-null   object 
 10  Industry           1261 non-null   object 
 11  Sector             1261 non-null   object 
 12  Revenue            1261 non-null   object 
dtypes: float64(1), int64(1), object(11)
memory usage: 128.2+ KB


Columns that need cleaning:
1. Salary Estimate
2. Average Salary
3. Company Name
4. age (computed from Founded)
5. Job Description (data scientist, data analyst, machine learning, data engineering)
6. Size (replace Unknown with -1)


There are 13 columns. Except for Founded and Rating, all columns have the object dtype, so we need to do some data cleaning to extract the information we may need for the final model.

In [183]:
#Average Salary
ave_sal = data['Average Salary'].apply(lambda x: x.split('/')[0].replace('$', '').replace(',',''))
data['ave_salary'] = pd.to_numeric(ave_sal)
data.ave_salary

0       116,841 
1       128,589 
2        78,167 
3       125,000 
4       102,500 
          ...   
1256     72,849 
1257     72,849 
1258    120,000 
1259     67,572 
1260     81,505 
Name: Average Salary, Length: 1261, dtype: object

In [165]:
#dealing with Salary Estimate. get the lowest and maximum value.
salary = data['Salary Estimate'].apply(lambda x: x.split('(')[0])
salary = salary.apply(lambda x: x.replace('K', '').replace('$', ''))
salary

0       97 - 141 
1       97 - 141 
2       97 - 141 
3       97 - 141 
4       97 - 141 
          ...    
1256     73 - 91 
1257           -1
1258     73 - 91 
1259     73 - 91 
1260     73 - 91 
Name: Salary Estimate, Length: 1261, dtype: object

In [166]:
def _salary_bound(estimate, position):
    """Return one end of a cleaned "low - high" salary string as an int.

    estimate: text such as '97 - 141 ' (values in thousands), or the
        sentinel '-1' meaning "no estimate".
    position: 0 for the lower bound, -1 for the upper bound.

    Returns -1 for the sentinel, with or without surrounding whitespace.
    Raises ValueError if the selected part is not an integer.
    """
    text = estimate.strip()
    if text == '-1':
        # Splitting the sentinel on '-' would yield an empty first part and
        # int('') raises, so handle it before splitting.
        return -1
    return int(text.split('-')[position])


# min/max are expressed in thousands (e.g. 97, 141), whereas ave_salary is in
# whole dollars -- rescale one of them before combining the columns.
data['min_salary'] = salary.apply(_salary_bound, position=0)
data['max_salary'] = salary.apply(_salary_bound, position=-1)
data.head()

Unnamed: 0,Job Title,Salary Estimate,Average Salary,Job Description,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue,min_salary,max_salary
0,Data Science Graduate/Undergraduate - Immed St...,$97K - $141K (Glassdoor Est.),"$116,841 /yr (est.)",Be part of the Award Winning CIMIC Graduate Pr...,3.8,UGL Limited\n3.8,North Shore,5001 to 10000 Employees,1984,Subsidiary or Business Segment,Construction,"Construction, Repair & Maintenance Services",Unknown / Non-Applicable,97,141
1,Graduate Data Science - Brisbane (Corporate),$97K - $141K (Glassdoor Est.),"$128,589 /yr (est.)",Job Number:\n82295\nWork type:\nPermanent - Fu...,3.7,Aurizon\n3.7,Brisbane,1001 to 5000 Employees,2009,Company - Public,Taxi & Car Services,Transportation & Logistics,$2 to $5 billion (USD),97,141
2,Consulting - Data & AI - 2022/23 Summer Vacati...,$97K - $141K (Glassdoor Est.),"$78,167 /yr (est.)","Date: 18-Jul-2022\n\nLocation:\nCanberra, ACT,...",4.0,Deloitte\n4.0,Canberra,10000+ Employees,1850,Company - Private,Accounting & Tax,Finance,$10+ billion (USD),97,141
3,Data Scientist,$97K - $141K (Glassdoor Est.),"$125,000 /yr (est.)",Job description\nData Scientist Skills\nProgra...,-1.0,GoTech Solutions Pty Ltd,Sydney,Unknown,-1,Company - Public,-1,-1,Unknown / Non-Applicable,97,141
4,Research Associate - Data Science,$97K - $141K (Glassdoor Est.),"$102,500 /yr (est.)",Job no: 510607\nWork type: Full time\nLocation...,4.2,University of New South Wales\n4.2,Sydney,1001 to 5000 Employees,1949,College / University,Colleges & Universities,Education,Unknown / Non-Applicable,97,141


In [167]:
#delete rating after company name
data['company_name'] = data['Company Name'].apply(lambda x: x.split('\n')[0])
data.head()

Unnamed: 0,Job Title,Salary Estimate,Average Salary,Job Description,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue,min_salary,max_salary,company_name
0,Data Science Graduate/Undergraduate - Immed St...,$97K - $141K (Glassdoor Est.),"$116,841 /yr (est.)",Be part of the Award Winning CIMIC Graduate Pr...,3.8,UGL Limited\n3.8,North Shore,5001 to 10000 Employees,1984,Subsidiary or Business Segment,Construction,"Construction, Repair & Maintenance Services",Unknown / Non-Applicable,97,141,UGL Limited
1,Graduate Data Science - Brisbane (Corporate),$97K - $141K (Glassdoor Est.),"$128,589 /yr (est.)",Job Number:\n82295\nWork type:\nPermanent - Fu...,3.7,Aurizon\n3.7,Brisbane,1001 to 5000 Employees,2009,Company - Public,Taxi & Car Services,Transportation & Logistics,$2 to $5 billion (USD),97,141,Aurizon
2,Consulting - Data & AI - 2022/23 Summer Vacati...,$97K - $141K (Glassdoor Est.),"$78,167 /yr (est.)","Date: 18-Jul-2022\n\nLocation:\nCanberra, ACT,...",4.0,Deloitte\n4.0,Canberra,10000+ Employees,1850,Company - Private,Accounting & Tax,Finance,$10+ billion (USD),97,141,Deloitte
3,Data Scientist,$97K - $141K (Glassdoor Est.),"$125,000 /yr (est.)",Job description\nData Scientist Skills\nProgra...,-1.0,GoTech Solutions Pty Ltd,Sydney,Unknown,-1,Company - Public,-1,-1,Unknown / Non-Applicable,97,141,GoTech Solutions Pty Ltd
4,Research Associate - Data Science,$97K - $141K (Glassdoor Est.),"$102,500 /yr (est.)",Job no: 510607\nWork type: Full time\nLocation...,4.2,University of New South Wales\n4.2,Sydney,1001 to 5000 Employees,1949,College / University,Colleges & Universities,Education,Unknown / Non-Applicable,97,141,University of New South Wales


In [168]:
# Encode unknown revenue as -1. Only strings are inspected: the -1 written by
# an earlier run of this cell is an int with no .lower(), so without the
# isinstance guard re-running the cell raised AttributeError.
data['Revenue'] = data['Revenue'].apply(
    lambda x: -1 if isinstance(x, str) and 'unknown' in x.lower() else x
)


In [169]:
# Spot-check the first Revenue values after the unknown -> -1 recode.
data.Revenue.head()

0                        -1
1    $2 to $5 billion (USD)
2        $10+ billion (USD)
3                        -1
4                        -1
Name: Revenue, dtype: object

In [170]:
#age of company
data['age'] = data.Founded.apply(lambda x: x if x==-1 else 2022-x)
data.head()

Unnamed: 0,Job Title,Salary Estimate,Average Salary,Job Description,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue,min_salary,max_salary,company_name,age
0,Data Science Graduate/Undergraduate - Immed St...,$97K - $141K (Glassdoor Est.),"$116,841 /yr (est.)",Be part of the Award Winning CIMIC Graduate Pr...,3.8,UGL Limited\n3.8,North Shore,5001 to 10000 Employees,1984,Subsidiary or Business Segment,Construction,"Construction, Repair & Maintenance Services",-1,97,141,UGL Limited,38
1,Graduate Data Science - Brisbane (Corporate),$97K - $141K (Glassdoor Est.),"$128,589 /yr (est.)",Job Number:\n82295\nWork type:\nPermanent - Fu...,3.7,Aurizon\n3.7,Brisbane,1001 to 5000 Employees,2009,Company - Public,Taxi & Car Services,Transportation & Logistics,$2 to $5 billion (USD),97,141,Aurizon,13
2,Consulting - Data & AI - 2022/23 Summer Vacati...,$97K - $141K (Glassdoor Est.),"$78,167 /yr (est.)","Date: 18-Jul-2022\n\nLocation:\nCanberra, ACT,...",4.0,Deloitte\n4.0,Canberra,10000+ Employees,1850,Company - Private,Accounting & Tax,Finance,$10+ billion (USD),97,141,Deloitte,172
3,Data Scientist,$97K - $141K (Glassdoor Est.),"$125,000 /yr (est.)",Job description\nData Scientist Skills\nProgra...,-1.0,GoTech Solutions Pty Ltd,Sydney,Unknown,-1,Company - Public,-1,-1,-1,97,141,GoTech Solutions Pty Ltd,-1
4,Research Associate - Data Science,$97K - $141K (Glassdoor Est.),"$102,500 /yr (est.)",Job no: 510607\nWork type: Full time\nLocation...,4.2,University of New South Wales\n4.2,Sydney,1001 to 5000 Employees,1949,College / University,Colleges & Universities,Education,-1,97,141,University of New South Wales,73


In [187]:
# Binary skill flags: 1 when the keyword occurs anywhere in the lower-cased
# job description, else 0.
# r_yn is built here because a later cell reads data['r_yn']; with its line
# commented out, that cell raised KeyError on a fresh kernel.
# NOTE(review): this is plain substring matching, so 'excel' also hits
# "excellent" and 'aws' hits "laws"; 'r studio' matched no rows in the
# recorded output. Consider word-boundary patterns if these flags feed a model.
description_lower = data['Job Description'].str.lower()
skill_keywords = {
    'python_yn': 'python',
    'r_yn': 'r studio',
    'ML_yn': 'machine learning',
    'excel_yn': 'excel',
    'spark_yn': 'spark',
    'aws_yn': 'aws',
}
for flag_column, keyword in skill_keywords.items():
    data[flag_column] = (
        description_lower.str.contains(keyword, regex=False).astype('int64')
    )

In [188]:
# Flag postings whose title contains "graduate" (case-insensitive substring,
# so titles with "Undergraduate" are flagged as well), then show the split.
data['graduate_yn'] = (
    data['Job Title']
    .str.lower()
    .str.contains('graduate', regex=False)
    .astype('int64')
)
data.graduate_yn.value_counts()

0    888
1    373
Name: graduate_yn, dtype: int64

In [189]:
# Class balance of the R keyword flag; requires the r_yn column to already exist on data.
data['r_yn'].value_counts()

0    1261
Name: r_yn, dtype: int64

In [173]:
# Encode unknown company size as -1. Only strings are inspected: the -1
# written by an earlier run of this cell is an int with no .lower(), so
# without the isinstance guard re-running the cell raised AttributeError.
# Explicit column assignment is used instead of attribute-style `data.Size = ...`.
data['Size'] = data['Size'].apply(
    lambda x: -1 if isinstance(x, str) and 'unknown' in x.lower() else x
)

In [185]:
# Re-inspect the frame to confirm which columns it now holds and their dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1261 entries, 0 to 1260
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Job Title          1261 non-null   object 
 1   Salary Estimate    1261 non-null   object 
 2   Average Salary     1261 non-null   object 
 3   Job Description    1261 non-null   object 
 4   Rating             1261 non-null   float64
 5   Company Name       1261 non-null   object 
 6   Location           1261 non-null   object 
 7   Size               1261 non-null   object 
 8   Founded            1261 non-null   int64  
 9   Type of ownership  1261 non-null   object 
 10  Industry           1261 non-null   object 
 11  Sector             1261 non-null   object 
 12  Revenue            1261 non-null   object 
 13  min_salary         1261 non-null   int64  
 14  max_salary         1261 non-null   int64  
 15  company_name       1261 non-null   object 
 16  age                1261 

In [186]:
# Persist the cleaned frame. to_csv is left at its default index=True, so the
# row index is written as the first, unnamed column of the file.
# NOTE(review): absolute, machine-specific output path.
cleaned_csv_path = '/Users/liminzhenscc/Documents/study/python_data_analyze/project/2data_sc_salary/glassdoor_jobs_cleaned2.csv'
data.to_csv(cleaned_csv_path)

