## Import Necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn import metrics
from sklearn.metrics import roc_auc_score, confusion_matrix, mean_squared_error

import warnings
# Suppress every warning for the rest of the session: no category or module
# filter is passed, so the 'ignore' action applies to all of them.
# NOTE(review): this also hides deprecation notices and pandas copy/assignment
# warnings raised by later cells — consider narrowing to specific categories.
warnings.filterwarnings('ignore')

## Reading the File

In [2]:
# Load the Glassdoor job listings into the working DataFrame.
# The path has no directory component, so it is resolved against the
# notebook's current working directory.
DATA_PATH = 'glassdoor_jobs.csv'
df = pd.read_csv(DATA_PATH)

In [4]:
# Schema check on the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Job Title          1000 non-null   object 
 1   Salary Estimate    1000 non-null   object 
 2   Job Description    1000 non-null   object 
 3   Rating             1000 non-null   float64
 4   Company Name       1000 non-null   object 
 5   Location           1000 non-null   object 
 6   Headquarters       1000 non-null   object 
 7   Size               1000 non-null   object 
 8   Founded            1000 non-null   int64  
 9   Type of ownership  1000 non-null   object 
 10  Industry           1000 non-null   object 
 11  Sector             1000 non-null   object 
 12  Revenue            1000 non-null   object 
 13  Competitors        1000 non-null   object 
dtypes: float64(1), int64(1), object(12)
memory usage: 109.5+ KB


In [6]:
# Preview the first rows of the frame (head() with no argument shows five).
df.head()

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors
0,Data Scientist,$64K-$106K (Glassdoor est.),"Secure our Nation, Ignite your Future\n\nJoin ...",4.2,ManTech\n4.2,"Chantilly, VA","Herndon, VA",5001 to 10000 employees,1968,Company - Public,Research & Development,Business Services,$1 to $2 billion (USD),-1
1,Associate Business Intelligence Data Scientist,$64K-$106K (Glassdoor est.),"Blending customer advisory, customer support, ...",3.2,Carousel Industries\n3.2,"Exeter, RI","Exeter, RI",1001 to 5000 employees,1992,Company - Private,IT Services,Information Technology,$500 million to $1 billion (USD),-1
2,"Scientist, Population Genomics",$64K-$106K (Glassdoor est.),WuXi NextCODE is seeking a motivated Scientist...,2.9,Carousel Industries\n3.2,"Cambridge, MA","Cambridge, MA",501 to 1000 employees,2015,Company - Private,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,Unknown / Non-Applicable,-1
3,Data Analyst II,$64K-$106K (Glassdoor est.),The Data Analyst II is responsible for data en...,4.2,"Insight Enterprises, Inc.\n4.2","Plano, TX","Tempe, AZ",5001 to 10000 employees,1988,Company - Public,Enterprise Software & Network Solutions,Information Technology,$5 to $10 billion (USD),"CDW, PCM, SHI International"
4,Sensory Scientist,$64K-$106K (Glassdoor est.),A leading food production company in St. Louis...,3.8,NIC Infotek\n3.8,"Saint Louis, MO","Tampa, FL",51 to 200 employees,2004,Company - Public,Staffing & Outsourcing,Business Services,Unknown / Non-Applicable,-1


In [7]:
# (rows, columns) of the loaded frame.
df.shape

(1000, 14)

In [11]:
# Frequency of each distinct raw 'Salary Estimate' string. The values are still
# text ranges with a source suffix (e.g. "(Glassdoor est.)"), not numbers.
df['Salary Estimate'].value_counts()

$96K-$156K (Glassdoor est.)     170
$97K-$156K (Glassdoor est.)      32
$111K-$181K (Glassdoor est.)     32
$57K-$97K (Glassdoor est.)       32
$57K-$98K (Glassdoor est.)       32
$60K-$101K (Glassdoor est.)      32
$109K-$178K (Glassdoor est.)     32
$60K-$100K (Glassdoor est.)      32
$108K-$171K (Glassdoor est.)     32
$58K-$69K (Glassdoor est.)       32
$69K-$118K (Glassdoor est.)      32
$55K-$91K (Glassdoor est.)       32
$90K-$115K(Employer est.)        32
$105K-$166K (Glassdoor est.)     32
$141K-$225K (Glassdoor est.)     32
$72K-$122K (Glassdoor est.)      32
$56K-$97K (Glassdoor est.)       32
$97K-$154K (Glassdoor est.)      32
$74K-$123K (Glassdoor est.)      32
$94K-$154K (Glassdoor est.)      32
$119K-$186K (Glassdoor est.)     32
$99K-$110K (Glassdoor est.)      32
$96K-$101K (Glassdoor est.)      32
$112K-$179K (Glassdoor est.)     32
$73K-$125K (Glassdoor est.)      32
$115K-$140K (Glassdoor est.)     32
$64K-$106K (Glassdoor est.)      30
Name: Salary Estimate, dtype: int64

In [13]:
# Frequency of each 'Headquarters' value; used to spot placeholder entries
# such as the string '-1'.
df['Headquarters'].value_counts()

San Francisco, CA      64
New York, NY           57
Boston, MA             30
Reston, VA             25
-1                     21
                       ..
Rolling Meadows, IL     1
Scottsdale, AZ          1
Plainsboro, NJ          1
Holyoke, MA             1
Phila, PA               1
Name: Headquarters, Length: 282, dtype: int64

In [17]:
# First five 'Job Description' entries; the text is long, so pandas truncates
# each value in the display.
df['Job Description'].head()

0    Secure our Nation, Ignite your Future\n\nJoin ...
1    Blending customer advisory, customer support, ...
2    WuXi NextCODE is seeking a motivated Scientist...
3    The Data Analyst II is responsible for data en...
4    A leading food production company in St. Louis...
Name: Job Description, dtype: object

In [14]:
# Frequency of each 'Rating' value. -1.0 appears among them — presumably a
# sentinel for a missing rating (TODO confirm against the data source).
df['Rating'].value_counts()

 3.6    68
 3.8    67
 4.0    65
 3.9    62
 3.5    61
 3.4    55
 5.0    51
 4.1    50
 4.2    48
-1.0    46
 4.5    46
 3.7    46
 4.3    42
 4.6    36
 3.1    32
 3.2    30
 3.3    29
 4.4    27
 4.7    24
 4.9    18
 2.9    18
 4.8    17
 3.0    17
 2.5     9
 2.3     8
 2.2     8
 2.7     7
 2.6     6
 2.8     4
 2.4     1
 1.5     1
 2.0     1
Name: Rating, dtype: int64

Cleaning tasks:

1. Remove the "(Glassdoor est.)" / "(Employer est.)" suffix and the "K" from Salary Estimate
2. Remove rows with -1 in Headquarters
3. Convert Salary Estimate from a range to a single value (the average)
4. Remove -1 from Rating
5. Remove the rating from the Company Name field
6. Add a State field derived from Location
7. Check whether the job Location and Headquarters are the same
8. Add an Age field for the company (current year - Founded)
9. Split Job Description into job skills (R, Python, AWS, Docker, etc.), job position (data scientist, analyst, etc.) and seniority level
10. Replace Competitors with the number of competitors and remove the -1 placeholder

## Data Cleaning

### Headquarters

In [30]:
# Drop postings whose 'Headquarters' is the '-1' placeholder string.
# .copy() makes the filtered result an independent frame, so columns added in
# later cleaning steps (State, Age, ...) are assigned to a real DataFrame
# rather than to a slice that pandas flags with SettingWithCopyWarning.
# Row labels are not reset here, so the index keeps gaps where rows were removed.
df = df.loc[df['Headquarters'] != '-1'].copy()

In [31]:
# Re-check the schema after the Headquarters filter; the entry count should
# be lower than in the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 979 entries, 0 to 999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Job Title          979 non-null    object 
 1   Salary Estimate    979 non-null    object 
 2   Job Description    979 non-null    object 
 3   Rating             979 non-null    float64
 4   Company Name       979 non-null    object 
 5   Location           979 non-null    object 
 6   Headquarters       979 non-null    object 
 7   Size               979 non-null    object 
 8   Founded            979 non-null    int64  
 9   Type of ownership  979 non-null    object 
 10  Industry           979 non-null    object 
 11  Sector             979 non-null    object 
 12  Revenue            979 non-null    object 
 13  Competitors        979 non-null    object 
dtypes: float64(1), int64(1), object(12)
memory usage: 114.7+ KB


In [32]:
# Confirm that '-1' no longer appears among the 'Headquarters' values.
df['Headquarters'].value_counts()

San Francisco, CA     64
New York, NY          57
Boston, MA            30
Reston, VA            25
Cambridge, MA         17
                      ..
Suwon, South Korea     1
Ventura, CA            1
Dearborn, MI           1
Alpharetta, GA         1
Lake Oswego, OR        1
Name: Headquarters, Length: 281, dtype: int64

### Salary

In [34]:
# Show the raw 'Salary Estimate' strings before parsing them; pandas
# abbreviates the long Series to its first and last rows.
df['Salary Estimate']

0      $64K-$106K (Glassdoor est.)
1      $64K-$106K (Glassdoor est.)
2      $64K-$106K (Glassdoor est.)
3      $64K-$106K (Glassdoor est.)
4      $64K-$106K (Glassdoor est.)
                  ...             
995    $96K-$156K (Glassdoor est.)
996    $96K-$156K (Glassdoor est.)
997    $96K-$156K (Glassdoor est.)
998    $96K-$156K (Glassdoor est.)
999    $96K-$156K (Glassdoor est.)
Name: Salary Estimate, Length: 979, dtype: object