In [1]:
#import necessary libraries 
import numpy as np
import pandas as pd

In [2]:
#read file
df=pd.read_csv('dsjobs_version3.csv')
#to make sure that we see all columns of df
pd.options.display.max_columns = None

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Job Title          500 non-null    object 
 1   Salary Estimate    500 non-null    object 
 2   Job Description    499 non-null    object 
 3   Rating             500 non-null    float64
 4   Company Name       500 non-null    object 
 5   Location           500 non-null    object 
 6   Headquarters       500 non-null    object 
 7   Size               500 non-null    object 
 8   Founded            500 non-null    int64  
 9   Type of ownership  500 non-null    object 
 10  Industry           500 non-null    object 
 11  Sector             500 non-null    object 
 12  Revenue            500 non-null    object 
 13  Competitors        500 non-null    object 
dtypes: float64(1), int64(1), object(12)
memory usage: 54.8+ KB


In [3]:
df.head()

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors
0,Data Scientist,"₹ 9,06,841",Functional Analytics Lead\n\nJob details:\nWor...,3.8,Capgemini\n3.8,Bengaluru,"Paris, France",10000+ employees,1967,Company - Public,Enterprise Software & Network Solutions,Information Technology,₹500+ billion (INR),"Accenture, CGI, Sopra Steria"
1,Data Scientist,₹221K - ₹238K,Provide analytical insights into emerging prob...,3.4,PayPal\n3.4,Bengaluru,"San Jose, CA",10000+ employees,1998,Company - Public,Internet,Information Technology,₹500+ billion (INR),"Square, Amazon, Apple"
2,Data Scientist,"₹ 25,38,462",Description\n\nSHIFT: Day Job\n\nSCHEDULE:\n\n...,3.6,Oracle\n3.6,Hyderabad,"Redwood City, CA",10000+ employees,1977,Company - Public,Enterprise Software & Network Solutions,Information Technology,₹500+ billion (INR),"SAP, Salesforce, Microsoft"
3,Data Scientist,"₹ 15,66,457","At Amazon, we strive to be most customer-centr...",4.2,Amazon\n4.2,Hyderabad,"Seattle, WA",10000+ employees,1994,Company - Public,Internet,Information Technology,₹500+ billion (INR),"Google, Microsoft, Walmart"
4,Data Scientist,"₹ 10,83,817",,3.9,Accenture\n3.9,Mumbai,"Dublin, Ireland",10000+ employees,1989,Company - Public,Consulting,Business Services,₹500+ billion (INR),"Cognizant Technology Solutions, EY, McKinsey &..."


# Data cleaning 

## Salary Estimation

In [4]:
df['Salary Estimate'].head()

#Issues 

#1. Remove comma, rupee
#2. Handle range 
#3. Handle /mo

#removes all rows with '-1' as Salary
df = df[df['Salary Estimate'] != '-1']

#replace symbols and comma 
df['Salary Estimate'] = df['Salary Estimate'].str.replace(',', '')
df['Salary Estimate'] = df['Salary Estimate'].str.replace('₹ ', '')
df['Salary Estimate'] = df['Salary Estimate'].str.replace('₹', '')
df['Salary Estimate'] = df['Salary Estimate'].str.replace('K','000')
df['Salary Estimate'].head()

#factor will be 12 if we have monthly salary or else 1; this will be used to calculate annual salary
df['factor'] = np.where(df['Salary Estimate'].str.contains('/mo'),12,1)
df['factor'] = df['factor'].astype(int)

#remove '/mo'
df['Salary Estimate'] = df['Salary Estimate'].str.replace('/mo','')

#handling range values by splitting into min and max 
df['min'] = df['Salary Estimate'].str.split('-').str[0]
df['max'] = df['Salary Estimate'].str.split('-').str[1]

#for those values which are not present in range, min and max will be equal
df['max'] = df['max'].fillna(df['min'])
df['max']=df['max'].astype(int)
df['min']=df['min'].astype(int)

#calculate annual min and max salary by multiplying with factor 
df['Annual Min'] = df['min'] * df['factor']
df['Annual Min'].astype(int)
df['Annual Max'] = df['max'] * df['factor']
df['Annual Max'].astype(int)

#calculate avg salary using annual min and max 
df['Avg Salary'] = df['Annual Min'] + df['Annual Max']
df['Avg Salary'] = df['Avg Salary'].astype(int)
df['Avg Salary'] = df['Avg Salary'] / 2

#checking all cols and removing the ones which will not be needed 
df[['min','max','factor','Salary Estimate','Annual Min','Annual Max','Avg Salary']]
df.drop(['min','max','factor','Salary Estimate'],1,inplace=True)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 338 entries, 0 to 499
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Job Title          338 non-null    object 
 1   Job Description    337 non-null    object 
 2   Rating             338 non-null    float64
 3   Company Name       338 non-null    object 
 4   Location           338 non-null    object 
 5   Headquarters       338 non-null    object 
 6   Size               338 non-null    object 
 7   Founded            338 non-null    int64  
 8   Type of ownership  338 non-null    object 
 9   Industry           338 non-null    object 
 10  Sector             338 non-null    object 
 11  Revenue            338 non-null    object 
 12  Competitors        338 non-null    object 
 13  Annual Min         338 non-null    int32  
 14  Annual Max         338 non-null    int32  
 15  Avg Salary         338 non-null    float64
dtypes: float64(2), int32(2), i

## Extracting job title and seniority from job title

In [6]:
df['Job Title'].head()

def title_simplifier(title):
    if 'data scientist' in title.lower():
        return 'data scientist'
    elif 'data engineer' in title.lower():
        return 'data engineer'
    elif 'analyst' in title.lower():
        return 'analyst'
    elif 'machine learning' in title.lower():
        return 'mle'
    elif 'research' in title.lower():
        return 'research'
    elif 'software' in title.lower():
        return 'sw'
    elif 'manager' in title.lower():
        return 'manager'
    elif 'director' in title.lower():
        return 'director'
    else:
        return 'na'

df['job_type']=df['Job Title'].apply(title_simplifier)

#all the positions are clearly defined for for data scientist
df['job_type'].value_counts()

data scientist    338
Name: job_type, dtype: int64

In [7]:
def seniority(title):
    if 'sr' in title.lower() or 'senior' in title.lower() or 'sr' in title.lower() or 'lead' in title.lower() or 'principal' in title.lower():
        return 'senior'
    elif 'jr' in title.lower() or 'jr.' in title.lower():
        return 'jr'
    else:
        return 'na'

df['job_seniority'] = df['Job Title'].apply(seniority)
#there are 2 levels, sr. and jr. 
df['job_seniority'].value_counts()

#droppping job title
df.drop(['Job Title'],1,inplace=True)

## Job description length

In [8]:
df['job_len'] = df['Job Description'].str.len()
df['job_len'].head()

df['job_len'].isnull().sum()

df.info()

df.dropna(inplace=True)

df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 338 entries, 0 to 499
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Job Description    337 non-null    object 
 1   Rating             338 non-null    float64
 2   Company Name       338 non-null    object 
 3   Location           338 non-null    object 
 4   Headquarters       338 non-null    object 
 5   Size               338 non-null    object 
 6   Founded            338 non-null    int64  
 7   Type of ownership  338 non-null    object 
 8   Industry           338 non-null    object 
 9   Sector             338 non-null    object 
 10  Revenue            338 non-null    object 
 11  Competitors        338 non-null    object 
 12  Annual Min         338 non-null    int32  
 13  Annual Max         338 non-null    int32  
 14  Avg Salary         338 non-null    float64
 15  job_type           338 non-null    object 
 16  job_seniority      338 non

## Rating

In [9]:
df['Rating'].head()

df['Rating'].describe()

#we see that there are -1 ratings
df[df['Rating'] == -1.0]

#converting -1 ratings to 0
df.loc[df.Rating == -1.0, "Rating"] = 0.0

df['Rating'] = df['Rating'].astype(int)

df['Rating'].head()

df['Rating'].describe()

count    337.000000
mean       3.293769
std        0.715302
min        0.000000
25%        3.000000
50%        3.000000
75%        4.000000
max        5.000000
Name: Rating, dtype: float64

## Industry

In [10]:
df['Industry'].value_counts()

#191 companies do not have 'industry' specified in the data, for now, we will convert -1 to 'unknown'
df['Industry'] = np.where(df['Industry'] == '-1' , 'Unknown', df['Industry'])

df['Industry'].value_counts()

IT Services                                58
Unknown                                    52
Enterprise Software & Network Solutions    43
Internet                                   30
Computer Hardware & Software               28
Consulting                                 24
Staffing & Outsourcing                     13
Biotech & Pharmaceuticals                  11
Social Services                            10
Transportation Management                   6
Metal Brokerages                            6
Financial Analytics & Research              6
Electrical & Electronic Manufacturing       5
Advertising & Marketing                     4
Financial Transaction Processing            4
Healthcare Services & Hospitals             3
Food & Drink Manufacturing                  3
Oil & Gas Services                          2
TV Broadcasting & Cable Networks            2
Education Training Services                 2
Investment Banking & Asset Management       2
Lending                           

## Remove ratings from company name

In [11]:
df['Company Name']

df['Company Name'] = df['Company Name'].str.split('\n').str[0]

df['Company Name']

0                     Capgemini
1                        PayPal
2                        Oracle
3                        Amazon
5                    Quanticate
                 ...           
493                 TCG Digital
495         Egnify Technologies
497                      CIMMYT
498    MatchMove Global Pte Ltd
499        Golden Opportunities
Name: Company Name, Length: 337, dtype: object

## Separating state from HQ

In [12]:
df['Headquarters']

0                Paris, France
1                 San Jose, CA
2             Redwood City, CA
3                  Seattle, WA
5      Hitchin, United Kingdom
                ...           
493               Somerset, NJ
495           Hyderabad, India
497           New Delhi, India
498       Singapore, Singapore
499       Hallandale Beach, FL
Name: Headquarters, Length: 337, dtype: object

In [13]:
#hq_base because it has both values, states from USA and other countries like India, Japan
df['hq_base']=df['Headquarters'].str.split(',').str[1]

In [14]:
#hq city 
df['hq_city']=df['Headquarters'].str.split(',').str[0]

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 337 entries, 0 to 499
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Job Description    337 non-null    object 
 1   Rating             337 non-null    int32  
 2   Company Name       337 non-null    object 
 3   Location           337 non-null    object 
 4   Headquarters       337 non-null    object 
 5   Size               337 non-null    object 
 6   Founded            337 non-null    int64  
 7   Type of ownership  337 non-null    object 
 8   Industry           337 non-null    object 
 9   Sector             337 non-null    object 
 10  Revenue            337 non-null    object 
 11  Competitors        337 non-null    object 
 12  Annual Min         337 non-null    int32  
 13  Annual Max         337 non-null    int32  
 14  Avg Salary         337 non-null    float64
 15  job_type           337 non-null    object 
 16  job_seniority      337 non

In [16]:
#one company's hq is not mentioned
df[df['hq_base'].isna() == True]

Unnamed: 0,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,Annual Min,Annual Max,Avg Salary,job_type,job_seniority,job_len,hq_base,hq_city
108,Location: Bhubaneswar/\n\nTechnology:\nJob Des...,3,SILICON TECHLAB,Bhubaneswar,-1,1 to 50 employees,-1,Company - Private,IT Services,Information Technology,Unknown / Non-Applicable,-1,35000,38000,36500.0,data scientist,na,466.0,,-1


In [17]:
#updateing hq base to India and hq city to bhubaneshwar
df.loc[df.hq_city == '-1', "hq_base"] = 'India'
df.loc[df.hq_city == '-1', "hq_city"] = 'Bhubaneshwar'

In [18]:
df[df['hq_base'].isna() == True]

Unnamed: 0,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,Annual Min,Annual Max,Avg Salary,job_type,job_seniority,job_len,hq_base,hq_city


## Checking if location and hq city are same 

In [19]:
df['lochq'] = np.where(df['Location'] == df['hq_city'],1,0)

df['lochq'].value_counts()

0    257
1     80
Name: lochq, dtype: int64

## Calculate age of the company

In [20]:
df['Founded'].value_counts()

#86 companies have no foundation year 
df['age'] = df.Founded.apply(lambda x: x if x<1 else 2020 - x)

df['age'].value_counts()

-1      86
 5      21
 11     15
 10     14
 24     12
 6      11
 21     11
 7      10
 8       9
 22      9
 53      9
 17      8
 3       8
 13      7
 47      6
 26      6
 9       6
 4       6
 31      5
 34      5
 14      5
 15      5
 12      4
 45      4
 43      4
 23      3
 38      3
 25      3
 16      2
 27      2
 33      2
 36      2
 177     2
 40      2
 173     2
 144     2
 57      2
 140     2
 134     2
 29      1
 113     1
 2       1
 157     1
 154     1
 150     1
 130     1
 75      1
 110     1
 183     1
 73      1
 61      1
 18      1
 19      1
 20      1
 51      1
 44      1
 39      1
 35      1
 184     1
Name: age, dtype: int64

## Using Job description to find specific skills

In [21]:
#Parsing of Job Description (If certain tool skills like Python, R-studio, AWS, Excel are mentioned)
df['Job Description'] = df['Job Description'].astype(str) 

#If Python is a requirement in the JD
df['python_yn'] = df['Job Description'].apply(lambda x: 1 if 'python' in x.lower() else 0)
df.python_yn.value_counts()

#R-studio
df['R_yn'] = df['Job Description'].apply(lambda x: 1 if 'r studio' in x.lower() or 'r-studio' in x.lower() or 'r' in x.lower() else 0)
df.R_yn.value_counts()

#Spark
df['spark_yn'] = df['Job Description'].apply(lambda x: 1 if 'spark' in x.lower() else 0)
df.spark_yn.value_counts()

#AWS
df['aws_yn'] = df['Job Description'].apply(lambda x: 1 if 'aws' in x.lower() else 0)
df.aws_yn.value_counts()

#Excel
df['excel_yn'] = df['Job Description'].apply(lambda x: 1 if 'excel' in x.lower() else 0)
df.excel_yn.value_counts()

0    195
1    142
Name: excel_yn, dtype: int64

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 337 entries, 0 to 499
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Job Description    337 non-null    object 
 1   Rating             337 non-null    int32  
 2   Company Name       337 non-null    object 
 3   Location           337 non-null    object 
 4   Headquarters       337 non-null    object 
 5   Size               337 non-null    object 
 6   Founded            337 non-null    int64  
 7   Type of ownership  337 non-null    object 
 8   Industry           337 non-null    object 
 9   Sector             337 non-null    object 
 10  Revenue            337 non-null    object 
 11  Competitors        337 non-null    object 
 12  Annual Min         337 non-null    int32  
 13  Annual Max         337 non-null    int32  
 14  Avg Salary         337 non-null    float64
 15  job_type           337 non-null    object 
 16  job_seniority      337 non

In [23]:
df.head()

Unnamed: 0,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,Annual Min,Annual Max,Avg Salary,job_type,job_seniority,job_len,hq_base,hq_city,lochq,age,python_yn,R_yn,spark_yn,aws_yn,excel_yn
0,Functional Analytics Lead\n\nJob details:\nWor...,3,Capgemini,Bengaluru,"Paris, France",10000+ employees,1967,Company - Public,Enterprise Software & Network Solutions,Information Technology,₹500+ billion (INR),"Accenture, CGI, Sopra Steria",906841,906841,906841.0,data scientist,na,2462.0,France,Paris,0,53,1,1,1,0,0
1,Provide analytical insights into emerging prob...,3,PayPal,Bengaluru,"San Jose, CA",10000+ employees,1998,Company - Public,Internet,Information Technology,₹500+ billion (INR),"Square, Amazon, Apple",221000,238000,229500.0,data scientist,na,1452.0,CA,San Jose,0,22,1,1,0,0,0
2,Description\n\nSHIFT: Day Job\n\nSCHEDULE:\n\n...,3,Oracle,Hyderabad,"Redwood City, CA",10000+ employees,1977,Company - Public,Enterprise Software & Network Solutions,Information Technology,₹500+ billion (INR),"SAP, Salesforce, Microsoft",2538462,2538462,2538462.0,data scientist,na,1777.0,CA,Redwood City,0,43,1,1,0,0,0
3,"At Amazon, we strive to be most customer-centr...",4,Amazon,Hyderabad,"Seattle, WA",10000+ employees,1994,Company - Public,Internet,Information Technology,₹500+ billion (INR),"Google, Microsoft, Walmart",1566457,1566457,1566457.0,data scientist,na,4616.0,WA,Seattle,0,26,1,1,1,1,1
5,Overview\n\n\nWe have an exciting opportunity ...,4,Quanticate,Bengaluru,"Hitchin, United Kingdom",201 to 500 employees,1995,Company - Private,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,₹1 to ₹5 billion (INR),"GCE Solutions, Parexel, IQVIA",417000,451000,434000.0,data scientist,na,2405.0,United Kingdom,Hitchin,0,25,1,1,0,0,0


In [24]:
#saving the cleaned data separately for eda
df.to_csv('cleaned_data.csv')b