In [1]:
import pandas as pd
import numpy as np
import re
import os
import datetime
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('./data/gsearch_jobs.csv')

In [4]:
df.shape

(36470, 27)

In [3]:
pd.set_option("display.max_columns", None)
df.head(2)

Unnamed: 0.1,Unnamed: 0,index,title,company_name,location,via,description,extensions,job_id,thumbnail,posted_at,schedule_type,work_from_home,salary,search_term,date_time,search_location,commute_time,salary_pay,salary_rate,salary_avg,salary_min,salary_max,salary_hourly,salary_yearly,salary_standardized,description_tokens
0,0,0,Data Analyst,Meta,Anywhere,via LinkedIn,In the intersection of compliance and analytic...,"['15 hours ago', '101K–143K a year', 'Work fro...",eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2...,https://encrypted-tbn0.gstatic.com/images?q=tb...,15 hours ago,Full-time,True,101K–143K a year,data analyst,2023-08-04 03:00:13.797776,United States,,101K–143K,a year,122000.0,101000.0,143000.0,,122000.0,122000.0,"['r', 'tableau', 'sql', 'python']"
1,1,1,Data Analyst,ATC,United States,via LinkedIn,Job Title: Entry Level Business Analyst / Prod...,"['12 hours ago', 'Full-time', 'Health insurance']",eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2...,https://encrypted-tbn0.gstatic.com/images?q=tb...,12 hours ago,Full-time,,,data analyst,2023-08-04 03:00:13.797776,United States,,,,,,,,,,[]


## Drop columns

In [5]:
columns_to_drop = ['Unnamed: 0','index','thumbnail','posted_at','commute_time','search_term','salary','salary_pay','salary_rate','salary_min','salary_max','salary_avg','salary_hourly','salary_yearly','extensions']
df.drop(columns=columns_to_drop, inplace=True)

## Drop duplicated rows

In [6]:
# Find duplicates based on 'title' and 'company' columns (in this case cannot use job_id as in some cases any minimal difference in time of publication gives a new job_id)
duplicate_mask = df.duplicated(subset=['title', 'company_name'], keep=False)

# Get both original and duplicate rows for double-check
original_and_duplicates = df[duplicate_mask].sort_values(by=['title', 'company_name'])
original_and_duplicates.head(2)

Unnamed: 0,title,company_name,location,via,description,job_id,schedule_type,work_from_home,date_time,search_location,salary_standardized,description_tokens
13088,"""Data Analyst - Spatial Trend, Sens Slope, Spa...",Upwork,Anywhere,via Upwork,I am looking for a skilled Python Code Develop...,eyJqb2JfdGl0bGUiOiJcIkRhdGEgQW5hbHlzdCAtIFNwYX...,Contractor,True,2023-07-12 03:00:36.170543,United States,,['python']
13089,"""Data Analyst - Spatial Trend, Sens Slope, Spa...",Upwork,Anywhere,via Upwork,I am looking for a skilled Python Code Develop...,eyJqb2JfdGl0bGUiOiJcIkRhdGEgQW5hbHlzdCAtIFNwYX...,Contractor,True,2023-07-12 03:00:40.643589,United States,,['python']


In [7]:
# Drop the duplicate rows
df.drop_duplicates(subset=['title', 'company_name'], keep='first', inplace=True)

## Creating a New Column: "experience_level"

In [8]:
def categorize_experience(title):
    title_lower = title.lower()
    if any(word in title_lower for word in [ 'intern','internship']):
        return 'Intern'
    elif any(word in title_lower for word in ['manager', 'executive', 'principal', 'staff']):
        return 'Manager'
    elif any(word in title_lower for word in ['director']):
        return 'Director'
    elif any(word in title_lower for word in ['mid-senior','mid level','mid-level','ii','specialist']):
        return 'Mid-Senior'
    elif any(word in title_lower for word in ['jr', 'jr.','junior','entry', 'associate','1']):
        return 'Junior'
    elif any(word in title_lower for word in ['sr.', 'sr', 'senior','exper','3','4','lead','steward']):
        return 'Senior'
    elif any(word in title for word in ['III','IV']):
        return 'Senior'
    elif 'II' in title:
        return 'Mid-Senior'
    elif re.search(r'\bI\b', title):
        return 'Junior'
    else:
        return None

In [9]:
# Apply the function to create the 'experience_level' column
df['experience_level'] = df['title'].apply(categorize_experience)

In [10]:
df['experience_level'].value_counts()

experience_level
Senior        4796
Junior        1250
Mid-Senior    1221
Manager        778
Intern         286
Director        86
Name: count, dtype: int64

## Cleaning column: "title"

In [11]:
def clean_title(title):
    title_lower = title.lower()
    if 'engineer' in title_lower:
        if 'data' in title_lower:
            return 'Data Engineer'
    elif 'software' in title_lower:
        return 'Software Engineer'
    elif 'cloud' in title_lower:
        return 'Cloud Engineer'
    elif 'scien' in title_lower:
        return 'Data Scientist'
    elif re.search(r'\bBI\b', title) or re.search(r'\bbusiness\s*(?!data\s*)\w*\s*analyst\b|\bbusiness\s*(?!data\s*)\w*\s*intelligence\b|\bfinancial\s*analyst\b', title_lower):
        return 'Business Analyst'
    elif re.search(r'\bdata\b|\banalytics?\b', title_lower):
        return 'Data Analyst'
    elif re.search(r'\bdata\b.*\b(analyst|analytics|analysis)\b|\banalytics?\b', title_lower):
        return 'Data Analyst'
    else:
        return None

In [12]:
# Apply the function and create the 'title_cleaned' column in clear data role categories 
df.loc[:, 'title_cleaned'] = df['title'].apply(clean_title)

In [13]:
df['title_cleaned'].value_counts()

title_cleaned
Data Analyst         15415
Business Analyst      1430
Data Scientist        1203
Data Engineer          660
Cloud Engineer          56
Software Engineer       41
Name: count, dtype: int64

In [14]:
# Filter rows with None in 'title_cleaned' column to analyze if there are more key words that can help to categorize the titles
none_titles = df[df['title_cleaned'].isnull()]
none_titles.sample(2)

Unnamed: 0,title,company_name,location,via,description,job_id,schedule_type,work_from_home,date_time,search_location,salary_standardized,description_tokens,experience_level,title_cleaned
24644,Examination papper on Regression Analysis,Upwork,Anywhere,via Upwork,"Examination task\n\nIn the examination task, a...",eyJqb2JfdGl0bGUiOiJFeGFtaW5hdGlvbiBwYXBwZXIgb2...,Contractor,True,2022-12-03 04:00:42.761612,United States,,"['word', 'excel']",,
18670,Sr bi developer,CRB,"Kansas City, MO",via Talent.com,Job Description\n\nCRB is looking for an energ...,eyJqb2JfdGl0bGUiOiJTciBiaSBkZXZlbG9wZXIiLCJodG...,Full-time,,2022-11-07 15:17:30.126625,United States,,"['dax', 'powershell', 'ssis', 'tableau', 'azur...",Senior,


## Creating a New Column: "Role"

In [15]:
def get_role_name(row):
    title_cleaned = row['title_cleaned']
    experience_level = row['experience_level']
    
    if title_cleaned is None and experience_level is None:
        return None
    elif experience_level is None:
        return title_cleaned
    elif title_cleaned is None:
        return None
    else:
        return f"{title_cleaned} {experience_level}"

In [16]:
df['role'] = df.apply(get_role_name, axis=1)

In [17]:
# Rearrenged columns
new_order = ['job_id','title','experience_level','title_cleaned','role'] + [col for col in df.columns if col not in ['job_id','title', 'experience_level','title_cleaned','role']]
df = df[new_order]

## Cleaning column: "location"

In [18]:
# Dictionary of state names and abbreviations
state_name_to_abbreviation = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA', 'Colorado': 'CO',
    'Connecticut': 'CT', 'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID',
    'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA',
    'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN',
    'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV',
    'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC',
    'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA',
    'Rhode Island': 'RI', 'South Carolina': 'SC', 'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX',
    'Utah': 'UT', 'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI',
    'Wyoming': 'WY'
}

# Function to replace state names with abbreviations
def replace_state_with_abbreviation(text):
    if pd.isna(text):
        return text  
    # Removing additional text after the state abbreviation in ()
    text = re.sub(r'\s*\(\+\d+\s*other[s]?\)', '', str(text))
    for state_name, state_abbr in state_name_to_abbreviation.items():
        if state_name.lower() in text.lower():
            return state_abbr
    # Return non-state values as is
    return text  

# Extract information after comma and removing leading andtrailing spaces
df['location'] = df['location'].fillna(np.nan)  # Fill NaN values explicitly
df['location'] = df['location'].str.split(',').str[-1].str.strip()

# Replace 'Anywhere' with 'United States' 
df['location'] = df['location'].str.replace('Anywhere', 'United States', case=False)

# Update the entire column to 'USA' if it contains the phrase 'United States'
df.loc[df['location'].str.contains('United States', case=False, na=False), 'location'] = 'USA'

# Replace state names with abbreviations
df['location'] = df['location'].apply(replace_state_with_abbreviation)

In [19]:
df['location'].value_counts()

location
USA    17911
MO      1259
KS       864
OK       771
AR       454
CA        88
TX        15
NE         8
NY         7
MA         5
PA         4
NJ         2
DC         2
NH         1
CT         1
Name: count, dtype: int64

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

def extract_location(text):
    doc = nlp(text)
    locations = [ent.text for ent in doc.ents if ent.label_ == "GPE"]  # GPE = geopolitical entity
    if locations:
        # Assumption: The first location mentioned is the relevant one
        return locations[0]
    return None

In [None]:
# Fill NaN values with 'USA' or any default value
df['location'] = df['location'].fillna('USA')  
# Filter rows with 'USA' or null
mask = (df['location'] == 'USA') | df['location'].isnull()  

for idx, row in df[mask].iterrows():
    location = extract_location(row['description'])
    if location:
        # Update 'location' column with the extracted location
        df.loc[idx, 'location'] = location 

# Convert the extracted locations to state abbreviations 
df['location'] = df['location'].map(state_name_to_abbreviation.get)

## Cleaning column: "via"

In [20]:
# Getting rid of 'via' at the beginning of the strings
df['via'] = df['via'].str.replace(r'^via', '', regex=True).str.strip()

## Cleaning column: "schedule_type"

In [21]:
df.loc[df['schedule_type'].str.contains('internship', case=False, na=False), 'schedule_type'] = 'Internship'
df.loc[df['schedule_type'].str.contains('contractor|tempwork|temp work', case=False, na=False), 'schedule_type'] = 'Contractor'
df.loc[df['schedule_type'].str.contains('full-time', case=False, na=False), 'schedule_type'] = 'Full-Time'
df.loc[df['schedule_type'].str.contains('part-time|volunteer', case=False, na=False), 'schedule_type'] = 'Part-Time'

In [22]:
df['schedule_type'].value_counts()

schedule_type
Full-Time     14276
Contractor     6617
Part-Time       250
Internship      158
Name: count, dtype: int64

## Cleaning column:  "work_from_home"

In [23]:
# Replace null values with "On-Site"
df['work_from_home'].fillna('On-Site', inplace=True)

# Replace "True" with "Remote"
df['work_from_home'].replace(True, 'Remote', inplace=True)

In [24]:
df['work_from_home'].value_counts()

work_from_home
Remote     11429
On-Site     9977
Name: count, dtype: int64

## Cleaning column:  "date_time"

In [25]:
# Transforming 'date_time' column to date format without hours
df['date_time'] = pd.to_datetime(df['date_time']).dt.date

## Cleaning column:  "salary_standardized"

In [26]:
df['salary_standardized'].describe()

count      4027.000000
mean      93823.717710
std       45625.543707
min       15080.000000
25%       62400.000000
50%       88400.000000
75%      119600.000000
max      624000.000000
Name: salary_standardized, dtype: float64

## Cleaning column: "description_tokens"

In [27]:
# Renaming the column
df.rename(columns={'description_tokens': 'skills'}, inplace=True)

In [28]:
import ast
# Convert the values (type = object) in column 'Skills' into actual lists
df['skills'] = df['skills'].apply(ast.literal_eval)

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21406 entries, 0 to 36462
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   job_id               21406 non-null  object 
 1   title                21406 non-null  object 
 2   experience_level     8417 non-null   object 
 3   title_cleaned        18805 non-null  object 
 4   role                 18805 non-null  object 
 5   company_name         21406 non-null  object 
 6   location             21392 non-null  object 
 7   via                  21405 non-null  object 
 8   description          21406 non-null  object 
 9   schedule_type        21301 non-null  object 
 10  work_from_home       21406 non-null  object 
 11  date_time            21406 non-null  object 
 12  search_location      21406 non-null  object 
 13  salary_standardized  4027 non-null   float64
 14  skills               21406 non-null  object 
dtypes: float64(1), object(14)
memory usage: 2

## Save Dataframe: Job Posts

In [30]:
df.to_csv('./data/us_jobposts.csv', index=False) 

## Create a Dataframe based on the skills

In [34]:
# Remove empty lists [] and explode the column into separate rows
skills_df = df[df['skills'].apply(lambda x: len(x) > 0)].explode('skills')
skills_df

Unnamed: 0,job_id,title,experience_level,title_cleaned,role,company_name,location,via,description,schedule_type,work_from_home,date_time,search_location,salary_standardized,skills
0,eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2...,Data Analyst,,Data Analyst,Data Analyst,Meta,USA,LinkedIn,In the intersection of compliance and analytic...,Full-Time,Remote,2023-08-04,United States,122000.0,r
0,eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2...,Data Analyst,,Data Analyst,Data Analyst,Meta,USA,LinkedIn,In the intersection of compliance and analytic...,Full-Time,Remote,2023-08-04,United States,122000.0,tableau
0,eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2...,Data Analyst,,Data Analyst,Data Analyst,Meta,USA,LinkedIn,In the intersection of compliance and analytic...,Full-Time,Remote,2023-08-04,United States,122000.0,sql
0,eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2...,Data Analyst,,Data Analyst,Data Analyst,Meta,USA,LinkedIn,In the intersection of compliance and analytic...,Full-Time,Remote,2023-08-04,United States,122000.0,python
2,eyJqb2JfdGl0bGUiOiJBZXJvbmF1dGljYWwgRGF0YSBBbm...,Aeronautical Data Analyst,,Data Analyst,Data Analyst,"Garmin International, Inc.",KS,Indeed,Overview:\n\nWe are seeking a full-time...\nAe...,Full-Time,On-Site,2023-08-04,United States,,sql
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36462,eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QgLSBSZW1vdG...,Data Analyst - Remote,,Data Analyst,Data Analyst,UnitedHealth Group,USA,My ArkLaMiss Jobs,Combine two of the fastest-growing fields on t...,Full-Time,On-Site,2023-02-14,United States,,word
36462,eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QgLSBSZW1vdG...,Data Analyst - Remote,,Data Analyst,Data Analyst,UnitedHealth Group,USA,My ArkLaMiss Jobs,Combine two of the fastest-growing fields on t...,Full-Time,On-Site,2023-02-14,United States,,go
36462,eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QgLSBSZW1vdG...,Data Analyst - Remote,,Data Analyst,Data Analyst,UnitedHealth Group,USA,My ArkLaMiss Jobs,Combine two of the fastest-growing fields on t...,Full-Time,On-Site,2023-02-14,United States,,pyspark
36462,eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QgLSBSZW1vdG...,Data Analyst - Remote,,Data Analyst,Data Analyst,UnitedHealth Group,USA,My ArkLaMiss Jobs,Combine two of the fastest-growing fields on t...,Full-Time,On-Site,2023-02-14,United States,,r


In [61]:
# Skills categorization 

tools = ["tableau","excel","power_bi","sas","word","powerpoint","sap","ssis","looker","qlik","alteryx",
    "spss","ssrs","outlook","dax","sharepoint","splunk","microstrategy","cognos","visio","google sheets","spreadsheet",
    "ms access","datarobot"]

database = ["sql server","mysql","cassandra","postgresql","mongodb","elasticsearch","dynamodb","redis","db2","neo4j",
    "mariadb","firebase","couchbase","sqlite","firestore","couchdb"]

cloud_platforms = [ "aws","azure","snowflake","databricks","redshift","gcp","oracle","bigquery","aurora",
    "vmware","ibm cloud","firebase","watson","openstack","heroku","digital ocean", "colocation","ovh","linode"]

libraries = ["spark","hadoop","kafka","airflow","pyspark","pandas","tensorflow","pytorch","numpy","scikit-learn",
    "keras","jupyter","react","matplotlib","spring","gdpr","plotly","seaborn","graphql","nltk","opencv","ggplot2",
    "selenium","mxnet","tidyverse"]

frameworks_list = ["express","node.js","angular","flask","ruby","django","vue.js","phoenix","fastapi","jquery",
    "asp.net","ruby on rails", "react","laravel","angular.js", "asp.net core", "next.js","drupal","svelte",
    "symfony","blazor","play framework",  "gatsby"]

languages = ["python","sql","r","java","scala","nosql","sas","go","mongodb","shell","javascript","c++","c#","c",
    "matlab","vba","bash","t-sql","powershell","html","perl","css","ruby","typescript"]


In [56]:
# Function to categorize skills
def categorize_skills(skill):
    lowercase_skill = skill.lower()
    if skill in tools:
        return "Tools"
    elif skill in database:
        return "Database"
    elif skill in cloud_platforms:
        return "Cloud Platforms"
    elif skill in libraries:
        return "Libraries"
    elif skill in frameworks_list:
        return "Frameworks"
    elif skill in languages:
        return "Languages"
    else:
        return "Other"

In [57]:
# Apply the categorization function to create the 'skills_categories' column
skills_df['skills_categories'] = skills_df['skills'].apply(categorize_skills)

In [60]:
skills_df.head(2)

Unnamed: 0,job_id,title,experience_level,title_cleaned,role,company_name,location,via,description,schedule_type,work_from_home,date_time,search_location,salary_standardized,skills,skills_categories
0,eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2...,Data Analyst,,Data Analyst,Data Analyst,Meta,USA,LinkedIn,In the intersection of compliance and analytic...,Full-Time,Remote,2023-08-04,United States,122000.0,r,Languages
0,eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2...,Data Analyst,,Data Analyst,Data Analyst,Meta,USA,LinkedIn,In the intersection of compliance and analytic...,Full-Time,Remote,2023-08-04,United States,122000.0,tableau,Tools


## Save Dataframe: Skills

In [63]:
skills_df.to_csv('./data/us_skills.csv', index=False) 