In [1]:
import pandas as pd
import numpy as np
import re
import os
import datetime
import matplotlib.pyplot as plt


In [2]:
df = pd.read_csv('../data/gsearch_jobs.csv')

In [3]:
df.shape

(36470, 27)

In [4]:
df.sample(3)

Unnamed: 0.1,Unnamed: 0,index,title,company_name,location,via,description,extensions,job_id,thumbnail,...,commute_time,salary_pay,salary_rate,salary_avg,salary_min,salary_max,salary_hourly,salary_yearly,salary_standardized,description_tokens
27593,27593,425,Jr. Data Analyst,Dice,Anywhere,via LinkedIn,Dice is the leading career destination for tec...,"['7 hours ago', 'Work from home', 'Full-time']",eyJqb2JfdGl0bGUiOiJKci4gRGF0YSBBbmFseXN0IiwiaH...,https://encrypted-tbn0.gstatic.com/images?q=tb...,...,,,,,,,,,,"['sql', 'excel']"
3559,3559,1487,Marketing Analyst,On-Board Companies,United States,via BeBee,On-Board Services is hiring a Marketing Analys...,"['23 hours ago', 'Full-time and Contractor', '...",eyJqb2JfdGl0bGUiOiJNYXJrZXRpbmcgQW5hbHlzdCIsIm...,https://encrypted-tbn0.gstatic.com/images?q=tb...,...,,,,,,,,,,['sharepoint']
20138,20138,718,Operations Data Analyst,Google Inc,"Kansas City, KS",via Jora,"At GFiber, we believe in the power of great in...","['13 hours ago', 'Full-time', 'No degree menti...",eyJqb2JfdGl0bGUiOiJPcGVyYXRpb25zIERhdGEgQW5hbH...,,...,,,,,,,,,,"['python', 'excel', 'sql']"


## Drop columns

In [5]:
columns_to_drop = ['Unnamed: 0','index','thumbnail','posted_at','commute_time','search_location','search_term','salary','salary_pay','salary_rate','salary_min','salary_max','salary_avg','salary_hourly','salary_yearly','extensions']
df.drop(columns=columns_to_drop, inplace=True)

## Drop duplicated rows

In [6]:
# Find duplicates based on 'title' and 'company' columns (in this case cannot use job_id as in some cases any minimal difference in time of publication gives a new job_id)
duplicate_mask = df.duplicated(subset=['title', 'company_name'], keep=False)

# Get both original and duplicate rows for double-check
original_and_duplicates = df[duplicate_mask].sort_values(by=['title', 'company_name'])
original_and_duplicates.sample(3)

Unnamed: 0,title,company_name,location,via,description,job_id,schedule_type,work_from_home,date_time,salary_standardized,description_tokens
28628,Experience Analyst,Inclusively,United States,via LinkedIn,Inclusively is partnering with a global creati...,eyJqb2JfdGl0bGUiOiJFeHBlcmllbmNlIEFuYWx5c3QiLC...,Full-time,,2023-02-07 04:00:33.888886,,"['powerpoint', 'tableau', 'sql', 'excel']"
12626,Data Analyst,Upwork,United States,via BeBee,How to Become a Freelance Data Analyst\n\nThe ...,eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2...,Part-time,,2023-06-12 03:01:03.756553,,"['tableau', 'python', 'go', 'excel', 'sql', 'p..."
5036,"Medical Informatics Analyst, Senior",Blue Shield of California,United States,via BeBee,Your Role\n\nThe Data & Analytics (D&A) team f...,eyJqb2JfdGl0bGUiOiJNZWRpY2FsIEluZm9ybWF0aWNzIE...,Full-time,,2023-08-05 03:00:27.551956,,"['jira', 'tableau', 'python', 'r', 'sql', 'sas']"


In [7]:
# Drop the duplicate rows
df.drop_duplicates(subset=['title', 'company_name'], keep='first', inplace=True)

## Creating a New Column: "experience_level"

In [8]:
def categorize_experience(title):
    title_lower = title.lower()
    if any(word in title_lower for word in [ 'intern','internship']):
        return 'Internship'
    elif any(word in title_lower for word in ['manager', 'executive', 'principal', 'staff']):
        return 'Manager'
    elif any(word in title_lower for word in ['director']):
        return 'Director'
    elif any(word in title_lower for word in ['mid-senior','mid level','mid-level','ii','specialist']):
        return 'Mid-Senior'
    elif any(word in title_lower for word in ['jr', 'jr.','junior','entry', 'associate','1']):
        return 'Junior'
    elif any(word in title_lower for word in ['sr.', 'sr', 'senior','exper','3','4','lead','steward']):
        return 'Senior'
    elif any(word in title for word in ['III','IV']):
        return 'Senior'
    elif 'II' in title:
        return 'Mid-Senior'
    elif re.search(r'\bI\b', title):
        return 'Junior'
    else:
        return None

In [9]:
# Apply the function to create the 'experience_level' column
df['experience_level'] = df['title'].apply(categorize_experience)

In [10]:
df['experience_level'].value_counts()

experience_level
Senior        4796
Junior        1250
Mid-Senior    1221
Manager        778
Internship     286
Director        86
Name: count, dtype: int64

In [11]:
df['experience_level'].fillna('Not-Specified', inplace=True)

## Cleaning column: "title"

In [12]:
def clean_title(title):
    title_lower = title.lower()
    if 'machine' in title_lower:
        return 'Machine Learning Engineer'
    elif 'artificial' in title_lower or 'AI' in title or 'ML' in title:
        return 'Machine Learning Engineer'
    elif 'cloud' in title_lower:
        return 'Cloud Engineer'
    elif 'software' in title_lower:
        return 'Software Engineer'
    elif 'dev' in title_lower:
        return 'Software Developer'
    elif re.search(r'\bBI\b', title) or re.search(r'\bbusiness\s*(?!data\s*)\w*\s*analyst\b|\bbusiness\s*(?!data\s*)\w*\s*intelligence\b|\bfinancial\s*analyst\b', title_lower):
        return 'Business Analyst'
    elif 'engineer' in title_lower and 'data' in title_lower:
        return 'Data Engineer'
    elif 'software' in title_lower:
        return 'Software Engineer'
    elif 'scien' in title_lower:
        return 'Data Scientist'
    elif re.search(r'\bdata\b|\banalytics?\b', title_lower):
        return 'Data Analyst'
    elif re.search(r'\bdata\b.*\b(analyst|analytics|analysis)\b|\banalytics?\b', title_lower):
        return 'Data Analyst'
    elif 'tableau' in title_lower or 'R' in title or 'forecast' in title_lower or 'excel' in title_lower or 'trends' in title_lower or 'regression' in title_lower or 'intelligence' in title_lower or 'digital' in title_lower or 'spss' in title_lower or 'numpy' in title_lower or 'data' in title_lower :
        return 'Data Analyst'
    elif 'system' in title_lower:
        return 'Software Engineer'
    elif 'program' in title_lower:
        return 'Software Engineer'
    elif 'backend' in title_lower:
        return 'Software Engineer'
    elif 'frontend' in title_lower:
        return 'Software Engineer'
    elif 'full' in title_lower:
        return 'Software Engineer'
    
    else:
        return None

In [13]:
# Apply the function and create the 'title_cleaned' column in clear data role categories 
df.loc[:, 'title_cleaned'] = df['title'].apply(clean_title)

In [14]:
df['title_cleaned'].value_counts()

title_cleaned
Data Analyst                 16075
Business Analyst              1376
Data Scientist                1138
Data Engineer                  599
Software Developer             380
Software Engineer              140
Machine Learning Engineer      136
Cloud Engineer                  71
Name: count, dtype: int64

In [15]:
df['title_cleaned'].isnull().sum()

1491

In [16]:
df.dropna(subset=['title_cleaned'], inplace=True)
df.shape

(19915, 13)

## Creating a New Column: "Role"

In [17]:
def get_role_name(row):
    title_cleaned = row['title_cleaned']
    experience_level = row['experience_level']
    
    if experience_level == "Not-Specified":
        return title_cleaned
    elif title_cleaned is None:
        return None
    else:
        return f"{title_cleaned} {experience_level}"


In [18]:
df['role'] = df.apply(get_role_name, axis=1)

## Cleaning column: "location"

In [19]:
# Dictionary of state names and abbreviations
state_name_to_abbreviation = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA', 'Colorado': 'CO',
    'Connecticut': 'CT', 'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID',
    'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA',
    'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN',
    'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV',
    'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC',
    'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA',
    'Rhode Island': 'RI', 'South Carolina': 'SC', 'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX',
    'Utah': 'UT', 'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI',
    'Wyoming': 'WY'
}

# Function to replace state names with abbreviations
def replace_state_with_abbreviation(text):
    if pd.isna(text):
        return text  
    # Removing additional text after the state abbreviation in ()
    text = re.sub(r'\s*\(\+\d+\s*other[s]?\)', '', str(text))
    for state_name, state_abbr in state_name_to_abbreviation.items():
        if state_name.lower() in text.lower():
            return state_abbr
    # Return non-state values as is
    return text  

# Extract information after comma and removing leading andtrailing spaces
df['location'] = df['location'].fillna(np.nan)  # Fill NaN values explicitly
df['location'] = df['location'].str.split(',').str[-1].str.strip()

# Replace 'Anywhere' with 'United States' 
df['location'] = df['location'].str.replace('Anywhere', 'United States', case=False)

# Update the entire column to 'USA' if it contains the phrase 'United States'
df.loc[df['location'].str.contains('United States', case=False, na=False), 'location'] = 'USA'

# Replace state names with abbreviations
df['location'] = df['location'].apply(replace_state_with_abbreviation)

In [20]:
df['location'].isnull().sum()

14

In [21]:
# Evaluated case by case because they were just 14
df['location'].fillna('KS', inplace=True)

## Cleaning column: "via"

In [22]:
df['via'].isnull().sum()

1

In [23]:
null_locations = df[df['via'].isnull()]
null_locations

Unnamed: 0,title,company_name,location,via,description,job_id,schedule_type,work_from_home,date_time,salary_standardized,description_tokens,experience_level,title_cleaned,role
155,Data Analyst,Jobs Near Me,MO,,"Data Analyst Jobs Near Me in Joplin, Missouri\...",eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2...,Full-time,,2023-08-17 03:00:27.163001,31200.0,"['javascript', 'sql', 'sas', 'excel', 'spss']",Not-Specified,Data Analyst,Data Analyst


In [24]:
df['via'].fillna('Linkedin', inplace=True)

In [25]:
# Getting rid of 'via' at the beginning of the strings
df['via'] = df['via'].str.replace(r'^via', '', regex=True).str.strip()

In [26]:
# Top 5 job search websites
df['via'].value_counts().head(5)

via
LinkedIn       7326
Upwork         3505
BeBee          2281
Trabajo.org    1466
Indeed         1169
Name: count, dtype: int64

## Cleaning column: "schedule_type"

In [27]:
df.loc[df['schedule_type'].str.contains('internship', case=False, na=False), 'schedule_type'] = 'Internship'
df.loc[df['schedule_type'].str.contains('contractor|tempwork|temp work', case=False, na=False), 'schedule_type'] = 'Contractor'
df.loc[df['schedule_type'].str.contains('full-time', case=False, na=False), 'schedule_type'] = 'Full-time'
df.loc[df['schedule_type'].str.contains('part-time|volunteer', case=False, na=False), 'schedule_type'] = 'Part-time'

In [28]:
df['schedule_type'].fillna('Full-time', inplace=True)

## Cleaning column:  "work_from_home"

In [29]:
# Create a new column 'Work-Modality' based on modifications in 'work_from_home'
df['work_modality'] = df['work_from_home'].fillna('On-Site')
df['work_modality'].replace(True, 'Remote', inplace=True)

In [30]:
df['work_modality'].value_counts()

work_modality
Remote     10310
On-Site     9605
Name: count, dtype: int64

## Cleaning column:  "date_time"

In [31]:
# Transforming 'date_time' column to date format without hours
df['date_time'] = pd.to_datetime(df['date_time']).dt.date

In [32]:
df.rename(columns={'date_time': 'posted_date'}, inplace=True)

## Cleaning column:  "salary_standardized"

In [33]:
df['salary_standardized'].describe()

count      3562.000000
mean      94203.002925
std       45405.587652
min       15080.000000
25%       62400.000000
50%       88400.000000
75%      119600.000000
max      624000.000000
Name: salary_standardized, dtype: float64

## Cleaning column: "description_tokens"

In [34]:
# Renaming the column
df.rename(columns={'description_tokens': 'extracted_skills'}, inplace=True)

## description

In [35]:
import unidecode
# Replace - jumps with spaces
df['description'] = df['description'].str.replace('\n', ' ')
# Delete the hyphens & ¿
df['description'] = df['description'].map(lambda x: x.replace('-', '').replace('¿', ''))
# Normalise words (no accents, etc.)
df['description'] = df['description'].map(lambda x: unidecode.unidecode(x))
# Adding spaces in a word if it contains a capitalised letter in between
df['description'] = df['description'].map(lambda x: re.sub(r"(?<![A-Z])(?<!^)([A-Z])",r" \1", x))
# Adding spaces before and after numbers
df['description'] = df['description'].map(lambda x: re.sub(r"([0-9]+(\.[0-9]+)?)", r" \1", x))
# Substitute n number of spaces by just one space
df['description'] = df['description'].map(lambda x: ' '.join(x.split()))
# Remove space before dots and after opening parenthesis
df['description'] = df['description'].map(lambda x: x.replace('( ', '(').replace(' .', '.').replace('/ ', '/'))
# Fix problems with the normalisation
df['description'] = df['description'].map(lambda x: x.replace('ano', 'año').replace('anos', 'años'))

## Save Dataframe: Job Posts

In [36]:
df.columns

Index(['title', 'company_name', 'location', 'via', 'description', 'job_id',
       'schedule_type', 'work_from_home', 'posted_date', 'salary_standardized',
       'extracted_skills', 'experience_level', 'title_cleaned', 'role',
       'work_modality'],
      dtype='object')

In [37]:
columns_to_drop = ['work_from_home']
df.drop(columns=columns_to_drop, inplace=True)

In [38]:
# Rearrenged columns
new_order = ['job_id','title','experience_level','title_cleaned','role','work_modality'] + [col for col in df.columns if col not in ['job_id','title', 'experience_level','title_cleaned','role','work_modality']]
df = df[new_order]

In [39]:
df.to_csv('../data/us_jobposts.csv', index=False) 

## Create a Dataframe based on the skills

In [40]:
df.head(1)

Unnamed: 0,job_id,title,experience_level,title_cleaned,role,work_modality,company_name,location,via,description,schedule_type,posted_date,salary_standardized,extracted_skills
0,eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2...,Data Analyst,Not-Specified,Data Analyst,Data Analyst,Remote,Meta,USA,LinkedIn,In the intersection of compliance and analytic...,Full-time,2023-08-04,122000.0,"['r', 'tableau', 'sql', 'python']"


In [41]:
import ast
# Convert the values (type = object) in column 'Skills' into actual lists
df['extracted_skills'] = df['extracted_skills'].apply(ast.literal_eval)

In [42]:
df.head(1)

Unnamed: 0,job_id,title,experience_level,title_cleaned,role,work_modality,company_name,location,via,description,schedule_type,posted_date,salary_standardized,extracted_skills
0,eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2...,Data Analyst,Not-Specified,Data Analyst,Data Analyst,Remote,Meta,USA,LinkedIn,In the intersection of compliance and analytic...,Full-time,2023-08-04,122000.0,"[r, tableau, sql, python]"


In [44]:
# Remove empty lists [] and explode the column into separate rows
skills_df = df[df['extracted_skills'].apply(lambda x: len(x) > 0)].explode('extracted_skills')
skills_df.sample(2)

Unnamed: 0,job_id,title,experience_level,title_cleaned,role,work_modality,company_name,location,via,description,schedule_type,posted_date,salary_standardized,extracted_skills
29690,eyJqb2JfdGl0bGUiOiJJbnN1cmFuY2UgLSBEYXRhIEFuYW...,Insurance - Data Analyst - REMOTE,Not-Specified,Data Analyst,Data Analyst,Remote,Work At Home Vintage Experts LLC,USA,LinkedIn,Put your Insurance Experience to work - FROM H...,Contractor,2023-04-08,,css
18662,eyJqb2JfdGl0bGUiOiJEYXRhIGFuYWx5c3QiLCJodGlkb2...,Data analyst,Not-Specified,Data Analyst,Data Analyst,On-Site,"Berean Group International, Inc.",KS,Talent.com,Berean Group is seeking a Remote Data Analyst ...,Full-time,2022-11-04,,sql


In [45]:
# Skills categorization 

tools = ["tableau","excel","power_bi","sas","word","powerpoint","sap","ssis","looker","qlik","alteryx",
    "spss","ssrs","outlook","dax","sharepoint","splunk","microstrategy","cognos","visio","google sheets","spreadsheet",
    "ms access","datarobot"]

database = ["sql server","mysql","cassandra","postgresql","mongodb","elasticsearch","dynamodb","redis","db2","neo4j",
    "mariadb","firebase","couchbase","sqlite","firestore","couchdb"]

cloud_platforms = [ "aws","azure","snowflake","databricks","redshift","gcp","oracle","bigquery","aurora",
    "vmware","ibm cloud","firebase","watson","openstack","heroku","digital ocean", "colocation","ovh","linode"]

libraries = ["spark","hadoop","kafka","airflow","pyspark","pandas","tensorflow","pytorch","numpy","scikit-learn",
    "keras","jupyter","react","matplotlib","spring","gdpr","plotly","seaborn","graphql","nltk","opencv","ggplot2",
    "selenium","mxnet","tidyverse"]

frameworks_list = ["express","node.js","angular","flask","ruby","django","vue.js","phoenix","fastapi","jquery",
    "asp.net","ruby on rails", "react","laravel","angular.js", "asp.net core", "next.js","drupal","svelte",
    "symfony","blazor","play framework",  "gatsby"]

languages = ["python","sql","r","java","scala","nosql","sas","go","mongodb","shell","javascript","c++","c#","c",
    "matlab","vba","bash","t-sql","powershell","html","perl","css","ruby","typescript"]


In [46]:
# Function to categorize skills
def categorize_skills(skill):
    lowercase_skill = skill.lower()
    if skill in tools:
        return "Tools"
    elif skill in database:
        return "Database"
    elif skill in cloud_platforms:
        return "Cloud Platforms"
    elif skill in libraries:
        return "Libraries"
    elif skill in frameworks_list:
        return "Frameworks"
    elif skill in languages:
        return "Languages"
    else:
        return "Other"

In [47]:
# Apply the categorization function to create the 'skills_categories' column
skills_df['skills_categories'] = skills_df['extracted_skills'].apply(categorize_skills)

In [62]:
skills_df.head(10)

Unnamed: 0,job_id,title,experience_level,title_cleaned,role,work_modality,company_name,location,via,description,schedule_type,posted_date,salary_standardized,extracted_skills,skills_categories
0,eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2...,Data Analyst,Not-Specified,Data Analyst,Data Analyst,Remote,Meta,USA,LinkedIn,In the intersection of compliance and analytic...,Full-time,2023-08-04,122000.0,r,Languages
0,eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2...,Data Analyst,Not-Specified,Data Analyst,Data Analyst,Remote,Meta,USA,LinkedIn,In the intersection of compliance and analytic...,Full-time,2023-08-04,122000.0,tableau,Tools
0,eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2...,Data Analyst,Not-Specified,Data Analyst,Data Analyst,Remote,Meta,USA,LinkedIn,In the intersection of compliance and analytic...,Full-time,2023-08-04,122000.0,sql,Languages
0,eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2...,Data Analyst,Not-Specified,Data Analyst,Data Analyst,Remote,Meta,USA,LinkedIn,In the intersection of compliance and analytic...,Full-time,2023-08-04,122000.0,python,Languages
2,eyJqb2JfdGl0bGUiOiJBZXJvbmF1dGljYWwgRGF0YSBBbm...,Aeronautical Data Analyst,Not-Specified,Data Analyst,Data Analyst,On-Site,"Garmin International, Inc.",KS,Indeed,Overview: We are seeking a fulltime... Aeronau...,Full-time,2023-08-04,,sql,Languages
3,eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QgLSBDb25zdW...,Data Analyst - Consumer Goods - Contract to Hire,Not-Specified,Data Analyst,Data Analyst,Remote,Upwork,USA,Upwork,Enthusiastic Data Analyst for processing sales...,Contractor,2023-08-04,41600.0,powerpoint,Tools
3,eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QgLSBDb25zdW...,Data Analyst - Consumer Goods - Contract to Hire,Not-Specified,Data Analyst,Data Analyst,Remote,Upwork,USA,Upwork,Enthusiastic Data Analyst for processing sales...,Contractor,2023-08-04,41600.0,power_bi,Tools
3,eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QgLSBDb25zdW...,Data Analyst - Consumer Goods - Contract to Hire,Not-Specified,Data Analyst,Data Analyst,Remote,Upwork,USA,Upwork,Enthusiastic Data Analyst for processing sales...,Contractor,2023-08-04,41600.0,excel,Tools
4,eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QgfCBXb3JrZm...,Data Analyst | Workforce Management,Not-Specified,Data Analyst,Data Analyst,On-Site,Krispy Kreme,USA,LinkedIn,Overview of Position This position will be the...,Contractor,2023-08-04,100000.0,powerpoint,Tools
4,eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QgfCBXb3JrZm...,Data Analyst | Workforce Management,Not-Specified,Data Analyst,Data Analyst,On-Site,Krispy Kreme,USA,LinkedIn,Overview of Position This position will be the...,Contractor,2023-08-04,100000.0,outlook,Tools


## Save Dataframe: Skills

In [None]:
skills_df.to_csv('../data/us_skills.csv', index=False) 