In [1]:
import pandas as pd
import numpy as np
import re
import os
import datetime
import matplotlib.pyplot as plt


In [2]:
df = pd.read_csv('../data/gsearch_jobs.csv')

In [3]:
df.shape

(36470, 27)

In [4]:
df.sample(3)

Unnamed: 0.1,Unnamed: 0,index,title,company_name,location,via,description,extensions,job_id,thumbnail,...,commute_time,salary_pay,salary_rate,salary_avg,salary_min,salary_max,salary_hourly,salary_yearly,salary_standardized,description_tokens
28996,28996,1828,Business Intelligence Analyst,Allied OneSource,"Olathe, KS",via LinkedIn,Do you have a curious mind where you are able ...,"['11 hours ago', 'Full-time']",eyJqb2JfdGl0bGUiOiJCdXNpbmVzcyBJbnRlbGxpZ2VuY2...,https://encrypted-tbn0.gstatic.com/images?q=tb...,...,,,,,,,,,,['sql']
4480,4480,282,Data Scientist,SP Software Solutions,Anywhere,via LinkedIn,Position: Data Scientist\n\nLocation: Santa Cl...,"['12 hours ago', 'Work from home', 'Contractor']",eyJqb2JfdGl0bGUiOiJEYXRhIFNjaWVudGlzdCIsImh0aW...,https://encrypted-tbn0.gstatic.com/images?q=tb...,...,,,,,,,,,,"['pytorch', 'tensorflow', 'keras', 'python']"
26999,26999,3063,Data Entry Clerk II,LeadStack Inc.,United States,via LinkedIn,**Please note that *** requires a *** per hour...,"['5 hours ago', 'Full-time', 'No degree mentio...",eyJqb2JfdGl0bGUiOiJEYXRhIEVudHJ5IENsZXJrIElJIi...,https://encrypted-tbn0.gstatic.com/images?q=tb...,...,,,,,,,,,,[]


## Drop columns

In [5]:
columns_to_drop = ['Unnamed: 0','index','thumbnail','posted_at','commute_time','search_term','salary','salary_pay','salary_rate','salary_min','salary_max','salary_avg','salary_hourly','salary_yearly','extensions']
df.drop(columns=columns_to_drop, inplace=True)

## Drop duplicated rows

In [6]:
# Find duplicates based on 'title' and 'company' columns (in this case cannot use job_id as in some cases any minimal difference in time of publication gives a new job_id)
duplicate_mask = df.duplicated(subset=['title', 'company_name'], keep=False)

# Get both original and duplicate rows for double-check
original_and_duplicates = df[duplicate_mask].sort_values(by=['title', 'company_name'])
original_and_duplicates.sample(3)

Unnamed: 0,title,company_name,location,via,description,job_id,schedule_type,work_from_home,date_time,search_location,salary_standardized,description_tokens
11973,Business Systems Data Analyst,MTC Holding Corporation,"Shawnee, KS",via Vacancies For Col U Fans,Description:\n\nPurpose...\n\nThe Business Sys...,eyJqb2JfdGl0bGUiOiJCdXNpbmVzcyBTeXN0ZW1zIERhdG...,Full-time,,2023-06-22 03:00:25.924344,United States,,[]
31389,Data Conversion Analyst,ABC Fitness Solutions,Anywhere,via LinkedIn,Company : ABC Fitness Solutions\n\nIt's fun to...,eyJqb2JfdGl0bGUiOiJEYXRhIENvbnZlcnNpb24gQW5hbH...,Full-time,True,2022-12-15 04:00:39.906292,United States,,"['excel', 'sql']"
21384,Healthcare Data Analyst,Gainwell Technologies LLC,United States,via BeBee,Be part of a team that unleashes the power of ...,eyJqb2JfdGl0bGUiOiJIZWFsdGhjYXJlIERhdGEgQW5hbH...,Full-time,,2023-02-11 04:00:36.927315,United States,,['sql']


In [7]:
# Drop the duplicate rows
df.drop_duplicates(subset=['title', 'company_name'], keep='first', inplace=True)

## Creating a New Column: "experience_level"

In [8]:
def categorize_experience(title):
    title_lower = title.lower()
    if any(word in title_lower for word in [ 'intern','internship']):
        return 'Internship'
    elif any(word in title_lower for word in ['manager', 'executive', 'principal', 'staff']):
        return 'Manager'
    elif any(word in title_lower for word in ['director']):
        return 'Director'
    elif any(word in title_lower for word in ['mid-senior','mid level','mid-level','ii','specialist']):
        return 'Mid-Senior'
    elif any(word in title_lower for word in ['jr', 'jr.','junior','entry', 'associate','1']):
        return 'Junior'
    elif any(word in title_lower for word in ['sr.', 'sr', 'senior','exper','3','4','lead','steward']):
        return 'Senior'
    elif any(word in title for word in ['III','IV']):
        return 'Senior'
    elif 'II' in title:
        return 'Mid-Senior'
    elif re.search(r'\bI\b', title):
        return 'Junior'
    else:
        return None

In [9]:
# Apply the function to create the 'experience_level' column
df['experience_level'] = df['title'].apply(categorize_experience)

In [10]:
df['experience_level'].value_counts()

experience_level
Senior        4796
Junior        1250
Mid-Senior    1221
Manager        778
Internship     286
Director        86
Name: count, dtype: int64

In [11]:
df['experience_level'].fillna('Not-Specified', inplace=True)

## Cleaning column: "title"

In [12]:
def clean_title(title):
    title_lower = title.lower()
    if 'machine' in title_lower:
        return 'Machine Learning Engineer'
    elif 'artificial' in title_lower or 'AI' in title or 'ML' in title:
        return 'Machine Learning Engineer'
    elif 'cloud' in title_lower:
        return 'Cloud Engineer'
    elif 'software' in title_lower:
        return 'Software Engineer'
    elif 'dev' in title_lower:
        return 'Software Engineer'
    elif re.search(r'\bBI\b', title) or re.search(r'\bbusiness\s*(?!data\s*)\w*\s*analyst\b|\bbusiness\s*(?!data\s*)\w*\s*intelligence\b|\bfinancial\s*analyst\b', title_lower):
        return 'Business Analyst'
    elif 'engineer' in title_lower and 'data' in title_lower:
        return 'Data Engineer'
    elif 'software' in title_lower:
        return 'Software Engineer'
    elif 'scien' in title_lower:
        return 'Data Scientist'
    elif re.search(r'\bdata\b|\banalytics?\b', title_lower):
        return 'Data Analyst'
    elif re.search(r'\bdata\b.*\b(analyst|analytics|analysis)\b|\banalytics?\b', title_lower):
        return 'Data Analyst'
    elif 'tableau' in title_lower or 'R' in title or 'forecast' in title_lower or 'excel' in title_lower or 'trends' in title_lower or 'regression' in title_lower or 'intelligence' in title_lower or 'digital' in title_lower or 'spss' in title_lower or 'numpy' in title_lower or 'data' in title_lower :
        return 'Data Analyst'
    elif 'system' in title_lower:
        return 'Software Engineer'
    elif 'program' in title_lower:
        return 'Software Engineer'
    elif 'backend' in title_lower:
        return 'Software Engineer'
    elif 'frontend' in title_lower:
        return 'Software Engineer'
    elif 'full' in title_lower:
        return 'Software Engineer'
    
    else:
        return None

In [13]:
# Apply the function and create the 'title_cleaned' column in clear data role categories 
df.loc[:, 'title_cleaned'] = df['title'].apply(clean_title)

In [14]:
df['title_cleaned'].value_counts()

title_cleaned
Data Analyst                 16075
Business Analyst              1376
Data Scientist                1138
Data Engineer                  599
Software Engineer              520
Machine Learning Engineer      136
Cloud Engineer                  71
Name: count, dtype: int64

In [15]:
df['title_cleaned'].isnull().sum()

1491

In [16]:
df.dropna(subset=['title_cleaned'], inplace=True)
df.shape

(19915, 14)

## Creating a New Column: "Role"

In [17]:
def get_role_name(row):
    title_cleaned = row['title_cleaned']
    experience_level = row['experience_level']
    
    if experience_level == "Not-Specified":
        return title_cleaned
    elif title_cleaned is None:
        return None
    else:
        return f"{title_cleaned} {experience_level}"


In [18]:
df['role'] = df.apply(get_role_name, axis=1)

## Cleaning column: "description_tokens"

In [19]:
# Renaming the column
df.rename(columns={'description_tokens': 'extracted_skills'}, inplace=True)

## Cleaning column: "location"

In [20]:
df.rename(columns={'location': 'original_location'}, inplace=True)

In [21]:
# Dictionary of state names and abbreviations
state_name_to_abbreviation = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA', 'Colorado': 'CO',
    'Connecticut': 'CT', 'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID',
    'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA',
    'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN',
    'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV',
    'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC',
    'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA',
    'Rhode Island': 'RI', 'South Carolina': 'SC', 'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX',
    'Utah': 'UT', 'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI',
    'Wyoming': 'WY'
}

# Function to replace state names with abbreviations
def replace_state_with_abbreviation(text):
    if pd.isna(text):
        return text  
    # Removing additional text after the state abbreviation in ()
    text = re.sub(r'\s*\(\+\d+\s*other[s]?\)', '', str(text))
    for state_name, state_abbr in state_name_to_abbreviation.items():
        if state_name.lower() in text.lower():
            return state_abbr
    # Return non-state values as is
    return text  

# Extract information after comma and removing leading andtrailing spaces
df['original_location'] = df['original_location'].fillna(np.nan)  # Fill NaN values explicitly
df['original_location'] = df['original_location'].str.split(',').str[-1].str.strip()

# Update the entire column to 'Not Specified' if it contains the phrase 'United States'
df.loc[df['original_location'].str.contains('United States', case=False, na=False), 'original_location'] = 'Not-Specified'

# Replace state names with abbreviations
df['original_location'] = df['original_location'].apply(replace_state_with_abbreviation)

In [22]:
df['original_location'].isnull().sum()

14

In [23]:
# Evaluated case by case because they were just 14
df['original_location'].fillna('KS', inplace=True)

In [24]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd
import re

# Ensure NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')

# Set of English stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mairagutierrez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mairagutierrez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
# Define the function to tokenize descriptions and remove stopwords
def tokenize_and_remove_stopwords(description):
    tokens = word_tokenize(description.lower())
    return [token for token in tokens if token.isalpha() and token not in stop_words]

# Function to find the first state in a text
def find_first_state(text):
    for state in state_name_to_abbreviation.keys():
        if re.search(r'\b' + state + r'\b', text):
            return state_name_to_abbreviation[state]
    return None

In [26]:
# Tokenize descriptions and remove stopwords
df['description_tokens'] = df['description'].apply(tokenize_and_remove_stopwords)

# Find the first state in the tokenized descriptions and store it in a new column
df['location'] = df['description'].apply(find_first_state)

In [27]:
# Replace null values in 'location' with values from 'original_location'
df['location'].fillna(df['original_location'], inplace=True)

In [28]:
state_abbreviation_to_name = {
    'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas', 'CA': 'California', 'CO': 'Colorado',
    'CT': 'Connecticut','DC':'Washington DC', 'DE': 'Delaware', 'FL': 'Florida', 'GA': 'Georgia', 'HI': 'Hawaii', 'ID': 'Idaho',
    'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa', 'KS': 'Kansas', 'KY': 'Kentucky', 'LA': 'Louisiana',
    'ME': 'Maine', 'MD': 'Maryland', 'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota',
    'MS': 'Mississippi', 'MO': 'Missouri', 'MT': 'Montana', 'NE': 'Nebraska', 'NV': 'Nevada',
    'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico', 'NY': 'New York', 'NC': 'North Carolina',
    'ND': 'North Dakota', 'OH': 'Ohio', 'OK': 'Oklahoma', 'OR': 'Oregon', 'PA': 'Pennsylvania',
    'RI': 'Rhode Island', 'SC': 'South Carolina', 'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas',
    'UT': 'Utah', 'VT': 'Vermont', 'VA': 'Virginia', 'WA': 'Washington', 'WV': 'West Virginia',
    'WI': 'Wisconsin', 'WY': 'Wyoming'
}

def replace_state_abbreviations(location):
    if location.upper() in state_abbreviation_to_name:
        return state_abbreviation_to_name[location.upper()]
    else:
        return location

In [29]:
df['location'] = df['location'].apply(replace_state_abbreviations)

In [30]:
df.head(2)

Unnamed: 0,title,company_name,original_location,via,description,job_id,schedule_type,work_from_home,date_time,search_location,salary_standardized,extracted_skills,experience_level,title_cleaned,role,description_tokens,location
0,Data Analyst,Meta,Anywhere,via LinkedIn,In the intersection of compliance and analytic...,eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2...,Full-time,True,2023-08-04 03:00:13.797776,United States,122000.0,"['r', 'tableau', 'sql', 'python']",Not-Specified,Data Analyst,Data Analyst,"[intersection, compliance, analytics, seeking,...",Anywhere
1,Data Analyst,ATC,Not-Specified,via LinkedIn,Job Title: Entry Level Business Analyst / Prod...,eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2...,Full-time,,2023-08-04 03:00:13.797776,United States,,[],Not-Specified,Data Analyst,Data Analyst,"[job, title, entry, level, business, analyst, ...",Not-Specified


In [31]:
df['location'].unique()

array(['Anywhere', 'Not-Specified', 'Kansas', 'Oklahoma', 'Arkansas',
       'Missouri', 'Illinois', 'Florida', 'Hawaii', 'New York', 'Alaska',
       'Ohio', 'California', 'Indiana', 'Pennsylvania', 'New Jersey',
       'Arizona', 'New Mexico', 'Massachusetts', 'Idaho', 'Washington',
       'Texas', 'Maryland', 'Alabama', 'Georgia', 'Connecticut', 'Maine',
       'North Carolina', 'Rhode Island', 'Minnesota', 'North Dakota',
       'Oregon', 'Virginia', 'Colorado', 'Michigan', 'Nevada', 'Nebraska',
       'Utah', 'South Carolina', 'Louisiana', 'Delaware', 'Wisconsin',
       'South Dakota', 'Mississippi', 'Iowa', 'Kentucky', 'Vermont',
       'Tennessee', 'Wyoming', 'Montana', 'New Hampshire',
       'Washington DC'], dtype=object)

## Cleaning column: "via"

In [32]:
df['via'].isnull().sum()

1

In [33]:
null_locations = df[df['via'].isnull()]
null_locations

Unnamed: 0,title,company_name,original_location,via,description,job_id,schedule_type,work_from_home,date_time,search_location,salary_standardized,extracted_skills,experience_level,title_cleaned,role,description_tokens,location
155,Data Analyst,Jobs Near Me,MO,,"Data Analyst Jobs Near Me in Joplin, Missouri\...",eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2...,Full-time,,2023-08-17 03:00:27.163001,United States,31200.0,"['javascript', 'sql', 'sas', 'excel', 'spss']",Not-Specified,Data Analyst,Data Analyst,"[data, analyst, jobs, near, joplin, missouri, ...",Missouri


In [34]:
df['via'].fillna('Linkedin', inplace=True)

In [35]:
# Getting rid of 'via' at the beginning of the strings
df['via'] = df['via'].str.replace(r'^via', '', regex=True).str.strip()

In [36]:
# Top 5 job search websites
df['via'].value_counts().head(5)

via
LinkedIn       7326
Upwork         3505
BeBee          2281
Trabajo.org    1466
Indeed         1169
Name: count, dtype: int64

## Cleaning column: "schedule_type"

In [37]:
df.loc[df['schedule_type'].str.contains('internship', case=False, na=False), 'schedule_type'] = 'Internship'
df.loc[df['schedule_type'].str.contains('contractor|tempwork|temp work', case=False, na=False), 'schedule_type'] = 'Contractor'
df.loc[df['schedule_type'].str.contains('full-time', case=False, na=False), 'schedule_type'] = 'Full-time'
df.loc[df['schedule_type'].str.contains('part-time|volunteer', case=False, na=False), 'schedule_type'] = 'Part-time'

In [38]:
df['schedule_type'].fillna('Full-time', inplace=True)

## Cleaning column:  "work_from_home"

In [39]:
# Create a new column 'Work-Modality' based on modifications in 'work_from_home'
df['work_modality'] = df['work_from_home'].fillna('On-Site')
df['work_modality'].replace(True, 'Remote', inplace=True)

In [40]:
df['work_modality'].value_counts()

work_modality
Remote     10310
On-Site     9605
Name: count, dtype: int64

## Cleaning column:  "date_time"

In [41]:
# Transforming 'date_time' column to date format without hours
df['date_time'] = pd.to_datetime(df['date_time']).dt.date

In [42]:
df.rename(columns={'date_time': 'posted_date'}, inplace=True)

## Cleaning column:  "salary_standardized"

In [43]:
df['salary_standardized'].describe()

count      3562.000000
mean      94203.002925
std       45405.587652
min       15080.000000
25%       62400.000000
50%       88400.000000
75%      119600.000000
max      624000.000000
Name: salary_standardized, dtype: float64

## Make uniform my datasets

In [44]:
columns_to_drop = ['work_from_home','description','original_location','description_tokens']
df.drop(columns=columns_to_drop, inplace=True)

In [45]:
# Rearrenged columns
new_order = ['job_id','title','experience_level','title_cleaned','role','work_modality','company_name','location','via','schedule_type','salary_standardized','posted_date','extracted_skills','search_location']
df = df[new_order]

In [46]:
df.columns

Index(['job_id', 'title', 'experience_level', 'title_cleaned', 'role',
       'work_modality', 'company_name', 'location', 'via', 'schedule_type',
       'salary_standardized', 'posted_date', 'extracted_skills',
       'search_location'],
      dtype='object')

## Save Dataframe: Job Posts

In [47]:
df.to_csv('../data/us_jobposts.csv', index=False) 

In [48]:
df.shape

(19915, 14)