In [1]:
import pandas as pd
import numpy as np
import re
import os
import datetime
import matplotlib.pyplot as plt

In [2]:
file_paths = [
    '../data/extracted_data/barcelona_business_intelligence.csv',
    '../data/extracted_data/barcelona_data_analyst.csv',
    '../data/extracted_data/barcelona_data_engineer.csv',
    '../data/extracted_data/barcelona_data_scientist.csv',
    '../data/extracted_data/madrid_business_intelligence.csv',
    '../data/extracted_data/madrid_data_analyst.csv',
    '../data/extracted_data/madrid_data_engineer.csv',
    '../data/extracted_data/madrid_data_scientist.csv'
]

# Concatenate CSV files
dfs = [pd.read_csv(file) for file in file_paths]
df = pd.concat(dfs, ignore_index=True)

# Save dataframe
df.to_csv('../data/spain_jobposts_combined.csv', index=False)

In [3]:
df = pd.read_csv('../data/spain_jobposts_combined.csv',encoding='utf-8')
df.sample(1)

Unnamed: 0,title,company_name,location,posted_time_ago,seniority_level,employment_type,job_function,industries,description
2828,Desarrollador/a HTML JS Java8 PL/SQL,TUYÚ Technology,Madrid Community of Madrid Sp...,...,Entry level,Full-time,Engineering and Information Techn...,IT Services and IT Consulting ...,¿Quieres crecer profesionalmente? ¿...


In [4]:
df.shape

(3124, 9)

## Drop duplicated rows

In [5]:
# Find duplicates based on 'title' and 'company' columns 
duplicate_mask = df.duplicated(subset=['title', 'company_name'], keep=False)

# Get both original and duplicate rows for double-check
original_and_duplicates = df[duplicate_mask].sort_values(by=['title', 'company_name'])
original_and_duplicates

Unnamed: 0,title,company_name,location,posted_time_ago,seniority_level,employment_type,job_function,industries,description
1938,Data Analyst / Engineer - PySpark y Azure,EXCELIA,Madrid Community of Madrid Sp...,...,Associate,Full-time,Engineering and Information Techn...,IT Services and IT Consulting ...,excelia es una firma multinacional de Consu...
2507,Data Analyst / Engineer - PySpark y Azure,EXCELIA,Madrid Community of Madrid Sp...,...,Associate,Full-time,Engineering and Information Techn...,IT Services and IT Consulting ...,excelia es una firma multinacional de Consu...
2566,Data Analyst / Engineer - PySpark y Azure,EXCELIA,Madrid Community of Madrid Sp...,...,Associate,Full-time,Engineering and Information Techn...,IT Services and IT Consulting ...,excelia es una firma multinacional de Consu...
1934,Ingeniero/ a de Datos SQL Big data y Azure,Tecnicas Reunidas ...,Madrid Community of Madrid Sp...,...,Mid-Senior level,Full-time,Engineering,IT Services and IT Consulting ...,Técnicas Reunidas Group (TR) es una empresa...
2288,Ingeniero/ a de Datos SQL Big data y Azure,Tecnicas Reunidas ...,Madrid Community of Madrid Sp...,...,Mid-Senior level,Full-time,Engineering,IT Services and IT Consulting ...,Técnicas Reunidas Group (TR) es una empresa...
...,...,...,...,...,...,...,...,...,...
1557,Técnico/a de desarrollo (.net y Sql),Grupo Crit,Madrid Community of Madrid Sp...,...,Entry level,Full-time,Management and Manufacturing ...,Human Resources Services,En Grupo Crit desde del departament...
2406,UI Software Engineer (C++) - Madden Core,Electronic Arts (EA) ...,Madrid Community of Madrid Sp...,...,Associate,Full-time,Engineering,IT Services and IT Consulting Soft...,EA SPORTS is one of the leading sports ente...
2575,UI Software Engineer (C++) - Madden Core,Electronic Arts (EA) ...,Madrid Community of Madrid Sp...,...,Associate,Full-time,Engineering,IT Services and IT Consulting Soft...,EA SPORTS is one of the leading sports ente...
716,Workplace Analytics Engineer,Canonical,Barcelona Catalonia Spain ...,...,Entry level,Full-time,Information Technology,Technology Information and Interne...,Bring your people analytics social...


In [6]:
# Drop the duplicate rows
df.drop_duplicates(subset=['title', 'company_name'], keep='first', inplace=True)

## Create column: "job_id"

In [7]:
df['job_id'] = range(1, len(df) + 1)

## Cleaning column: "title"

In [8]:
def clean_title(title):
    title_lower = title.lower()
    if 'machine' in title_lower:
        return 'Machine Learning Engineer'
    if 'artificial' in title_lower or 'AI' in title or 'ML' in title:
        return 'Machine Learning Engineer'
    elif 'cloud' in title_lower:
        return 'Cloud Engineer'
    elif 'software' in title_lower:
        return 'Software Engineer'
    elif 'dev' in title_lower:
        return 'Software Engineer'
    elif re.search(r'\bBI\b', title) or re.search(r'\bbusiness\s*(?!data\s*)\w*\s*analyst\b|\bbusiness\s*(?!data\s*)\w*\s*intelligence\b|\bfinancial\s*analyst\b', title_lower):
        return 'Business Analyst'
    elif 'engineer' in title_lower and 'data' in title_lower:
        return 'Data Engineer'
    elif 'software' in title_lower:
        return 'Software Engineer'
    elif 'scien' in title_lower:
        return 'Data Scientist'
    elif 'cient' in title_lower:
        return 'Data Scientist'
    elif re.search(r'\bdata\b|\banalytics?\b', title_lower):
        return 'Data Analyst'
    elif re.search(r'\bdata\b.*\b(analyst|analytics|analysis)\b|\banalytics?\b', title_lower):
        return 'Data Analyst'
    elif 'system' in title_lower:
        return 'Software Engineer'
    elif 'program' in title_lower:
        return 'Software Engineer'
    elif 'ingen' in title_lower and 'datos' in title_lower:
        return 'Data Engineer'
    elif 'analista'in title_lower and 'datos' or 'digital' in title_lower:
        return 'Data Analyst'
    elif 'desarro' in title_lower:
        return 'Software Engineer'
    elif 'backend' in title_lower:
        return 'Software Engineer'
    elif 'frontend' in title_lower:
        return 'Software Engineer'
    elif 'full' in title_lower:
        return 'Software Engineer'
    
    else:
        return None

In [9]:
# Apply the function and create the 'title_cleaned' column in clear data role categories 
df.loc[:, 'title_cleaned'] = df['title'].apply(clean_title)

In [11]:
df['title_cleaned'].value_counts()

title_cleaned
Software Engineer            455
Data Analyst                 314
Data Engineer                278
Data Scientist               172
Business Analyst             146
Machine Learning Engineer     88
Cloud Engineer                46
Name: count, dtype: int64

In [10]:
df['title_cleaned'].isnull().sum()

391

In [12]:
df.dropna(subset=['title_cleaned'], inplace=True)

## Cleaning column: "seniority_level"

In [13]:
df['seniority_level'] = df['seniority_level'].str.strip()

In [14]:
replace_dict = {
    'Mid-Senior level': 'Mid-Senior',
    'Entry level': 'Junior',
    'Associate': 'Junior',
    'Not Applicable': 'Not-Specified',
    'Internship': 'Internship',
    'Director': 'Director',
    'Executive': 'Manager',
}

df['seniority_level'] = df['seniority_level'].replace(replace_dict)

In [15]:
df['seniority_level'].value_counts()

seniority_level
Junior           628
Mid-Senior       574
Not-Specified    204
Internship        19
Director           7
Manager            5
Name: count, dtype: int64

## Create column: "experience_level"

In [16]:
def categorize_experience(title):
    title_lower = title.lower()
    if any(word in title_lower for word in [ 'intern','internship']):
        return 'Internship'
    elif any(word in title_lower for word in ['manager', 'executive', 'principal', 'staff']):
        return 'Manager'
    elif any(word in title_lower for word in ['director','associate director']):
        return 'Director'
    elif any(word in title_lower for word in ['mid-senior','mid level','mid-level','ii','specialist']):
        return 'Mid-Senior'
    elif any(word in title_lower for word in ['jr', 'jr.','junior','entry', 'associate','1']):
        return 'Junior'
    elif any(word in title_lower for word in ['sr.', 'sr', 'senior','exper','3','4','lead','steward']):
        return 'Senior'
    elif any(word in title for word in ['III','IV']):
        return 'Senior'
    elif 'II' in title:
        return 'Mid-Senior'
    elif re.search(r'\bI\b', title):
        return 'Junior'
    else:
        return None

In [17]:
# Apply the function to create the 'experience_level' column
df['experience_level'] = df['title'].apply(categorize_experience)

In [18]:
df['experience_level'].isnull().sum()

1028

In [19]:
# Fill null cells in 'experience_level' with values from 'seniority_level' where 'experience_level' is null
df['experience_level'].fillna(df['seniority_level'], inplace=True)

In [20]:
df['experience_level'].isnull().sum()

40

In [21]:
df.head(2)

Unnamed: 0,title,company_name,location,posted_time_ago,seniority_level,employment_type,job_function,industries,description,job_id,title_cleaned,experience_level
0,Business Analyst (Bangkok Based relocation pr...,Agoda,Barcelona Catalonia Spain ...,...,Junior,Full-time,Research Analyst and Informatio...,Technology Information and Interne...,About AgodaAgoda is an online travel bookin...,1,Business Analyst,Junior
1,Senior Data Analyst (Product Team) (Bangkok Ba...,Agoda,Barcelona Catalonia Spain ...,...,Junior,Full-time,Information Technology,Technology Information and Interne...,About AgodaAgoda is an online travel bookin...,2,Data Analyst,Senior


## Creating a New Column: "Role"

In [23]:
import numpy as np

def get_role_name(row):
    title_cleaned = row['title_cleaned']
    experience_level = row['experience_level']
    
    if pd.isnull(experience_level) or experience_level == "Not-Specified":
        return title_cleaned
    elif pd.isnull(title_cleaned):
        return None
    else:
        return f"{title_cleaned} {experience_level}" if not pd.isnull(experience_level) else title_cleaned

In [24]:
df['role'] = df.apply(get_role_name, axis=1)

In [25]:
df['role'].unique()

array(['Business Analyst Junior', 'Data Analyst Senior',
       'Data Analyst Junior', 'Business Analyst Mid-Senior',
       'Business Analyst Internship', 'Software Engineer Mid-Senior',
       'Data Engineer Junior', 'Data Engineer Senior', 'Business Analyst',
       'Business Analyst Senior', 'Data Analyst Mid-Senior',
       'Data Analyst Manager', 'Software Engineer Manager',
       'Cloud Engineer Mid-Senior', 'Business Analyst Manager',
       'Data Engineer Mid-Senior', 'Business Analyst Director',
       'Software Engineer Junior', 'Data Scientist Mid-Senior',
       'Data Scientist Junior', 'Data Analyst Director', 'Data Engineer',
       'Software Engineer', 'Software Engineer Internship',
       'Data Analyst', 'Software Engineer Senior',
       'Data Scientist Senior', 'Cloud Engineer Junior', 'Data Scientist',
       'Data Analyst Internship', 'Data Scientist Internship',
       'Machine Learning Engineer Junior', 'Cloud Engineer Senior',
       'Machine Learning Engineer

## Cleaning column: "location"

In [27]:
# Convert the column to lowercase for case-insensitive matching
df.loc[:, 'location'] = df['location'].str.lower()

# Set conditions for replacing values
condition_catalonia = df['location'].str.contains('barcelona|catalonia|manresa|granollers', case=False)
condition_madrid = df['location'].str.contains('madrid', case=False)

# Replace values based on conditions using .loc to avoid SettingWithCopyWarning
df.loc[condition_catalonia, 'location'] = 'Catalonia'
df.loc[condition_madrid, 'location'] = 'Madrid'

## Creating new column: "via"

In [28]:
df['via'] = 'Linkedin'

## Cleaning column: "employment_type"

In [29]:
# Changing name to have similar columns to the US dataset
df = df.rename(columns={'employment_type': 'schedule_type'})

In [30]:
df['schedule_type'] = df['schedule_type'].str.strip()

In [31]:
df['schedule_type'] = df['schedule_type'].replace({
    'Other': 'Full-time',
    'Temporary': 'Contractor',
    'Contract': 'Contractor'})

In [32]:
df['schedule_type'].value_counts()

schedule_type
Full-time     1431
Contractor      47
Part-time       13
Internship       8
Name: count, dtype: int64

## Cleaning column: "posted_time_ago"

In [33]:
# Remove leading and trailing whitespace from 'posted_time_ago' column
df['posted_time_ago'] = df['posted_time_ago'].str.strip()

In [34]:
from datetime import datetime

# Dictionary to map time units to their corresponding pandas timedelta arguments
time_units = {
    'month': 'D',  # Change to 'D' to avoid ambiguity
    'year': 'D',  # Change to 'D' to avoid ambiguity
    'week': 'W',
    'day': 'D',
    'hour': 'h',
    'minute': 'm',
    'second': 's'
}

# Function to convert time strings to timedelta
def time_string_to_timedelta(time_str):
    # Extract numerical value and unit
    value, unit, _ = time_str.split()
    # Convert value to an integer
    value = int(value)
    # Remove 's' (plural) if present 
    unit = unit.rstrip('s')
    # Map the unit to the corresponding timedelta argument
    unit = time_units.get(unit.lower()) 
    # Create timedelta object
    return pd.to_timedelta(value, unit=unit)

In [35]:
# Apply the function to create a new column with timedelta values
df['time_difference'] = df['posted_time_ago'].apply(time_string_to_timedelta)

# Calculate the date when the job was posted (03.Dec.2023 is the reference date)
today = pd.to_datetime(datetime.now().date())
df['posted_date'] = today - df['time_difference']

print(df[['posted_time_ago', 'posted_date']])

     posted_time_ago posted_date
0        2 weeks ago  2023-11-21
1         1 week ago  2023-11-28
2         3 days ago  2023-12-02
3         1 week ago  2023-11-28
4         1 week ago  2023-11-28
...              ...         ...
3114    5 months ago  2023-11-30
3118     3 weeks ago  2023-11-14
3119    6 months ago  2023-11-29
3120      4 days ago  2023-12-01
3122     3 weeks ago  2023-11-14

[1499 rows x 2 columns]


In [36]:
df.dtypes

title                        object
company_name                 object
location                     object
posted_time_ago              object
seniority_level              object
schedule_type                object
job_function                 object
industries                   object
description                  object
job_id                        int64
title_cleaned                object
experience_level             object
role                         object
via                          object
time_difference     timedelta64[ns]
posted_date          datetime64[ns]
dtype: object

## Creating column: "skills"

In [37]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Ensure NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')

# Define the function to tokenize descriptions and remove stopwords
def tokenize_and_remove_stopwords(description):
    # Convert to lowercase and tokenize
    tokens = word_tokenize(description.lower())
    # Remove non-alphabetic tokens and stopwords
    return [token for token in tokens if token.isalpha() and token not in stop_words]

# Set of English stopwords
stop_words = set(stopwords.words('english'))

# Tokenize the descriptions and remove stopwords
df['description_tokens'] = df['description'].apply(tokenize_and_remove_stopwords)

# Define a list of technical skills relevant to data roles
skills_list = [ "tableau","excel","power_bi","sas","word","powerpoint","sap","ssis","looker","qlik","alteryx",
    "spss","ssrs","outlook","dax","sharepoint","splunk","microstrategy","cognos","visio","google sheets","spreadsheet",
    "ms access","datarobot","sql server","mysql","cassandra","postgresql","mongodb","elasticsearch","dynamodb","redis","db2","neo4j",
    "mariadb","firebase","couchbase","sqlite","firestore","couchdb","aws","azure","snowflake","databricks","redshift","gcp","oracle","bigquery","aurora",
    "vmware","ibm cloud","firebase","watson","openstack","heroku","digital ocean", "colocation","ovh","linode","spark","hadoop","kafka","airflow","pyspark","pandas","tensorflow","pytorch","numpy","scikit-learn",
    "keras","jupyter","react","matplotlib","spring","gdpr","plotly","seaborn","graphql","nltk","opencv","ggplot2",
    "selenium","mxnet","tidyverse","express","node.js","angular","flask","ruby","django","vue.js","phoenix","fastapi","jquery",
    "asp.net","ruby on rails", "react","laravel","angular.js", "asp.net core", "next.js","drupal","svelte",
    "symfony","blazor","play framework",  "gatsby","python","sql","r","java","scala","nosql","sas","go","mongodb","shell","javascript","c++","c#","c",
    "matlab","vba","bash","t-sql","powershell","html","perl","css","ruby","typescript", 'deep learning',
    'machine learning', 'nlp', 'statistics', 'data mining', 'data visualization', 'big data', 'data analysis']  # Your list of skills

# Function to extract skills from the tokenized descriptions
def extract_skills(description_tokens):
    return [skill for skill in skills_list if skill in description_tokens]


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mairagutierrez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mairagutierrez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [38]:
# Create a new column with the extracted skills for each row
df['extracted_skills'] = df['description_tokens'].apply(extract_skills)

# Display the updated dataframe with the new 'extracted_skills' column
df['extracted_skills']

0       [tableau, excel, sas, spss, python, sql, r, sa...
1            [tableau, excel, python, sql, r, statistics]
2       [tableau, excel, aws, redshift, python, sql, vba]
3                                         [microstrategy]
4                             [tableau, looker, dax, sql]
                              ...                        
3114                                          [python, c]
3118    [postgresql, redis, aws, pandas, flask, django...
3119            [pandas, pytorch, numpy, sql, statistics]
3120                            [python, sql, statistics]
3122                                                   []
Name: extracted_skills, Length: 1499, dtype: object

In [39]:
df.head(1)

Unnamed: 0,title,company_name,location,posted_time_ago,seniority_level,schedule_type,job_function,industries,description,job_id,title_cleaned,experience_level,role,via,time_difference,posted_date,description_tokens,extracted_skills
0,Business Analyst (Bangkok Based relocation pr...,Agoda,Catalonia,2 weeks ago,Junior,Full-time,Research Analyst and Informatio...,Technology Information and Interne...,About AgodaAgoda is an online travel bookin...,1,Business Analyst,Junior,Business Analyst Junior,Linkedin,14 days,2023-11-21,"[agodaagoda, online, travel, booking, platform...","[tableau, excel, sas, spss, python, sql, r, sa..."


## Make uniform my datasets

In [40]:
columns_to_drop = ['seniority_level','posted_time_ago','time_difference','description_tokens','industries','description']
df.drop(columns=columns_to_drop , inplace=True)

In [41]:
# Create work_modality column so that it has the same columns to evaluate in mysql

df['work_modality'] = None
df['salary_standardized'] = None

In [42]:
# Rearrenged columns
new_order = ['job_id','title','experience_level','title_cleaned','role','work_modality','company_name','location','via','schedule_type','salary_standardized','posted_date','extracted_skills']
df = df[new_order]

In [43]:
df.columns

Index(['job_id', 'title', 'experience_level', 'title_cleaned', 'role',
       'work_modality', 'company_name', 'location', 'via', 'schedule_type',
       'salary_standardized', 'posted_date', 'extracted_skills'],
      dtype='object')

In [44]:
df.sample(2)

Unnamed: 0,job_id,title,experience_level,title_cleaned,role,work_modality,company_name,location,via,schedule_type,salary_standardized,posted_date,extracted_skills
4,5,Data Engineer (Power BI),Mid-Senior,Business Analyst,Business Analyst Mid-Senior,,Q-tech,Catalonia,Linkedin,Full-time,,2023-11-28,"[tableau, looker, dax, sql]"
2078,1316,Digital Marketing Data Analyst,Mid-Senior,Data Analyst,Data Analyst Mid-Senior,,Darkroom,Madrid,Linkedin,Full-time,,2023-11-28,"[python, sql]"


## Create search_location column

In [45]:
df['search_location']='Spain'

## Save Dataframe: Job Posts

In [46]:
df.to_csv('../data/spain_jobposts.csv', index=False) 

In [47]:
df.head(2)

Unnamed: 0,job_id,title,experience_level,title_cleaned,role,work_modality,company_name,location,via,schedule_type,salary_standardized,posted_date,extracted_skills,search_location
0,1,Business Analyst (Bangkok Based relocation pr...,Junior,Business Analyst,Business Analyst Junior,,Agoda,Catalonia,Linkedin,Full-time,,2023-11-21,"[tableau, excel, sas, spss, python, sql, r, sa...",Spain
1,2,Senior Data Analyst (Product Team) (Bangkok Ba...,Senior,Data Analyst,Data Analyst Senior,,Agoda,Catalonia,Linkedin,Full-time,,2023-11-28,"[tableau, excel, python, sql, r, statistics]",Spain


In [49]:
df.shape

(1499, 14)