# Analysis of Data Job Market in Latvia
## Data Collection and Preparation

### 0. Setting up Virtual Environment

In [1]:
# Install virtualenv and create an isolated environment in ./myenv.
# NOTE(review): `!pip` runs whichever pip is first on PATH, which may not be the
# interpreter backing this kernel; `%pip` is the kernel-aware form — confirm.
!pip install virtualenv
!virtualenv myenv

created virtual environment CPython3.8.8.final.0-64 in 6110ms
  creator CPython3Windows(dest=C:\Users\Inese_\Desktop\sgt_project\myenv, clear=False, no_vcs_ignore=False, global=False)
  seeder FromAppData(download=False, pip=bundle, setuptools=bundle, wheel=bundle, via=copy, app_data_dir=C:\Users\Inese_\AppData\Local\pypa\virtualenv)
    added seed packages: pip==23.1.2, setuptools==67.7.2, wheel==0.40.0
  activators BashActivator,BatchActivator,FishActivator,NushellActivator,PowerShellActivator,PythonActivator


In [2]:
# Windows-only activation script of the environment created above.
# NOTE(review): each `!` command runs in its own subshell, so this presumably
# does not change the interpreter or packages used by the running kernel — confirm.
!myenv\Scripts\activate.bat

### 1. Data Collection

In [3]:
import requests
from bs4 import BeautifulSoup

In [4]:
import pandas as pd

In [5]:
import re

In [6]:
# cv.lv search-results page for data-related keywords (data analyst, data
# scientist, data engineer, machine learning, analyst, analītiķis, datu,
# datu zinātnieks, big data analytics), up to 2000 hits sorted by expiry.
# NOTE(review): `offset=20` skips the first 20 results — confirm this is intended.
url = 'https://www.cv.lv/lv/search?limit=2000&offset=20&keywords%5B0%5D=data%20analyst&keywords%5B1%5D=data%20scientist&keywords%5B2%5D=data%20engineer&keywords%5B3%5D=machine%20learning&keywords%5B4%5D=analyst&keywords%5B5%5D=anal%C4%ABti%C4%B7is&keywords%5B6%5D=datu&keywords%5B7%5D=datu%20zin%C4%81tnieks&keywords%5B8%5D=big%20data%20analytics&sorting=EXPIRING&fuzzy=false&suitableForRefugees=false&isHourlySalary=false&isRemoteWork=false&isQuickApply=false'
# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on HTTP errors so an error page is never parsed as search results.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [7]:
# Parse the raw response body with Python's built-in HTML parser.
html_bytes = response.content
soup = BeautifulSoup(html_bytes, 'html.parser')

In [8]:
# Collect every <li> vacancy card on the results page.
# NOTE(review): the class string contains `jsx-…` tokens that look build-generated
# and is passed as one exact string, so this selector presumably stops matching
# when the site is redeployed — confirm, and consider matching on
# `vacancies-list__item` alone.
vacancy_items = soup.find_all("li", class_="jsx-1871295890 jsx-2661613696 vacancies-list__item false")

In [9]:
# Latvian time-unit forms accepted after "pirms <N>" in a card's age line.
time_patterns = '(dien(a|s)|stund(a|s)|mēne(ši|sis)|sekund(e|es)|minūt(es|e))'
# Raw f-string keeps `\d` a regex escape rather than an invalid string escape;
# compiled once because the pattern is identical for every card.
published_re = re.compile(
    rf'Publicēts pirms (\d+) {time_patterns}|Atjaunināts pirms (\d+) {time_patterns}'
)


def _parse_vacancy(item):
    """Build one vacancy record from a search-result ``<li>`` element.

    Title, vacancy link, expiry and location are treated as mandatory, so a
    card without them still raises ``AttributeError``. Salary, the
    "published/updated ... ago" text and the employer link are optional.

    Args:
        item: BeautifulSoup tag of a single vacancy card.

    Returns:
        dict with the keys ``Title``, ``Vacancy URL``, ``Company name``,
        ``Company URL``, ``Location``, ``Published``, ``End_date`` and
        ``Salary``. Missing salary or employer values are ``None``;
        ``Published`` is ``''`` when no age text is recognised, which keeps the
        column string-typed for the later date conversion.
    """
    title = item.find("span", class_="jsx-1401030249 vacancy-item__title").text.strip()
    vacancy_url = item.find('a', {'class': 'vacancy-item'}).get('href')
    end_date = item.find("span", class_="jsx-1401030249 vacancy-item__expiry").text.strip()
    location = item.find("span", class_="jsx-1401030249 vacancy-item__locations").text.strip()

    salary_tag = item.find("span", class_="jsx-1401030249 vacancy-item__salary-label")
    salary = salary_tag.text.strip() if salary_tag is not None else None

    info_secondary = item.find('div', {'class': 'vacancy-item__info-secondary'})
    info_secondary_text = info_secondary.get_text() if info_secondary is not None else ''
    match = published_re.search(info_secondary_text)
    published = match.group(0) if match else ''

    # The second anchor with this class is used as the employer link. Values are
    # reset per card so a card without it cannot inherit the previous card's
    # employer.
    anchors = item.find_all('a', {'class': 'jsx-1401030249'})
    if len(anchors) >= 2:
        company_name = anchors[1].text
        company_url = anchors[1].get('href')
    else:
        company_name = None
        company_url = None

    return {
        "Title": title,
        "Vacancy URL": vacancy_url,
        "Company name": company_name,
        "Company URL": company_url,
        "Location": location,
        "Published": published,
        "End_date": end_date,
        "Salary": salary,
    }


# One record per card, collected first and turned into a DataFrame in one step.
vacancies = [_parse_vacancy(item) for item in vacancy_items]

df = pd.DataFrame(vacancies)

### 3. Data Cleaning

In [10]:
# Prefix the scraped links with the site host to obtain absolute URLs
# (the hrefs are presumably site-relative — confirm against the page).
cv_lv_link = r'https://www.cv.lv'
for url_column in ('Vacancy URL', 'Company URL'):
    scraped_links = df[url_column]
    # Leave values that already start with the host untouched, so re-running
    # this cell does not prepend the host a second time. Missing links stay missing.
    already_absolute = scraped_links.str.startswith(cv_lv_link, na=False)
    df[url_column] = scraped_links.where(already_absolute, cv_lv_link + scraped_links)

In [11]:
# Strip the literal "Beidzas: " label so only the date text remains.
df['End_date'] = df['End_date'].str.replace('Beidzas: ', '', regex=False)

In [12]:
def _split_location(location):
    """Split a comma-separated location text into [country, region, city].

    The text is read right to left: the last part is the country, the one
    before it the region, and everything further left is the city. Em dashes
    are removed and every part is stripped of surrounding whitespace.
    Levels that are absent are returned as ``None``; a non-string input
    yields three ``None`` values.
    """
    if not isinstance(location, str):
        return [None, None, None]
    parts = [part.strip() for part in location.replace('—', '').split(',')]
    country = parts[-1]
    region = parts[-2] if len(parts) >= 2 else None
    # More than three parts are kept together so no text is silently dropped.
    city = ', '.join(parts[:-2]) if len(parts) >= 3 else None
    return [country, region, city]


# Guarded so re-running the cell after 'Location' was removed is a no-op.
if 'Location' in df.columns:
    location_columns = ['Country', 'Region', 'City']
    # Always three columns, regardless of how many parts each row has.
    df[location_columns] = pd.DataFrame(
        [_split_location(value) for value in df['Location']],
        index=df.index,
        columns=location_columns,
        dtype=object,
    )
    # The original Location column is no longer needed.
    df = df.drop(columns='Location')



In [13]:
# Display the frame after URL, end-date and location cleaning.
df

Unnamed: 0,Title,Vacancy URL,Company name,Company URL,Published,End_date,Salary,Country,Region,City
0,IEKŠĒJĀS KOMUNIKĀCIJAS SPECIĀLISTS,https://www.cv.lv/lv/vacancy/990508/rigas-namu...,Rīgas namu pārvaldnieks SIA,https://www.cv.lv/lv/search/employer/rigas-nam...,Publicēts pirms 14 diena,09.05.2023,€ 1500,Latvija,Rīgas rajons,Rīga
1,Projektu vadītājs – jurists (uz noteiktu laiku),https://www.cv.lv/lv/vacancy/988039/rigas-dome...,Rīgas domes Īpašuma departaments,https://www.cv.lv/lv/search/employer/rigas-dom...,Publicēts pirms 20 diena,10.05.2023,€ 1301 – 1700,Latvija,Rīgas rajons,Rīga
2,SABIEDRISKO ATTIECĪBU SPECIĀLISTS/-E,https://www.cv.lv/lv/vacancy/991836/centrala-f...,Centrālā finanšu un līgumu aģentūra (CFLA),https://www.cv.lv/lv/search/employer/centrala-...,Publicēts pirms 12 diena,10.05.2023,€ 1403 – 1584,Latvija,Rīgas rajons,Rīga
3,Sporta departamenta direktors (ierēdņa amats u...,https://www.cv.lv/lv/vacancy/992246/lr-izgliti...,LR Izglītības un zinātnes ministrija,https://www.cv.lv/lv/search/employer/lr-izglit...,Atjaunināts pirms 5 diena,10.05.2023,€ 2903 – 3226,Latvija,Rīgas rajons,Rīga
4,CRM Analyst,https://www.cv.lv/lv/vacancy/982718/sph-engine...,"SPH Engineering, SIA",https://www.cv.lv/lv/search/employer/sph-engin...,Atjaunināts pirms 6 diena,10.05.2023,€ 2500,Latvija,Rīgas rajons,Baloži
...,...,...,...,...,...,...,...,...,...,...
998,Test Engineer,https://www.cv.lv/lv/vacancy/993940/visma/test...,VISMA,https://www.cv.lv/lv/search/employer/visma?emp...,Publicēts pirms 6 diena,30.06.2023,€ 2000 – 3000,Latvija,Rīgas rajons,Rīga
999,Senior Test Engineer,https://www.cv.lv/lv/vacancy/995138/visma/seni...,VISMA,https://www.cv.lv/lv/search/employer/visma?emp...,Publicēts pirms 4 diena,30.06.2023,€ 2400 – 3000,Latvija,Rīgas rajons,Rīga
1000,Security Manager,https://www.cv.lv/lv/vacancy/987624/visma/secu...,VISMA,https://www.cv.lv/lv/search/employer/visma?emp...,Publicēts pirms 20 diena,30.06.2023,€ 3500 – 4500,Latvija,Rīgas rajons,Rīga
1001,Automatizācijas sistēmu inženieris/ Vadošais a...,https://www.cv.lv/lv/vacancy/992319/belam-riga...,Belam - Riga SIA,https://www.cv.lv/lv/search/employer/belam-rig...,Atjaunināts pirms 5 diena,27.07.2023,€ 2000,Latvija,Rīgas rajons,Rīga


In [14]:
# Remove the euro sign and surrounding whitespace
df['Salary'] = df['Salary'].str.replace('€', '', regex=False).str.strip()


# Split "min – max" on the en dash; a single-value salary fills only the min column.
# reindex guarantees two columns even when no salary in the data is a range,
# and object dtype keeps the `.str` accessor usable on an all-missing column.
salary_parts = (
    df['Salary']
    .str.split('–', n=1, expand=True)
    .reindex(columns=[0, 1])
    .astype(object)
)
df[['Salary Range min', 'Salary Range max']] = salary_parts


In [15]:
# Strip the '/st.' suffix (presumably "per hour" — the Salary column keeps it
# for later use) and convert both bounds to float.
# regex=False removes the text literally; as a regex the trailing dot would
# match any character, and older pandas versions warn about the ambiguity.
for salary_col in ('Salary Range min', 'Salary Range max'):
    df[salary_col] = (
        df[salary_col]
        .str.replace('/st.', '', regex=False)
        .str.strip()
        .astype(float)
    )


  df['Salary Range min'] = df['Salary Range min'].str.replace('/st.', '').str.strip().astype(float)
  df['Salary Range max'] = df['Salary Range max'].str.replace('/st.', '').str.strip().astype(float)


In [16]:
# Scale rows whose Salary text carries '/st.' by 21.62 * 8
# (presumably average working days per month times hours per day — confirm).
WORKING_DAYS_PER_MONTH = 21.62
HOURS_PER_DAY = 8

# Missing or non-text salaries count as "not hourly".
is_hourly = df['Salary'].str.contains('/st.', regex=False, na=False)
# Every hourly row is scaled, including single-value salaries whose max is
# missing; a missing bound simply stays missing after multiplication.
# NOTE: running this cell twice scales the values twice.
df.loc[is_hourly, ['Salary Range min', 'Salary Range max']] *= WORKING_DAYS_PER_MONTH * HOURS_PER_DAY


In [17]:
# Keep both salary bounds at two decimal places.
salary_bounds = ['Salary Range min', 'Salary Range max']
df[salary_bounds] = df[salary_bounds].round(2)

In [18]:
# The raw Salary text is no longer needed once the numeric bounds exist.
del df['Salary']

In [19]:
import re
from datetime import datetime, timedelta

def get_date_from_string(s, now=None):
    """Turn a relative "pirms N <unit>" age text into a ``dd.mm.YYYY`` date string.

    Args:
        s: Text such as ``"Publicēts pirms 14 diena"``.
        now: Reference ``datetime`` the age is subtracted from. Defaults to
            ``datetime.now()``; pass the scrape time for reproducible results.

    Returns:
        The date as ``dd.mm.YYYY``. Days are subtracted exactly, months are
        approximated as 30 days, and second/minute/hour ages map to the
        reference date itself. ``None`` is returned when ``s`` is not a string,
        names no known unit, or lacks the number a day/month age needs.
    """
    if not isinstance(s, str):
        return None
    reference = datetime.now() if now is None else now

    if "dien" in s:
        days_per_unit = 1
    elif "mēne" in s:
        # The stem covers both "mēnesis" and the plural "mēneši"; testing for
        # "mēnesi" would miss the plural because of the "š".
        days_per_unit = 30
    elif "seku" in s or "min" in s or "stund" in s:
        return reference.strftime('%d.%m.%Y')
    else:
        return None

    amount = re.search(r'\d+', s)
    if amount is None:
        return None
    age = timedelta(days=int(amount.group()) * days_per_unit)
    return (reference - age).strftime('%d.%m.%Y')

# Convert every relative "Published" text into an absolute date string.
# NOTE: this overwrites the column, so the cell is meant to run once per scrape.
df['Published'] = df['Published'].map(get_date_from_string)

### 4. Data Filtering

In [20]:
# Keep postings whose title contains "anal" or "dat" (regex alternation,
# case-insensitive substring match).
title_matches = df['Title'].str.contains('anal|dat', case=False)
filtered_df = df[title_matches]

In [21]:
# Display the postings kept by the title filter.
filtered_df

Unnamed: 0,Title,Vacancy URL,Company name,Company URL,Published,End_date,Country,Region,City,Salary Range min,Salary Range max
4,CRM Analyst,https://www.cv.lv/lv/vacancy/982718/sph-engine...,"SPH Engineering, SIA",https://www.cv.lv/lv/search/employer/sph-engin...,03.05.2023,10.05.2023,Latvija,Rīgas rajons,Baloži,2500.0,
33,Risk & Integrity System Analyst,https://www.cv.lv/lv/vacancy/983555/evolution-...,Evolution Latvia SIA,https://www.cv.lv/lv/search/employer/evolution...,09.05.2023,11.05.2023,Latvija,Rīgas rajons,Rīga,2500.0,
38,DATU INŽENIERIS,https://www.cv.lv/lv/vacancy/983230/sia-aerone...,SIA Aerones Engineering,https://www.cv.lv/lv/search/employer/sia-aeron...,09.05.2023,11.05.2023,Latvija,Rīgas rajons,Rīga,2000.0,5000.0
40,REZERVES DAĻU UN NOLIKTAVAS KRĀJUMU DATU IEVAD...,https://www.cv.lv/lv/vacancy/983494/arsenal-in...,Arsenal Industrial SIA,https://www.cv.lv/lv/search/employer/arsenal-i...,09.05.2023,11.05.2023,Latvija,Rīgas rajons,Rīga,1000.0,1300.0
49,Product Data Specialist (temporary contract),https://www.cv.lv/lv/vacancy/992227/rimi-latvi...,RIMI Latvia SIA,https://www.cv.lv/lv/search/employer/rimi-latv...,27.04.2023,11.05.2023,Latvija,Rīgas rajons,Rīga,1100.0,1350.0
...,...,...,...,...,...,...,...,...,...,...,...
984,VECĀKAIS ĶĪMIĶIS– ANALĪTIĶIS produktu attīstīb...,https://www.cv.lv/lv/vacancy/997175/pharmidea-...,PHARMIDEA SIA,https://www.cv.lv/lv/search/employer/pharmidea...,09.05.2023,09.06.2023,Latvija,Rīgas rajons,Olaine,1700.0,
985,Datu ievades operators/e,https://www.cv.lv/lv/vacancy/997217/konig-dist...,König Distribution AS,https://www.cv.lv/lv/search/employer/konig-dis...,09.05.2023,09.06.2023,Latvija,Liepājas rajons,Liepāja,900.0,
990,Biznesa analītikas (BI) eksperts/-e,https://www.cv.lv/lv/vacancy/996869/elva-balti...,Elva Baltic SIA,https://www.cv.lv/lv/search/employer/elva-balt...,09.05.2023,09.06.2023,Latvija,Rīgas rajons,Rīga,2500.0,3500.0
991,QA analyst with Azerbaijani language,https://www.cv.lv/lv/vacancy/996973/lendiscore...,Lendiscore SIA,https://www.cv.lv/lv/search/employer/lendiscor...,09.05.2023,09.06.2023,Latvija,Rīgas rajons,Rīga,900.0,1600.0


### 5. SQLite Database

In [23]:
import sqlite3
import pandas as pd

# Persist the filtered postings to a local SQLite file as table 'alldata'.
conn = sqlite3.connect('Job_postings_CVlv.db')
try:
    # if_exists='replace' keeps the cell re-runnable: the default ('fail')
    # raises as soon as the table exists from an earlier run.
    filtered_df.to_sql('alldata', conn, if_exists='replace')
finally:
    # Close the connection even when the write fails.
    conn.close()