Skip to content

Commit

Permalink
Merge branch 'master' into multi-maps
Browse files Browse the repository at this point in the history
  • Loading branch information
Guts committed May 3, 2020
2 parents 45b2d52 + fee4d45 commit 6ae30fa
Show file tree
Hide file tree
Showing 30 changed files with 983 additions and 518 deletions.
8 changes: 8 additions & 0 deletions .deploy/backup/data_dumper.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash

# Dump selected models of the `jobs` and `cms` apps into their JSON fixture files.
# Paths are relative, so this presumably has to be run from the folder that
# contains `manage.py` - confirm against the deployment docs.

# Run `manage.py dumpdata` and redirect its output to a fixture file.
#   $1   - destination fixture path
#   $2.. - arguments forwarded verbatim to `dumpdata`
dump_to() {
    local fixture="$1"
    shift
    python manage.py dumpdata "$@" > "$fixture"
}

dump_to elgeopaso/jobs/fixtures/contracts.json jobs.Contract jobs.ContractVariations
dump_to elgeopaso/jobs/fixtures/metiers.json jobs.JobPosition jobs.JobPositionVariations
dump_to elgeopaso/jobs/fixtures/places.json jobs.Place jobs.PlaceVariations
dump_to elgeopaso/jobs/fixtures/sources.json jobs.Source
dump_to elgeopaso/jobs/fixtures/technos.json jobs.Technology jobs.TechnologyVariations
dump_to elgeopaso/cms/fixtures/content.json --exclude auth.permission --exclude contenttypes cms.Article cms.Category
39 changes: 22 additions & 17 deletions .deploy/git-hooks/post-receive.sh
Original file line number Diff line number Diff line change
@@ -1,49 +1,54 @@
#!/bin/sh
#!/bin/bash
# NOTE(review): this span comes from a rendered diff, so removed and added lines
# sit back to back without +/- markers (two shebangs, duplicated commands).
# Only the very first line of a script is honoured as the interpreter line.

# Variables
# --------------------------------------------------------

# Folder where to deploy the application
# The production directory
WWW="/var/www/elgeopaso"

# A temporary directory for deployment
TMP="/srv/tmp/elgeopaso"

# The Git local repository
# The Git repo
GIT="/srv/git/elgeopaso.git"

# Folder where to store environment files
# The Env repo
ENV="/srv/env/elgeopaso"

# Operations
# --------------------------------------------------------

# Deploy the content to the temporary directory
# NOTE(review): the backslash keeps `$TMP` literal, so the next line creates a
# directory named "$TMP" in the current folder instead of expanding the variable.
mkdir -p \$TMP
mkdir -p $TMP
# Check out the pushed tree into the temporary directory, overwriting local changes.
git --work-tree=$TMP --git-dir=$GIT checkout -f

# Copy the env variable to the temporary directory
# NOTE(review): same escaping issue - this copies from a literal "$ENV/." path.
cp -a \$ENV/. \$TMP
echo "GIT STEP - Copy '.env' file to the temp folder"
cp -a $ENV/. $TMP

# Build tasks
cd $TMP
# Remove useless files and folders
# NOTE(review): targets `./vscode`; presumably `.vscode` was intended - confirm.
rm -rf ./vscode

# Replace the content of the production directory
# with the temporary directory
cd / || exit

# Replace the production directory content with the temporary content
cd /
echo "GIT STEP - Clean final folder: $WWW"
# The production folder is deleted before the new tree is moved in place, so it
# does not exist between the two commands below.
rm -rf $WWW

echo "GIT STEP - Move new file structure from $TMP to $WWW"
mv $TMP $WWW

# Release tasks in production folder (start services, etc.)
# NOTE(review): escaped dollar again - this tries to enter a directory named "$WWW".
cd \$WWW
# Release tasks
cd $WWW || exit

# Create virtualenv and update pip
virtualenv -p /usr/bin/python3.7 .venv
# `source` is a bash builtin, not POSIX sh: this line relies on the bash shebang.
source ./.venv/bin/activate
# NOTE(review): the second pip command is a superset of the first one.
python -m pip install -U pip
python -m pip install -U pip setuptools wheel

# Install dependencies
python -m pip install -U -r requirements/production.txt
# Download the NLTK `punkt` and `stopwords` data packages.
python -m nltk.downloader punkt stopwords

# Run migrations
python manage.py migrate

# Collect static
python manage.py collectstatic --noinput
4 changes: 4 additions & 0 deletions .github/labeler.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
ci-cd:
- .github/**/*

dependencies:
- ./requirements.txt
- ./requirements/*.txt

documentation:
- docs/**/*
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
exclude: "node_modules|migrations|.venv|tests/dev/|tests/fixtures/"
# fail_fast: true
fail_fast: false
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.5.0
Expand All @@ -23,6 +23,6 @@ repos:
- id: black
exclude_types: [directory,]
language_version: python3
log_file: ./dev_precommit.log
log_file: ./dev_precommit_black.log
require_serial: true
types: [file, python]
2 changes: 1 addition & 1 deletion elgeopaso/__about__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
__title__ = "El Géo Paso"
__title_clean__ = "".join(e for e in __title__ if e.isalnum())
__uri__ = "https://github.com/Guts/elgeopaso/"
__version__ = "1.5.0-beta2"
__version__ = "1.5.0"
__version_info__ = tuple(
[
int(num) if num.isdigit() else num
Expand Down
3 changes: 1 addition & 2 deletions elgeopaso/jobs/analyzer/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
#! python3 # noqa: E265

from .georezo_analyzer import Analizer # noqa: F401
from .georezo.georezo_analyzer import GeorezoOfferAnalizer # noqa: F401
Empty file.
176 changes: 176 additions & 0 deletions elgeopaso/jobs/analyzer/georezo/georezo_analyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
#! python3 # noqa: E265

"""
Module in charge of analyzing raw offers from GeoRezo: extracting contract type,
place, etc. from title and abstract.
"""


# ###########################################################################
# ######### Libraries #############
# #################################

# Standard library
import logging
import re

# Django
from django.db import IntegrityError

# project modules
from elgeopaso.jobs.models import (
Contract,
GeorezoRSS,
Offer,
Place,
Source,
)
from elgeopaso.utils import TextToolbelt

from .parsers import ContentParser, TitleParser

# ##############################################################################
# ########## Globals ###############
# ##################################

# logs
logger = logging.getLogger(__name__)

# regex matching HTML tags and character entities (e.g. "<p>", "&nbsp;", "&#233;")
_regex_markups = re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")

# shortcuts
txt_toolbelt = TextToolbelt()

# ############################################################################
# ########## Classes ##############
# #################################


class GeorezoOfferAnalizer:
    """Analyze raw offers published on GeoRezo and store the structured result.

    For each offer ID, the raw record is read from the ``GeorezoRSS`` table, its
    title and content are stripped of HTML markup and parsed, and the outcome is
    either saved as a new ``Offer`` or used to refresh the existing one.
    """

    def __init__(
        self,
        li_offers_ids: list,
        opt_contracts: bool = True,
        opt_places: bool = True,
        opt_technos: bool = True,
        opt_skills: bool = True,
        opt_words: bool = True,
        source: str = "GEOREZO_RSS",
        new: bool = True,
    ):
        """Store the analysis parameters.

        The ``opt_*`` flags are kept on the instance; the ``analisis`` method of
        this class does not read them.

        :param list li_offers_ids: IDs list of offers to process
        :param bool opt_contracts: parse or not contracts types
        :param bool opt_places: parse or not contracts places
        :param bool opt_technos: parse or not contracts technologies
        :param bool opt_skills: parse or not contracts jobs label
        :param bool opt_words: parse or not contracts words
        :param str source: name of the ``Source`` row to attach offers to
        :param bool new: create offers if true, update existing ones otherwise
        """
        # parameters
        self.offers_ids = li_offers_ids
        self.opt_contracts = opt_contracts
        self.opt_places = opt_places
        self.opt_technos = opt_technos
        self.opt_skills = opt_skills
        self.opt_words = opt_words
        self.source = source
        self.new = new
        logger.debug("Launching analisis on {} offers.".format(len(self.offers_ids)))
        super().__init__()

    # HELPERS ----------------------------------------------------------------

    def _resolve_relations(self, contract_type, place) -> dict:
        """Fetch the foreign-key rows shared by the create and update paths.

        The ``.get()`` lookups are not guarded: a missing ``Contract``,
        ``Source`` or ``Place`` row propagates to the caller as an exception.

        :param contract_type: contract abbreviation returned by the title parser
        :param place: place name returned by the title parser

        :return: keyword arguments ``contract``, ``source`` and ``place``
        :rtype: dict
        """
        return {
            "contract": Contract.objects.get(abbrv=contract_type),
            "source": Source.objects.get(name=self.source),
            "place": Place.objects.get(name=place),
        }

    # MAIN METHOD ------------------------------------------------------------

    def analisis(self) -> None:
        """Perform analisis on offers.

        Offers are processed one by one. In creation mode (``self.new`` true),
        an offer whose RSS ID is already stored is logged and skipped. In update
        mode, an offer that is no longer stored is logged and skipped.
        """
        for offer_id in self.offers_ids:
            self.offer_id = offer_id
            # checks if offer has already been added - `self.new` is tested first
            # so that update mode does not pay for a useless DB query
            if self.new and Offer.objects.filter(id_rss=offer_id).exists():
                logger.error("Offer RSS_ID already exists in DB: {}".format(offer_id))
                continue
            logger.debug("launch analisis on : {}".format(self.offer_id))

            # get raw offer from georezo_rss table
            raw_offer = GeorezoRSS.objects.get(id_rss=offer_id)

            # -- Title analysis ----------------------
            clean_title = txt_toolbelt.remove_html_markups(raw_offer.title)
            title_parser = TitleParser(offer_id=offer_id, input_title=clean_title)

            # extract contract type, job positions and place from the title
            contract_type = title_parser.parse_contract_type()
            jobs_labels = title_parser.parse_jobs_positions()
            place = title_parser.parse_place(mode=0)

            # -- Content analysis ----------------------
            clean_content = txt_toolbelt.remove_html_markups(raw_offer.content)
            content_parser = ContentParser(
                offer_id=offer_id, input_content=clean_content
            )
            technos = content_parser.parse_technology()

            # add or update offer
            if self.new:
                clean_offer = Offer(
                    id_rss=offer_id,
                    raw_offer=raw_offer,
                    title=clean_title,
                    content=clean_content,
                    pub_date=raw_offer.pub_date,
                    **self._resolve_relations(contract_type, place),
                )
                try:
                    clean_offer.save()
                except IntegrityError as err_msg:
                    logger.error(
                        "Offer RSS_ID ({}) already exists in DB: {}".format(
                            offer_id, err_msg
                        )
                    )
                    continue
            else:
                stored_offers = Offer.objects.select_related().filter(id_rss=offer_id)
                if not stored_offers.exists():
                    logger.info(
                        "Offer to update no longer exists and won't be created: {}".format(
                            offer_id
                        )
                    )
                    continue
                stored_offers.update(
                    title=clean_title,
                    content=clean_content,
                    pub_date=raw_offer.pub_date,
                    **self._resolve_relations(contract_type, place),
                )
                # reload the instance: the many-to-many managers used below
                # live on a model instance, not on the queryset
                clean_offer = Offer.objects.select_related().get(id_rss=offer_id)

            # associate ManyToMany relationships
            clean_offer.technologies.set(technos)
            clean_offer.jobs_positions.set(jobs_labels)
            logger.debug("Offer analyzed and inserted jobs.offer: {}".format(offer_id))


# ############################################################################
# #### Stand alone program ########
# #################################
if __name__ == "__main__":
"""standalone execution."""
pass
5 changes: 5 additions & 0 deletions elgeopaso/jobs/analyzer/georezo/parsers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#! python3 # noqa: E265
# -*- coding: utf-8 -*-

from .content import ContentParser # noqa: F401
from .title import TitleParser # noqa: F401
79 changes: 79 additions & 0 deletions elgeopaso/jobs/analyzer/georezo/parsers/content.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#! python3 # noqa: E265

"""
Content parser.
"""


# ###########################################################################
# ######### Libraries #############
# #################################

# Standard library
import logging
import re

# project modules
from elgeopaso.jobs.models import (
Technology,
TechnologyVariations,
)

from elgeopaso.utils import TextToolbelt

# ##############################################################################
# ########## Globals ###############
# ##################################

# logs
logger = logging.getLogger(__name__)

# regex matching HTML tags and character entities (e.g. "<p>", "&nbsp;", "&#233;")
_regex_markups = re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")

# shortcuts
txt_toolbelt = TextToolbelt()

# ############################################################################
# ########## Classes ##############
# #################################


class ContentParser:
    """Parse content of offers published on GeoRezo to extract information.

    :param int offer_id: offer ID (for tracing purposes)
    :param str input_content: content to parse
    """

    def __init__(self, offer_id: int, input_content: str):
        """Instantiate the content parser and tokenize the content once."""
        # parameters
        self.offer_id = offer_id
        self.input_content = input_content

        # tokenize content
        self.tokenized_content = txt_toolbelt.tokenize(self.input_content)

    # PARSERS ----------------------------------------------------------------
    def parse_technology(self) -> list:
        """Identify technologies in content.

        Each token is lower-cased and looked up among the technology variation
        labels; tokens without a matching variation are ignored. A technology is
        appended once per matching token, so the result may contain duplicates.

        :return: matched ``Technology`` objects, in token order
        :rtype: list
        """
        technos_matched = []

        # parse tokenized content
        for word in self.tokenized_content:
            # single query per token: ask for the row and treat "not found" as
            # "not a technology" instead of calling exists() and then get()
            try:
                variation = TechnologyVariations.objects.get(label=word.lower())
            except TechnologyVariations.DoesNotExist:
                continue
            technos_matched.append(Technology.objects.get(name=variation.name))
        logger.debug("Technologies identified: {}".format(technos_matched))
        return technos_matched


# ############################################################################
# #### Stand alone program ########
# #################################
if __name__ == "__main__":
"""standalone execution."""
pass
Loading

0 comments on commit 6ae30fa

Please sign in to comment.