-
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'master' into multi-maps
- Loading branch information
Showing
30 changed files
with
983 additions
and
518 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
#!/bin/bash

# Export reference data from the database into the fixture files shipped with
# the `jobs` and `cms` apps.
# NOTE(review): paths are relative, so this presumably has to be run from the
# repository root (next to manage.py) - confirm.

# Stop at the first failing command (and on unset variables / failed pipes) so a
# broken dump is reported instead of being silently followed by the next ones.
set -euo pipefail

# jobs: one fixture per reference table and its "variations" table
python manage.py dumpdata jobs.Contract jobs.ContractVariations > elgeopaso/jobs/fixtures/contracts.json
python manage.py dumpdata jobs.JobPosition jobs.JobPositionVariations > elgeopaso/jobs/fixtures/metiers.json
python manage.py dumpdata jobs.Place jobs.PlaceVariations > elgeopaso/jobs/fixtures/places.json
python manage.py dumpdata jobs.Source > elgeopaso/jobs/fixtures/sources.json
python manage.py dumpdata jobs.Technology jobs.TechnologyVariations > elgeopaso/jobs/fixtures/technos.json

# cms: articles and categories, without permissions and content types
python manage.py dumpdata --exclude auth.permission --exclude contenttypes cms.Article cms.Category > elgeopaso/cms/fixtures/content.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,49 +1,54 @@ | ||
#!/bin/sh | ||
#!/bin/bash | ||
|
||
# Variables | ||
# -------------------------------------------------------- | ||
|
||
# Folder where to deploy the application | ||
# The production directory | ||
WWW="/var/www/elgeopaso" | ||
|
||
# A temporary directory for deployment | ||
TMP="/srv/tmp/elgeopaso" | ||
|
||
# The Git local repository | ||
# The Git repo | ||
GIT="/srv/git/elgeopaso.git" | ||
|
||
# Folder where to store environment files | ||
# The Env repo | ||
ENV="/srv/env/elgeopaso" | ||
|
||
# Operations | ||
# -------------------------------------------------------- | ||
|
||
# Deploy the content to the temporary directory | ||
mkdir -p \$TMP | ||
mkdir -p $TMP | ||
git --work-tree=$TMP --git-dir=$GIT checkout -f | ||
|
||
# Copy the env variable to the temporary directory | ||
cp -a \$ENV/. \$TMP | ||
echo "GIT STEP - Copy '.env' file to the temp folder" | ||
cp -a $ENV/. $TMP | ||
|
||
# Build tasks | ||
cd $TMP | ||
# Remove useless files and folders | ||
rm -rf ./vscode | ||
|
||
# Replace the content of the production directory | ||
# with the temporary directory | ||
cd / || exit | ||
|
||
# Replace the production directory content with the temporary content | ||
cd / | ||
echo "GIT STEP - Clean final folder: $WWW" | ||
rm -rf $WWW | ||
|
||
echo "GIT STEP - Move new file structure from $TMP to $WWW" | ||
mv $TMP $WWW | ||
|
||
# Release tasks in production folder (start services, etc.) | ||
cd \$WWW | ||
# Release tasks | ||
cd $WWW || exit | ||
|
||
# Create virtualenv and update pip | ||
virtualenv -p /usr/bin/python3.7 .venv | ||
source ./.venv/bin/activate | ||
python -m pip install -U pip | ||
python -m pip install -U pip setuptools wheel | ||
|
||
# Install dependencies | ||
python -m pip install -U -r requirements/production.txt | ||
python -m nltk.downloader punkt stopwords | ||
|
||
# Run migrations | ||
python manage.py migrate | ||
|
||
# Collect static | ||
python manage.py collectstatic --noinput |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,9 @@ | ||
ci-cd: | ||
- .github/**/* | ||
|
||
dependencies: | ||
- ./requirements.txt | ||
- ./requirements/*.txt | ||
|
||
documentation: | ||
- docs/**/* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,3 @@ | ||
# -*- coding: utf-8 -*- | ||
#! python3 # noqa: E265 | ||
|
||
from .georezo_analyzer import Analizer # noqa: F401 | ||
from .georezo.georezo_analyzer import GeorezoOfferAnalizer # noqa: F401 |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,176 @@ | ||
#! python3 # noqa: E265 | ||
|
||
""" | ||
Module in charge of analyzing raw offers from GeoRezo: extracting contract type, | ||
place, etc. from title and abstract. | ||
""" | ||
|
||
|
||
# ########################################################################### | ||
# ######### Libraries ############# | ||
# ################################# | ||
|
||
# Standard library | ||
import logging | ||
import re | ||
|
||
# Django | ||
from django.db import IntegrityError | ||
|
||
# project modules | ||
from elgeopaso.jobs.models import ( | ||
Contract, | ||
GeorezoRSS, | ||
Offer, | ||
Place, | ||
Source, | ||
) | ||
from elgeopaso.utils import TextToolbelt | ||
|
||
from .parsers import ContentParser, TitleParser | ||
|
||
# ############################################################################## | ||
# ########## Globals ############### | ||
# ################################## | ||
|
||
# logs | ||
logger = logging.getLogger(__name__) | ||
|
||
# regex matching HTML tags and character entities | ||
_regex_markups = re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});") | ||
|
||
# shortcuts | ||
txt_toolbelt = TextToolbelt() | ||
|
||
# ############################################################################ | ||
# ########## Classes ############## | ||
# ################################# | ||
|
||
|
||
class GeorezoOfferAnalizer:
    """Analyze raw offers published on GeoRezo and store them as ``Offer`` rows.

    For each offer ID, the raw RSS entry is loaded, its title and content are
    stripped of HTML markups and parsed, then the matching ``Offer`` is either
    created or updated depending on the ``new`` flag.
    """

    def __init__(
        self,
        li_offers_ids: list,
        opt_contracts: bool = True,
        opt_places: bool = True,
        opt_technos: bool = True,
        opt_skills: bool = True,
        opt_words: bool = True,
        source="GEOREZO_RSS",
        new: bool = True,
    ):
        """Store the analysis parameters.

        The ``opt_*`` flags are only stored on the instance here; ``analisis``
        as written below does not read them.

        :param list li_offers_ids: IDs list of offers to process
        :param bool opt_contracts: parse or not contracts types
        :param bool opt_places: parse or not contracts places
        :param bool opt_technos: parse or not contracts technologies
        :param bool opt_skills: parse or not contracts jobs label
        :param bool opt_words: parse or not contracts words
        :param str source: name of the ``Source`` row attached to the offers
        :param bool new: create (True) or update (False) offers
        """
        # parameters
        self.offers_ids = li_offers_ids
        self.opt_contracts = opt_contracts
        self.opt_places = opt_places
        self.opt_technos = opt_technos
        self.opt_skills = opt_skills
        self.opt_words = opt_words
        self.source = source
        self.new = new
        logger.debug("Launching analisis on %s offers.", len(self.offers_ids))
        super().__init__()

    # MAIN METHOD ------------------------------------------------------------

    def analisis(self):
        """Parse every offer of ``self.offers_ids`` and create or update it.

        In creation mode (``self.new`` truthy), offers whose RSS ID is already
        stored are skipped. In update mode, offers which no longer exist are
        skipped and never created.

        Exceptions raised by the ``.get()`` lookups (raw offer, contract,
        source, place) are not caught here and propagate to the caller.
        """
        for offer_id in self.offers_ids:
            self.offer_id = offer_id

            # checks if offer has already been added; the flag is tested first
            # so that no query is issued in update mode
            if self.new and Offer.objects.filter(id_rss=offer_id).exists():
                logger.error("Offer RSS_ID already exists in DB: %s", offer_id)
                continue
            logger.debug("launch analisis on : %s", self.offer_id)

            # get raw offer from georezo_rss table
            raw_offer = GeorezoRSS.objects.get(id_rss=offer_id)

            # -- Title analisis ----------------------
            clean_title = txt_toolbelt.remove_html_markups(raw_offer.title)
            title_parser = TitleParser(offer_id=offer_id, input_title=clean_title)
            contract_type = title_parser.parse_contract_type()
            jobs_labels = title_parser.parse_jobs_positions()
            # NOTE(review): meaning of mode=0 is defined by TitleParser - confirm
            place = title_parser.parse_place(mode=0)

            # -- Content analisis ----------------------
            clean_content = txt_toolbelt.remove_html_markups(raw_offer.content)
            content_parser = ContentParser(
                offer_id=offer_id, input_content=clean_content
            )
            technos = content_parser.parse_technology()

            # add or update offer
            if self.new:
                clean_offer = Offer(
                    id_rss=offer_id,
                    raw_offer=raw_offer,
                    **self._offer_fields(
                        raw_offer, clean_title, clean_content, contract_type, place
                    )
                )
                try:
                    clean_offer.save()
                except IntegrityError as err_msg:
                    logger.error(
                        "Offer RSS_ID (%s) already exists in DB: %s",
                        offer_id,
                        err_msg,
                    )
                    continue
            else:
                existing_offers = Offer.objects.select_related().filter(
                    id_rss=offer_id
                )
                if not existing_offers.exists():
                    logger.info(
                        "Offer to update no longer exists and won't be created: %s",
                        offer_id,
                    )
                    continue
                # related rows are only looked up once the offer is known to exist
                existing_offers.update(
                    **self._offer_fields(
                        raw_offer, clean_title, clean_content, contract_type, place
                    )
                )
                # reload the instance: update() works on the queryset only
                clean_offer = Offer.objects.select_related().get(id_rss=offer_id)

            # associate ManyToMany relationships
            clean_offer.technologies.set(technos)
            clean_offer.jobs_positions.set(jobs_labels)
            logger.debug("Offer analyzed and inserted jobs.offer: %s", offer_id)

    # HELPERS ----------------------------------------------------------------

    def _offer_fields(
        self, raw_offer, clean_title: str, clean_content: str, contract_type, place
    ) -> dict:
        """Build the field values shared by offer creation and offer update.

        :param raw_offer: raw offer as loaded from ``GeorezoRSS``
        :param str clean_title: title without HTML markups
        :param str clean_content: content without HTML markups
        :param contract_type: value matched against ``Contract.abbrv``
        :param place: value matched against ``Place.name``

        :return: keyword arguments for ``Offer(...)`` / ``QuerySet.update(...)``
        :rtype: dict
        """
        return {
            "title": clean_title,
            "content": clean_content,
            "pub_date": raw_offer.pub_date,
            "contract": Contract.objects.get(abbrv=contract_type),
            "source": Source.objects.get(name=self.source),
            "place": Place.objects.get(name=place),
        }
|
||
|
||
# ############################################################################ | ||
# #### Stand alone program ######## | ||
# ################################# | ||
if __name__ == "__main__":
    # standalone execution: nothing is run when the module is called directly
    pass
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
#! python3 # noqa: E265 | ||
# -*- coding: utf-8 -*- | ||
|
||
from .content import ContentParser # noqa: F401 | ||
from .title import TitleParser # noqa: F401 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
#! python3 # noqa: E265 | ||
|
||
""" | ||
Content parser. | ||
""" | ||
|
||
|
||
# ########################################################################### | ||
# ######### Libraries ############# | ||
# ################################# | ||
|
||
# Standard library | ||
import logging | ||
import re | ||
|
||
# project modules | ||
from elgeopaso.jobs.models import ( | ||
Technology, | ||
TechnologyVariations, | ||
) | ||
|
||
from elgeopaso.utils import TextToolbelt | ||
|
||
# ############################################################################## | ||
# ########## Globals ############### | ||
# ################################## | ||
|
||
# logs | ||
logger = logging.getLogger(__name__) | ||
|
||
# regex matching HTML tags and character entities | ||
_regex_markups = re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});") | ||
|
||
# shortcuts | ||
txt_toolbelt = TextToolbelt() | ||
|
||
# ############################################################################ | ||
# ########## Classes ############## | ||
# ################################# | ||
|
||
|
||
class ContentParser:
    """Parse content of offers published on GeoRezo to extract informations.

    :param int offer_id: offer ID (for tracing purposes)
    :param str input_content: content to parse
    """

    def __init__(self, offer_id: int, input_content: str):
        """Store the parameters and tokenize the content once."""
        # parameters
        self.offer_id = offer_id
        self.input_content = input_content

        # tokenize content
        self.tokenized_content = txt_toolbelt.tokenize(self.input_content)

    # PARSERS ----------------------------------------------------------------
    def parse_technology(self) -> list:
        """Identify technologies in content.

        Each token is lowercased and matched against
        ``TechnologyVariations.label``; on a match, the ``Technology`` whose
        name equals the variation's ``name`` is added to the result. A
        technology is listed once per matching token, in token order.

        :return: matched ``Technology`` objects (possibly with duplicates)
        :rtype: list
        """
        technos_matched = []
        # lowercased token -> Technology (or None when no variation matches),
        # so that a repeated token does not trigger the same queries again
        resolved = {}

        # parse tokenized content
        for word in self.tokenized_content:
            label = word.lower()
            if label not in resolved:
                try:
                    variation = TechnologyVariations.objects.get(label=label)
                except TechnologyVariations.DoesNotExist:
                    resolved[label] = None
                else:
                    resolved[label] = Technology.objects.get(name=variation.name)

            techno = resolved[label]
            if techno is not None:
                technos_matched.append(techno)

        logger.debug("Technologies identified: %s", technos_matched)
        return technos_matched
|
||
|
||
# ############################################################################ | ||
# #### Stand alone program ######## | ||
# ################################# | ||
if __name__ == "__main__":
    # standalone execution: nothing is run when the module is called directly
    pass
Oops, something went wrong.