In [1]:
import psycopg2
import pandas as pd
from App import get_skills_cv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
from datetime import datetime, timedelta
start_date = datetime.now() - timedelta(days=1)

In [4]:
def connect():
    try:
        conn = psycopg2.connect("host=127.0.0.1 dbname=postgres user=postgres password=huser")
        print('DB connected successfully')
    except Exception as e:
        print(f"Error connecting to DB: {e}")
        raise
    return conn

In [3]:
def get_data():
    try:
        conn = connect()
        cursor = conn.cursor()

        fetch_data_sql = "SELECT job_link, job_name, job_text, job_company, job_location, job_type, job_date, skills FROM jobs_data;"

        try:
            cursor.execute(fetch_data_sql)
            rows = cursor.fetchall()
            colnames = [desc[0] for desc in cursor.description]
            df = pd.DataFrame(rows, columns=colnames)
            df.drop_duplicates(subset=['skills'],ignore_index=True)

            return df

        except Exception as ex:
            print(f"An error occurred while fetching data: {ex}")

        finally:
            cursor.close()
            conn.close()

    except Exception as e:
        print(f"Error connecting to the database: {e}")

In [4]:
df = get_data()

DB connected successfully


In [5]:
df.shape

(6662, 8)

In [7]:
cv_skills = get_skills_cv("cv.pdf")

In [10]:
def recommend_jobs(df, skills_cv, top_n=100):
    # Prepare the lists of skills
    job_skills_list = df['skills'].tolist()
    skills_cv_list = [skills_cv]
    all_skills = skills_cv_list + job_skills_list

    # Vectorize the skills
    vectorizer = TfidfVectorizer()
    vectorizer.fit(all_skills)
    cv_vector = vectorizer.transform(skills_cv_list)
    job_vectors = vectorizer.transform(job_skills_list)

    # Compute similarity scores
    similarity_scores = cosine_similarity(cv_vector, job_vectors).flatten()
    
    # Get the top N job indices based on similarity scores
    top_indices = similarity_scores.argsort()[-top_n:][::-1]
    
    # Extract the recommended job names and their corresponding scores
    recommendations = df.iloc[top_indices][['job_link','job_name', 'skills']].copy()
    recommendations['score'] = similarity_scores[top_indices]
    
    return recommendations

In [12]:
top_jobs = recommend_jobs(df, cv_skills, top_n=10000)
top_jobs

Unnamed: 0,job_link,job_name,skills,score
1420,https://fr.indeed.com//rc/clk?jk=174c1b0143b42...,Data Scientist - Nantes,nlp business deep learning cloud machine learn...,0.298842
5906,https://www.glassdoor.fr/job-listing/data-scie...,Data Scientist - Toulouse,"nlp, python, azure, scala, cloud, machine lear...",0.298842
912,https://fr.indeed.com//rc/clk?jk=1b89d368ed7a9...,Consultant d'Application - H/F - Paris - Issy-...,azure deep learning spark gcp nlp python aws d...,0.298842
5783,https://fr.indeed.com//rc/clk?jk=8a0e220e7630c...,Data Scientist - Toulouse,"nlp, python, azure, scala, cloud, machine lear...",0.298842
501,https://fr.indeed.com//rc/clk?jk=d7a4a0e23bf20...,Data Scientist - Nantes,azure deep learning spark gcp nlp python aws d...,0.298842
...,...,...,...,...
3034,https://fr.linkedin.com/jobs/view/data-analyst...,Data Analyst H/F,"google, business, powerbi, tableau, sql",0.000000
3033,https://fr.linkedin.com/jobs/view/consultant-b...,Consultant Bi &Analytics Confirmé - Senior H/F,"design, ml, business, powerbi, databricks, com...",0.000000
3031,https://fr.linkedin.com/jobs/view/analyste-de-...,Analyste de Données H/F,"collaboration, flux",0.000000
3030,https://fr.linkedin.com/jobs/view/consultant-p...,Consultant Power Bi H/F,power bi,0.000000


In [8]:
from sentence_transformers import SentenceTransformer, util



In [29]:
def calculate_similarity(cv_skills, job_skills,model):
  
  # Obtenir les embeddings pour les deux textes
  embedding1 = model.encode(cv_skills, convert_to_tensor=True)
  embedding2 = model.encode(job_skills, convert_to_tensor=True)

  # Calculer la similarité cosinus entre les embeddings
  cosine_score = util.pytorch_cos_sim(embedding1, embedding2)

  # Convertir le tenseur en nombre flottant simple pour obtenir un score compris entre -1 et 1
  similarity_score = cosine_score.item()

  # Convertir le score en un pourcentage (score entre 0 et 100)
  similarity_percentage = similarity_score  * 100

  # Afficher le résultat
  return similarity_percentage



def rank_jobs(path, df):
    ranked_cvs = []
    cv_skill = parser.get_skills_cv(path)
    model = SentenceTransformer('duongttr/job-candidiate-matching-sentbert') #good one
    job_skills = df['skills']
    for job_skill in job_skills:
        if len(job_skill)>=15:
          similarity = calculate_similarity(cv_skill, job_skill,model)
          ranked_cvs.append((job_skill,similarity))
    ranked_cvs = sorted(ranked_cvs, key=lambda x: x[1], reverse=True)
    return ranked_cvs

In [36]:
df_skill = df.head(100)
df_skill = df.drop_duplicates(subset=['skills'])

In [37]:
rank_jobs('cv.pdf',df_skill)

Extracted Skills Section:




[('engineering machine learning chef deep learning algorithms collaboration framework image processing python data science natural language processing computer science reinforcement learning software computer vision documentation languages tensorflow nlp security comprehension pytorch transformers design interaction',
  99.21997785568237),
 ('machine learning, spark, data scientist, deployment, sql, data processing, docker, deep learning, ai, statistics, pytorch, gitlab, airflow, collaboration, computer science, cuda, engineering, nlp, business, python',
  98.50357174873352),
 ('machine learning monitoring python', 97.69482612609863),
 ('machine learning, data science, simulation, mobile, vue',
  97.63604402542114),
 ('machine learning, business, data science, analytics, r, power bi, tableau, sql',
  97.29138612747192),
 ('cloud, data integration, aws, python, spark, big data, sql',
  97.17837572097778),
 ('cloud, data processing, docker, computer science, data science, python, azure, 

In [25]:
rank_jobs('cv.pdf',df_skill)

Extracted Skills Section:


[('business sql', 83.03248286247253),
 ('statistiques github python databricks monitoring documentation git machine learning',
  82.78393745422363),
 ('big data json data quality mode flux crm business support cloud',
  80.60728311538696),
 ('business support security', 76.48777961730957),
 ('azure sql spark databricks server bigquery flux power bi support devops cloud dataviz',
  75.1282811164856),
 ('temps réel', 74.99754428863525),
 ('mobile software jira game development support engineering',
  73.54434728622437),
 ('javascript sql support java', 72.89952039718628),
 ('jenkins collaboration gitlab documentation java', 72.87307381629944),
 ('docker postgresql python documentation kubernetes django',
  71.31582498550415)]

In [38]:
df_skill = df.head(100)
df_skill = df_skill.drop_duplicates(subset=['skills'])
df_skill.shape

(74, 8)

In [39]:
rank_jobs('cv.pdf',df_skill)

Extracted Skills Section:




[('azure big data data engineering data analytics business intelligence power bi business support design dataviz engineering',
  95.80404758453369),
 ('computer vision ai deep learning c++ algorithms electron python engineering artificial intelligence computer science business signal processing support image processing design data acquisition',
  95.48447132110596),
 ('windows collaboration data engineering spark data engineer finance python aws jira s3 mode scala business airflow cloud github',
  91.02325439453125),
 ('data scientist natural language languages library documentation data quality data analysis analytics security cloud machine learning llm',
  89.00728225708008),
 ('data scientist natural language ai languages documentation data quality data analysis analytics security cloud machine learning llm',
  88.95745277404785),
 ('c++ statistiques kafka spark hadoop data engineer databricks microsoft azure scala git devops cloud impala',
  87.95238733291626),
 ('data scientist ml

In [None]:
# # import tempfile
# # import pandas as pd
# # import streamlit as st
# # from App import recommend_jobs, read_data, job_by_source, job_by_type,job_type_by_source,most_offered_company,top_skills

# # st.set_page_config(page_title="Job Recommendation App", layout="wide")

# # # Sidebar navigation
# # # nav_options = {
# # #     "Accueil": "accueil",
# # #     "Job recommender": "recommender",
# # #     "Dashboard" : "dashboard"
# # # }

# # st.sidebar.header("Navigation")

# # # Navigation options with icons
# # nav_options = {
# #     "Accueil": "🏠",
# #     "Job Recommender": "🔍",
# #     "Dashboard": "📊"
# # }

# # # Add radio button for navigation
# # nav_selection = st.sidebar.radio(
# #     "Application",
# #     options=list(nav_options.keys()),
# #     format_func=lambda x: f"{nav_options[x]} {x}"
# # )



# # # Main function for the 'Accueil' page
# # def accueil():
# #     st.title("Welcome")
# # def recommender():
# #     st.title("Job Recommender System")

# #     col1, _ = st.columns(2)

# #     with col1:
# #         uploaded_file = st.file_uploader("Choose a CV (PDF only)", type="pdf")

# #     if uploaded_file is not None:
# #         if st.button('Match'):
# #             with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
# #                 tmp_file.write(uploaded_file.read())
# #                 pdf_path = tmp_file.name

# #             st.subheader("Best jobs found")
# #             df = recommend_jobs(pdf_path, 10)

# #             # Creating a card-like layout for job recommendations
# #             num_columns = 3  # Adjust this number to change how many cards per row
# #             for i in range(0, len(df), num_columns):
# #                 cols = st.columns(num_columns)
# #                 for col, (_, row) in zip(cols, df.iloc[i:i + num_columns].iterrows()):
# #                     with col:
# #                         st.markdown(
# #                             f"""
# #                             <div style="border: 1px solid #e6e6e6; 
# #                                         border-radius: 5px; padding: 15px; 
# #                                         margin-bottom: 10px; 
# #                                         box-shadow: 5px 8px 15px rgba(0,0,0,0.4);">
# #                                 <h4 style="color:#FF4500">{row['job_name']}</h4>
# #                                 <p><b>Location:</b> {row['job_location']}</p>
# #                                 <p>{row['job_text'][:150]}...</p>
# #                                 <a href="{row['job_link']}">Read more</a>
# #                             </div>
# #                             """, unsafe_allow_html=True
# #                         )

# # df = read_data()

# # def dashbord_page():
# #     st.title("Dashbord")

# #     col1, col2 = st.columns(2)

# #     with col1:
# #         fig = job_by_source(df)
# #         st.plotly_chart(fig)
        
# #     with col2:
# #         fig1 = most_offered_company(df)
# #         st.plotly_chart(fig1)
        
# #     with col1:
# #         fig2 = job_by_type(df)
# #         st.plotly_chart(fig2)
        
# #     with col2:
# #         fig3 = job_type_by_source(df)
# #         st.plotly_chart(fig3)

# #     _,col3,_ = st.columns([1,2,1])

# #     with col3:
# #         fig4 = top_skills(df,'skills')
# #         st.plotly_chart(fig4)
        

# # # # Mapping selections to functions
# # # if nav_options[nav_selection] == "accueil":
# # #     accueil()

# # # if nav_options[nav_selection] == "recommender":
# # #     recommender()

# # # if nav_options[nav_selection] == "dashboard":
# # #     dashbord_page()


# import tempfile
# import pandas as pd
# import streamlit as st
# from App import (recommend_jobs, read_data, job_by_source, job_by_type, 
#                  job_type_by_source, most_offered_company, top_skills)
# import psycopg2
# from datetime import datetime, timedelta

# st.set_page_config(page_title="Job Recommendation App", layout="wide")

# # Sidebar navigation with icons
# st.sidebar.header("Navigation")

# # Navigation options with icons
# nav_options = {
#     "Accueil": "🏠",
#     "Job Recommender": "🔍",
#     "Dashboard": "📊"
# }

# nav_selection = st.sidebar.radio(
#     "Select Page",
#     options=list(nav_options.keys()),
#     format_func=lambda x: f"{nav_options[x]} {x}"
# )


# time_filter = st.sidebar.radio(
#     'Select Time Period',
#     options=['All Time', 'Last Day', 'Last Week', 'Last Month']
# )

# # Main function for the 'Accueil' page
# def accueil():
#     st.title("Welcome")

# def recommender():
#     st.title("Job Recommender System")

#     col1, _ = st.columns(2)

#     with col1:
#         uploaded_file = st.file_uploader("Choose a CV (PDF only)", type="pdf")

#     if uploaded_file is not None:
#         if st.button('Match'):
#             with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
#                 tmp_file.write(uploaded_file.read())
#                 pdf_path = tmp_file.name

#             st.subheader("Best jobs found")
#             df = recommend_jobs(pdf_path, 10)

#             # Creating a card-like layout for job recommendations
#             num_columns = 3  # Adjust this number to change how many cards per row
#             for i in range(0, len(df), num_columns):
#                 cols = st.columns(num_columns)
#                 for col, (_, row) in zip(cols, df.iloc[i:i + num_columns].iterrows()):
#                     with col:
#                         st.markdown(
#                             f"""
#                             <div style="border: 1px solid #e6e6e6; 
#                                         border-radius: 5px; padding: 15px; 
#                                         margin-bottom: 10px; 
#                                         box-shadow: 5px 8px 15px rgba(0,0,0,0.4);">
#                                 <h4 style="color:#FF4500">{row['job_name']}</h4>
#                                 <p><b>Location:</b> {row['job_location']}</p>
#                                 <p>{row['job_text'][:150]}...</p>
#                                 <a href="{row['job_link']}">Read more</a>
#                             </div>
#                             """, unsafe_allow_html=True
#                         )

# # Filtering function based on selected time period
# def filter_data_by_time(df, period):
#     if period == 'Last Day':
#         start_date = datetime.now() - timedelta(days=1)
#     elif period == 'Last Week':
#         start_date = datetime.now() - timedelta(weeks=1)
#     elif period == 'Last Month':
#         start_date = datetime.now() - timedelta(30)
#     else:
#         return df  # No filtering for 'All Time'

#     df['job_date'] = pd.to_datetime(df['job_date'])
#     return df[df['job_date'] >= start_date]

# def dashbord_page():
#     st.title("Dashboard")

#     df = read_data()

#     # Apply time filter
#     df = filter_data_by_time(df, time_filter)

#     col1, col2 = st.columns(2)

#     with col1:
#         fig = job_by_source(df)
#         st.plotly_chart(fig)
        
#     with col2:
#         fig1 = most_offered_company(df)
#         st.plotly_chart(fig1)
        
#     with col1:
#         fig2 = job_by_type(df)
#         st.plotly_chart(fig2)
        
#     with col2:
#         fig3 = job_type_by_source(df)
#         st.plotly_chart(fig3)

#     _, col3, _ = st.columns([1, 2, 1])

#     with col3:
#         fig4 = top_skills(df, 'skills')
#         st.plotly_chart(fig4)

# # Display the selected page
# if nav_selection == "Accueil":
#     accueil()
# elif nav_selection == "Job Recommender":
#     recommender()
# elif nav_selection == "Dashboard":
#     dashbord_page()

# def connect():
#     try:
#         conn = psycopg2.connect("host=127.0.0.1 dbname=postgres user=postgres password=huser")
#         print('DB connected successfully')
#     except Exception as e:
#         print(f"Error connecting to DB: {e}")
#         raise
#     return conn

# def get_data():
#     try:
#         conn = connect()
#         cursor = conn.cursor()

#         fetch_data_sql = "SELECT job_link, job_name, job_text, job_company, job_location, job_type, job_date, skills FROM jobs_data;"

#         try:
#             cursor.execute(fetch_data_sql)
#             rows = cursor.fetchall()
#             colnames = [desc[0] for desc in cursor.description]
#             df = pd.DataFrame(rows, columns=colnames)
#             df.drop_duplicates(subset=['skills'], inplace=True)
#             return df

#         except Exception as ex:
#             print(f"An error occurred while fetching data: {ex}")

#         finally:
#             cursor.close()
#             conn.close()

#     except Exception as e:
#         print(f"Error connecting to the database: {e}") 