In [1]:
from flask import Flask, render_template, redirect, url_for
from flask_bootstrap import Bootstrap
from flask_wtf import FlaskForm
from wtforms import StringField, SubmitField
from wtforms.validators import DataRequired

from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import KMeans
import pickle

from fpdf import FPDF
from flask import send_file
import fpdf
import pandas as pd
pd.options.plotting.backend = "plotly"
import plotly as plt
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

k31_full = pickle.load(open('docker/k_31_full', 'rb'))
cluster_label_bigrams = pickle.load(open('docker/cluster_label_bigrams','rb')) 
cluster_importance = pickle.load(open('docker/cluster_importance', 'rb'))

df_cv_summary = pd.read_csv('df_cv_summary.csv',index_col=0)

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [2]:
cluster_labels_unformated = []
for label in cluster_label_bigrams:
    cluster_labels_unformated.append(label.replace("</b>","").replace("<b>",""))
cluster_labels_unformated

['team player, ability work, team members',
 'knowledge sql, relational databases, sql queries',
 'ability work, work independently, fast-paced environment',
 'years experience, related field, years relevant',
 'learn new, new technologies, eager learn',
 'track record, research projects, scientific technical',
 'machine learning, learning models, learning algorithms',
 'communication skills, excellent communication, skills excellent',
 'big data, experience working, experience big',
 'attention detail, natural language, social media',
 'experience working, software development, project management',
 'data sets, data analysis, data sources',
 'machine learning, learning techniques, machine learning,',
 'data science, data scientists, data scientist',
 'data visualization, visualization tools, experience data',
 'years experience, experience data, data science',
 'business stakeholders, work closely, internal external',
 'problem-solving skills, problem solving, strong analytical',
 'pr

In [3]:
file = open("docker/text/intro.txt")
intro = file.read()#.replace("\n", " ")

intro = intro.encode('latin-1', 'replace').decode('latin-1')
file.close()
print(intro)

file = open("text/plot_explanation.txt")
plot_explanation = file.read()
plot_explanation = plot_explanation.encode('latin-1', 'replace').decode('latin-1')
file.close()

file = open("text/per_skill_description.txt")
per_skill_description = file.read().replace("/n", " ").replace("\br", "\n")
per_skill_description = per_skill_description.encode('latin-1', 'replace').decode('latin-1')
file.close()

file = open("text/table_description.txt")
table_description = file.read().replace("/n", " ").replace("\br", "\n")
table_description = table_description.encode('latin-1', 'replace').decode('latin-1')
file.close()

Skill Scanner uses AI to compare skills:
  1.  Comparisson to Demand: your skills compared to employer demands.
  2.  Comparisson to Competition: your skills compared to representative CV's.
  3.  Recommendation for Education: course recommendations that complement your CV.
For an in depth explanation of our technique please refer to the last page.


In [4]:
def return_pdf(skills):
    analysis = get_plot(skills)
    plot = analysis[0]
    df = analysis[1]
    df_sim = analysis[2]
    cv_scores = analysis[3]
    
    plot.write_image("fig1.png", scale=1)#, width=500, height=750)
    
    table = get_table(cv_scores)
    table.write_image("table1.png")
    
    pdf=FPDF('P', 'mm', 'A4')
    pdf.add_page()
    
    pdf.set_font('Arial', 'B', 14) #setting font for title
    pdf.cell(40, 0, 'Skill Scanner CV Report - Data Scientist', ln=2) #Write Title
    pdf.set_font('Arial', '', 9) #setting font for text cells
    pdf.set_xy(10,15) #place cursor
    pdf.multi_cell(w=190, h=5, txt=intro, align='J')
    
    #Total Score Output
    total_score = df['score'].mean()
    total_score_s = str(round(total_score,2))
    competition_mean = 0.49
    competition_mean_s = "0.49"
    top10 = 0.72
    top10_s = "0.72"
    top25 = 0.67
    top25_s = "0.67"
    top50 = 0.61
    top50_s = "0.61"
    text = "Your total score is "+total_score_s+" This is "
    if total_score<competition_mean:
        text = text+"a low score in comparison to a dataset of 65 Data Scientist CV's. "
    elif total_score<top25:
        text = text+"an average score in comparison to a dataset of 65 Data Scientist CV's. "
        if(total_score<top50):
            text = text+"Please note that allthough you score is average, more than 50% of Data Scientist CV's score better than yours. "
    else:
        text = text+"a high score in comparison to a dataset of 65 Data Scientist CV's. "
    text = text+"The mean score among these data scientist CV's is "\
    +competition_mean_s+". The top 10% of these CV's scored "+top10_s \
    +". The top 25% of these CV's scored "+top25_s+". The top 50% of these CV's scored "+top50_s+"."
    
    text = text.encode('latin-1', 'replace').decode('latin-1')
    pdf.set_xy(pdf.get_x(), pdf.get_y()+5)
    pdf.set_font('Arial', 'B', 11) #setting font for title
    pdf.cell(40, 0, 'Your Score: '+total_score_s, ln=2) #Write Title
    pdf.set_font('Arial', '', 9)
    pdf.set_xy(pdf.get_x(), pdf.get_y()+5)
    pdf.multi_cell(w=190, h=5, txt=text)
    
    pdf.set_xy(pdf.get_x(), pdf.get_y()+5)
    pdf.set_font('Arial', 'B', 11) #setting font for title
    pdf.cell(40, 0, 'Comparrison Plot', ln=2) #Write Title
    pdf.set_font('Arial', '', 9)
    pdf.set_xy(pdf.get_x(), pdf.get_y()+5)
    pdf.multi_cell(w=190, h=5, txt=plot_explanation)
    
    pdf.image('fig1.png', w=200)#x = pdf.get_x, y = 15, w = 200)#, h = 200, type = '', link = '')
    
    pdf.add_page()
    pdf.set_font('Arial', 'B', 11) #setting font for title
    pdf.cell(40, 0, 'Analysis per input skill', ln=2) #Write Title
    pdf.set_font('Arial', '', 9)
    pdf.set_xy(pdf.get_x(), pdf.get_y()+5)
    pdf.multi_cell(w=190, h=5, txt=per_skill_description)
    pdf.set_xy(pdf.get_x(),pdf.get_y()+5)
    
    for index, row in df.iterrows():
        mean_score = df_cv_summary[df_cv_summary['cluster']==row['cluster']]['mean'].mean()
        mean_score_s = str(round(mean_score,2))
        score = round(row['score'],2)
        text = "Input Skill "+str(index+1)+":\nYour input skill \""+row['skill']+"\" was clustered in cluster "+str(row['cluster'])+\
        " which contains skills regarding "+cluster_label_bigrams[row['cluster']]+\
        ". Your score for this skill is "+str(score)+"."
        
        if score<mean_score:
            text = text + "This score is quite low, the average score among Data Scientist CV's is "+mean_score_s+" this may be due to a misclassification of our model but this could also indicate an opportunity to further clarify your CV."
        else:
            text = text + " this is above the average score among Data Scientist CV's which is "+mean_score_s
            
        text = text+"\n------------------------------------------------------------------\n"
        
        text = text.encode('latin-1', 'replace').decode('latin-1')
        pdf.multi_cell(w=190, h=5, txt = text)
    
    pdf.add_page()
    pdf.set_font('Arial', 'B', 11) #setting font for title
    pdf.cell(40, 0, 'Your score compared to Data Scientist CV\'s', ln=2) #Write Title
    pdf.set_font('Arial', '', 9)
    pdf.set_xy(pdf.get_x(), pdf.get_y()+5)
    pdf.multi_cell(w=190, h=5, txt=table_description)
    pdf.set_xy(pdf.get_x(),pdf.get_y()+5)
    pdf.image('table1.png',w=200)
    return pdf
skills = ['programming skills','computer science','experience as a software developer','using git and github','machine learning techniques']
pdf = return_pdf(skills)
pdf.output('report.pdf', 'F')

NameError: name 'get_plot' is not defined

In [5]:
def get_plot(cv): #takes in list of skills and returns a plot with score for each cluster
    df_sim = pd.DataFrame()
    df_sim['cluster'] = range(len(k31_full.cluster_centers_))
    
    labels = cluster_label_bigrams
    importance = cluster_importance
    
    model = 'all-distilroberta-v1'
    model = SentenceTransformer(model)
    embeddings_cv = model.encode(cv)
    embeddings_f = embeddings_cv.astype(float)
    clusters_cv = k31_full.predict(embeddings_f)
    clusters_cv_l  = clusters_cv.tolist()
    
    cv_scores = []
    
    for i, cluster in enumerate(clusters_cv):
        cv_scores.append(util.pytorch_cos_sim(k31_full.cluster_centers_[cluster], embeddings_f[i]).item())
    
    df_report = pd.DataFrame()
    df_report['skill']=cv
    df_report['cluster']=clusters_cv_l
    df_report['score']=cv_scores
    
    scores = []
    for cluster in range(len(k31_full.cluster_centers_)):
        if cluster not in clusters_cv_l:
            scores.append(0)
        else:
            score = 0
            indexes = np.where(clusters_cv==cluster)[0]
            for i in indexes:
                if cv_scores[i] > score:
                    score = cv_scores[i]
            scores.append(score)   
    
    df_sim['score'] = scores
    df_sim['importance'] = importance
    df_sim['labels'] = labels
    
    df_sim['Your Coverage'] = df_sim['importance']*df_sim['score']
    df_sim['Skill Group Importance'] = df_sim['importance']-df_sim['Your Coverage']
    df_sim = df_sim.sort_values('importance')
    
    fig = px.bar(df_sim, y='labels', x=["Your Coverage","Skill Group Importance"], hover_data = ['importance'])
    fig.update_layout(height=3*300, width=3*300, \
                          #font=dict(size=10),\
                          title = 'Input CV Similarity to Requirements in Job Postings',\
                          barmode='stack', \
                          legend_title_text = '', \
                          yaxis_title="Importance",\
                          xaxis_title="Coverage",\
                          legend=dict(yanchor="bottom",y=0,xanchor="right",x=1
                        
))
    return fig, df_report, df_sim, scores

In [56]:
obj=get_plot(['python','R'])
obj[0].show()

In [7]:
obj[0].update_layout(height=3*200, width=3*300, \
                          #font=dict(size=10),\
                          title = 'Input CV Similarity to Requirements in Job Postings<br>',\
                          barmode='stack', \
                          legend_title_text = '', \
                        
                          legend=dict(yanchor="bottom",y=0,xanchor="right",x=1
                                     ))
obj[0].show()

In [8]:
pdf=FPDF('P', 'mm', 'A4')
help(pdf.image)

Help on method image in module fpdf.fpdf:

image(name, x=None, y=None, w=0, h=0, type='', link='') method of fpdf.fpdf.FPDF instance
    Put an image on the page



In [9]:
def get_table(scores):
    scores = [round(num, 2) for num in scores]
    
    labels = cluster_label_bigrams
    
    mean = df_cv_summary['mean'].tolist()
    mean = [round(num, 2) for num in mean]
    
    top10 = df_cv_summary['top10'].tolist()
    top10 = [round(num, 2) for num in top10]
    
    top25 = df_cv_summary['top25'].tolist()
    top25 = [round(num, 2) for num in top25]
    
    top50 = df_cv_summary['top50'].tolist()
    top50 = [round(num, 2) for num in top50]
    
    fill_color = []
    for score in scores:
        if score<=top50[scores.index(score)]:#less then average
            fill_color.append('#ffcccc')
        elif score<=top25[scores.index(score)]:
            fill_color.append('#fff5e6')
        elif score<=top10[scores.index(score)]:
            fill_color.append('#e6ffb3')                 
        else:
            fill_color.append('#ccffcc')
    
    print(fill_color)
    fig = go.Figure(data=[go.Table(
        columnwidth = [1100,100],
        header=dict(values=['Skill Cluster','Your Score','Average','Top 10%','Top 25%', 'Top 50%']),
        
        cells=dict(values=[labels, scores,mean,top10,top25,top50],
              fill = dict(color=['rgb(245, 245, 255)',fill_color])
                  )
    )])
    fig.update_layout(height = 3.4*300, width = 3*300,margin=go.layout.Margin(l=0, r=0,b=0, t=0 ))
    
    return(fig)
get_table(range(31))

['#ffcccc', '#ccffcc', '#ccffcc', '#ccffcc', '#ccffcc', '#ccffcc', '#ccffcc', '#ccffcc', '#ccffcc', '#ccffcc', '#ccffcc', '#ccffcc', '#ccffcc', '#ccffcc', '#ccffcc', '#ccffcc', '#ccffcc', '#ccffcc', '#ccffcc', '#ccffcc', '#ccffcc', '#ccffcc', '#ccffcc', '#ccffcc', '#ccffcc', '#ccffcc', '#ccffcc', '#ccffcc', '#ccffcc', '#ccffcc', '#ccffcc']


In [10]:
def get_overview(score):
    
    fill_color = []
    n = 5

    vals = [round(score, 2),72,67,61,49]
    
    fill_color = []
    for v in vals:
        if v<=vals[4]:#less then average
            fill_color.append('#ffcccc')
        elif v<vals[3]:#less than top50
            fill_color.append('#fff5e6')
        elif v<vals[2]:#less than top25
            fill_color.append('#e6ffb3')
        else:
            fill_color.append('#ccffcc')
        
    vals = ['<b>'+str(round(score, 2))+'</b>',49,72,67,61]    
    
    
    fig = go.Figure(data=[go.Table(
        columnwidth = [100,100],
        header=dict(values=['<b>Your CV Coverage [%]</b>','Data Scientist<br>Top 10% [%]','Data Scientist <br>Top 25% [%]', 'Data Scientist<br>Top 50% [%]','Data Scientist<br>Average [%]']),
        cells=dict(values=vals, fill_color = fill_color)
    )])
    
    
    fig.update_layout(height = 1*300, width = 3*300,margin=go.layout.Margin(l=10, r=10,b=0, t=0 )
    )
    
    return(fig)
get_overview(70)

In [166]:
def get_overview():
    
    vals = [49,61,67]
    
    relative_vals = [49,12,6,33]
    
        
    y=[1,1,1,1]

    color=['#ffcccc','#fff5e6','#e6ffb3','#ccffcc']
    
    
    fig = px.bar(x=relative_vals,y=y, orientation = 'h', color = color, color_discrete_sequence=['#ffcccc','#fff5e6','#e6ffb3','#ccffcc'])
    
    fig.update_layout(barmode="stack", \
                      plot_bgcolor = 'rgba(0,0,0,0)', \
                      paper_bgcolor = 'rgba(0,0,0,0)', \
                      showlegend=False,\
                      yaxis_title="",\
                      xaxis_title="",         
                      margin=dict(l=0,r=0,b=0,t=0),
                      height=200
                     )
    fig.update_yaxes(visible=False, showticklabels=False)
    fig.update_xaxes(visible=False, showticklabels=False)

    fig.add_annotation(x=30,y=-0.1,
    text="Your Score "+str(30),
    showarrow=False)
    
    fig.add_annotation(x=30,y=1,
    text="────────────",
    showarrow=False,
    textangle=-90)
    
    for val in vals:
        fig.add_annotation(x=vals[vals.index(val)],#y=0,
        font=dict(size=25),
        text = str(val),
        yshift = 0,                   
        showarrow=True,
        arrowhead=1)
    
    fig.add_annotation(x=vals[0], y=1,
            text="Needs Improvement",
            showarrow=False,
            xshift=-100)
    fig.add_annotation(x=vals[1], y=1,
            text="Fair",
            showarrow=False,
            xshift=-80)
    fig.add_annotation(x=vals[2], y=1,
            text="Good",
            showarrow=False,
            xshift=-30)
    fig.add_annotation(x=vals[2], y=1,
            text="Excellent",
            showarrow=False,
            xshift=50)
    return(fig)
get_overview()

In [16]:
df_handbook = pd.read_csv('df_handbook_DS_60.csv')

In [17]:
df_handbook.head()

Unnamed: 0.1,Unnamed: 0,objective,module,study_program,cluster
0,0,understand the fundamental building blocks of ...,Advanced Statistics,DS_60,12
1,1,analyze stochastic data in terms of the underl...,Advanced Statistics,DS_60,11
2,2,utilize Bayesian statistics techniques.,Advanced Statistics,DS_60,12
3,3,summarize the properties of observed data usin...,Advanced Statistics,DS_60,11
4,4,apply data visualization techniques to design ...,Advanced Statistics,DS_60,14


In [18]:
objectives = df_handbook['objective'].tolist()
model = 'all-distilroberta-v1'
model = SentenceTransformer(model)

embeddings_o = model.encode(objectives)
embeddings_f = embeddings_o.astype(float)
clusters = df_handbook['cluster'].tolist()

objective_scores = []
for i, cluster in enumerate(clusters):
    objective_scores.append(util.pytorch_cos_sim(k31_full.cluster_centers_[cluster], embeddings_f[i]).item())

In [19]:
df_handbook['score']=objective_scores

In [20]:
df_handbook

Unnamed: 0.1,Unnamed: 0,objective,module,study_program,cluster,score
0,0,understand the fundamental building blocks of ...,Advanced Statistics,DS_60,12,0.523433
1,1,analyze stochastic data in terms of the underl...,Advanced Statistics,DS_60,11,0.4412
2,2,utilize Bayesian statistics techniques.,Advanced Statistics,DS_60,12,0.517667
3,3,summarize the properties of observed data usin...,Advanced Statistics,DS_60,11,0.497379
4,4,apply data visualization techniques to design ...,Advanced Statistics,DS_60,14,0.727068
5,5,evaluate model parameters using parameter esti...,Advanced Statistics,DS_60,19,0.472755
6,6,create hypothesis tests to discriminate betwee...,Advanced Statistics,DS_60,19,0.357829
7,7,analyze use cases and their requirements regar...,Use Case and Evaluation,DS_60,25,0.594256
8,8,apply common metrics to evaluate predictions.,Use Case and Evaluation,DS_60,23,0.469091
9,9,evaluate key performance indicators to asses p...,Use Case and Evaluation,DS_60,25,0.606785


In [21]:
df_handbook.to_csv('docker/handbook.csv')

In [22]:
df_cv = obj[2].sort_values(['Skill Group Importance'], ascending=False)
recommendations = []
recommendation_scores=[]
recommendation_labels = []

for index, row in df_cv.iterrows():
    if row['score']==0:
        df_handbook_filtered = df_handbook[df_handbook['cluster']==row['cluster']]
        if len(df_handbook_filtered)>0 and df_handbook_filtered['score'].max()>0.5:
            recommendation = df_handbook_filtered['score'].argmax()
            recommendation = df_handbook_filtered['module'].iloc[recommendation]
            recommendation_label = "Recommended Module: <b>"+recommendation+'</b><br>Skill Group: '+row['labels']
            
            recommendations.append(recommendation)
            recommendation_scores.append(df_handbook_filtered['score'].max())
            recommendation_labels.append(recommendation_label)
        else:
            recommendations.append(None)
            recommendation_scores.append(0)
            recommendation_labels.append(None)
    else:
        recommendations.append(None)
        recommendation_scores.append(0)
        recommendation_labels.append(None)
df_cv['recommendation']=recommendations
df_cv['recommendation_score']=recommendation_scores
df_cv['recommendation_label']=recommendation_labels
df_cv['coverage of recommendation'] = df_cv['recommendation_score']*df_cv['Skill Group Importance']
df_cv['Recommendation Importance']=df_cv['Skill Group Importance']-df_cv['coverage of recommendation']
df_cv=df_cv[df_cv['recommendation_score']>0]
df_cv=df_cv.sort_values('Skill Group Importance',ascending=False)

In [55]:
fig = px.bar(df_cv.head(5), y='recommendation_label', x=["coverage of recommendation","Recommendation Importance"])
fig.update_layout(height=2*300, width=3*300, \
                     #font=dict(size=10),\
                     title = 'Study Program Module Recommendations',\
                     barmode='stack', \
                     legend_title_text = '', \
                     #ylabel('Y Label','FontSize',20,'Color','r','Position',[-10 50] )
                     yaxis_title="Importance ───>1  ──────»2"+' \u2192 3'+' \u21FE 4',\
                     xaxis_title="<- Coverage ->",\
                     legend=dict(yanchor="bottom",y=0,xanchor="right",x=1))

fig.show()

In [24]:
def get_recommendation(df_cv):
    recommendations = []
    recommendation_scores=[]
    recommendation_labels = []

    for index, row in df_cv.iterrows():
        if row['score']==0:
            df_handbook_filtered = df_handbook[df_handbook['cluster']==row['cluster']]
            if len(df_handbook_filtered)>0 and df_handbook_filtered['score'].max()>0.5:
                recommendation = df_handbook_filtered['score'].argmax()
                recommendation = df_handbook_filtered['module'].iloc[recommendation]
                recommendation_label = "Recommended Module: <b>"+recommendation+'</b><br>Skill Group: '+row['labels']
                
                recommendations.append(recommendation)
                recommendation_scores.append(df_handbook_filtered['score'].max())
                recommendation_labels.append(recommendation_label)
            else:
                recommendations.append(None)
                recommendation_scores.append(0)
                recommendation_labels.append(None)
        else:
            recommendations.append(None)
            recommendation_scores.append(0)
            recommendation_labels.append(None)
    df_cv['recommendation']=recommendations
    df_cv['recommendation_score']=recommendation_scores
    df_cv['recommendation_label']=recommendation_labels
    df_cv['coverage of recommendation'] = df_cv['recommendation_score']*df_cv['Skill Group Importance']
    df_cv['Recommendation Importance']=df_cv['Skill Group Importance']-df_cv['coverage of recommendation']
    df_cv=df_cv[df_cv['recommendation_score']>0]
    df_cv=df_cv.sort_values('Skill Group Importance',ascending=False)
            
    fig = px.bar(df_cv.head(5), y='recommendation_label', x=["coverage of recommendation","Recommendation Importance"])
    fig.update_layout(height=2*300, width=3*300, \
                         #font=dict(size=10),\
                         title = 'Study Program Module Recommendations',\
                         barmode='stack', \
                         legend_title_text = '', \
                         yaxis_title="<- Importance ->",\
                         xaxis_title="<- Coverage ->",\
                         legend=dict(yanchor="bottom",y=0,xanchor="right",x=1))
    return fig

In [25]:
df_test = obj[2]
get_recommendation(df_test)