# Import Library

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib
import pickle

Kode di atas bertujuan untuk melakukan import library, seperti pandas untuk membuat Dataframe, numpy untuk melakukan komputasi numerik, joblib, dan pickle untuk menyimpan model. Selan itu juga terdapat beberapa model dari library scikit-learn seperti TF-IDF untuk vectorizer dan cosine similarity untuk menghitung skor rekomendasi.

# Data Loading

In [2]:
df_people = pd.read_csv('dataset/01_people.csv')
df_people

Unnamed: 0,person_id,name,email,phone,linkedin
0,1,Database Administrator,,,
1,2,Database Administrator,,,
2,3,Oracle Database Administrator,,,
3,4,Amazon Redshift Administrator and ETL Develope...,,,
4,5,Scrum Master Scrum Master Scrum Master,,,
...,...,...,...,...,...
54928,54929,Lead Python Developer,,,
54929,54930,Full Stack Python Developer,,,
54930,54931,Eli Lilly,,,
54931,54932,Python Developer,,,


In [3]:
df_ability = pd.read_csv('dataset/02_abilities.csv')
df_ability

Unnamed: 0,person_id,ability
0,1,Installation and Building Server
1,1,Running Backups
2,1,Recovering and Restoring Models
3,1,Support various MS SQL Server
4,1,MS SQL Server 2005/2008
...,...,...
1219468,54933,Developing Controllers using django framework
1219469,54933,Designed APIs to retrieve the list of new tasks
1219470,54933,Creating a Data models for the app
1219471,54933,Developing the static pages using Bootstrap an...


In [4]:
df_education = pd.read_csv('dataset/03_education.csv')
df_education

Unnamed: 0,person_id,institution,program,start_date,location
0,1,Lead City University,Bachelor of Science,07/2013,
1,2,lagos state university,bsc in computer science,,"Lagos, GU"
2,3,"JNTU - Kakinada, Andhra Pradesh",Master of Computer Applications in Science and...,2013,"Kakinada, Andhra Pradesh"
3,4,University of Informatics,Bachelor in Computer Science,06/07,June 2007
4,5,Virginia Commomwealth University,,08/2013,"Richmond, VA"
...,...,...,...,...,...
75994,54923,Wilmington University,Master's in Information technology,2017,
75995,54924,JNT University,Bachelor's Degree in Computer Science,,
75996,54925,JNTU,Masters in Electrical and Electronics Engineering,,
75997,54927,Andhra University,Bachelor of Science in Computers,,


In [5]:
df_experience = pd.read_csv('dataset/04_experience.csv')
df_experience

Unnamed: 0,person_id,title,firm,start_date,end_date,location
0,1,Database Administrator,Family Private Care LLC,04/2017,Present,"Roswell, GA"
1,1,Database Administrator,Incomm,01/2014,02/2017,"Alpharetta, GA"
2,2,Database Administrator,Intercontinental Registry,12/2008,08/2011,"Lagos, GU"
3,3,Oracle Database Administrator,Cognizant,06/2016,Present,"Hyderabad, Telangana"
4,3,Oracle Database Administrator,Convergys,06/2014,06/2016,"Hyderabad, Telangana"
...,...,...,...,...,...,...
265399,54932,Python Developer,Hexaware Technologies Limited,07/2010,10/2011,
265400,54932,Software Developer,Vision InfoTech Pvt Ltd,03/2009,06/2010,
265401,54933,MetroBikes,MetroBikes,09/2018,Present,
265402,54933,Python/Flask Developer,TechJini Solutions Pvt. Ltd,04/2017,07/2018,"Bangalore, KARNATAKA, IN"


In [6]:
df_person_skills = pd.read_csv('dataset/05_person_skills.csv')
df_person_skills

Unnamed: 0,person_id,skill
0,1,Database administration
1,1,Database
2,1,Ms sql server
3,1,Ms sql server 2005
4,1,Sql server
...,...,...
2483371,54933,Designed APIs to retrieve the list of new tasks
2483372,54933,Creating a Data models for the app
2483373,54933,Developing the static pages using Bootstrap an...
2483374,54933,Integrating modules and performing unit testing


In [7]:
df_skills = pd.read_csv('dataset/06_skills.csv')
df_skills

Unnamed: 0,skill
0,Mongo DB-3.2
1,JNDI LDAP
2,Stored Procedures
3,Perform ad-hoc analysis
4,Monitored and resolved flight crew legality is...
...,...
226755,Retention through VE-135 (6 mos after graduati...
226756,Remedy Service Management Tool
226757,Offshore Management
226758,virus and malware removal


# Data Understanding

In [8]:
df_people.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54933 entries, 0 to 54932
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   person_id  54933 non-null  int64 
 1   name       54819 non-null  object
 2   email      1593 non-null   object
 3   phone      1833 non-null   object
 4   linkedin   8538 non-null   object
dtypes: int64(1), object(4)
memory usage: 2.1+ MB


In [9]:
df_people.isnull().sum()

person_id        0
name           114
email        53340
phone        53100
linkedin     46395
dtype: int64

In [10]:
df_people.duplicated().sum()

np.int64(0)

In [11]:
df_ability.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1219473 entries, 0 to 1219472
Data columns (total 2 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   person_id  1219473 non-null  int64 
 1   ability    1219473 non-null  object
dtypes: int64(1), object(1)
memory usage: 18.6+ MB


In [12]:
df_ability.isnull().sum()

person_id    0
ability      0
dtype: int64

In [13]:
df_ability.duplicated().sum()

np.int64(34656)

In [14]:
df_education.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75999 entries, 0 to 75998
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   person_id    75999 non-null  int64 
 1   institution  74430 non-null  object
 2   program      68238 non-null  object
 3   start_date   54870 non-null  object
 4   location     52743 non-null  object
dtypes: int64(1), object(4)
memory usage: 2.9+ MB


In [15]:
df_education.isnull().sum()

person_id          0
institution     1569
program         7761
start_date     21129
location       23256
dtype: int64

In [16]:
df_education.duplicated().sum()

np.int64(51)

In [17]:
df_experience.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 265404 entries, 0 to 265403
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   person_id   265404 non-null  int64 
 1   title       265293 non-null  object
 2   firm        261201 non-null  object
 3   start_date  263142 non-null  object
 4   end_date    262680 non-null  object
 5   location    212349 non-null  object
dtypes: int64(1), object(5)
memory usage: 12.1+ MB


In [18]:
df_experience.isnull().sum()

person_id         0
title           111
firm           4203
start_date     2262
end_date       2724
location      53055
dtype: int64

In [19]:
df_experience.duplicated().sum()

np.int64(51)

In [20]:
df_person_skills.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2483376 entries, 0 to 2483375
Data columns (total 2 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   person_id  int64 
 1   skill      object
dtypes: int64(1), object(1)
memory usage: 37.9+ MB


In [21]:
df_person_skills.isnull().sum()

person_id    0
skill        9
dtype: int64

In [22]:
df_person_skills.duplicated().sum()

np.int64(587511)

# Data Preprocessing

In [23]:
df_people = df_people.drop(['email', 'phone', 'linkedin'], axis=1)
df_people

Unnamed: 0,person_id,name
0,1,Database Administrator
1,2,Database Administrator
2,3,Oracle Database Administrator
3,4,Amazon Redshift Administrator and ETL Develope...
4,5,Scrum Master Scrum Master Scrum Master
...,...,...
54928,54929,Lead Python Developer
54929,54930,Full Stack Python Developer
54930,54931,Eli Lilly
54931,54932,Python Developer


In [24]:
df_agg = df_ability.groupby('person_id', as_index=False)['ability'].agg(', '.join)
df_agg

Unnamed: 0,person_id,ability
0,1,"Installation and Building Server, Running Back..."
1,2,"database management systems administration, de..."
2,3,"Over 4+ years of Experience as Architecture, E..."
3,4,"SQL management, PostgresSQL, Oracle, MySQL, mi..."
4,5,"Scrum Master, Agile software development, Prod..."
...,...,...
54925,54929,"Qualified Python Developer, Web Application De..."
54926,54930,"Over 7 years of IT Experience, Designing, deve..."
54927,54931,"Developing and designing Web Based, Multi-tier..."
54928,54932,"Python Developer, Web/Application Developer, A..."


In [25]:
df_people = pd.merge(df_people, df_agg, on='person_id', how='left')
df_people

Unnamed: 0,person_id,name,ability
0,1,Database Administrator,"Installation and Building Server, Running Back..."
1,2,Database Administrator,"database management systems administration, de..."
2,3,Oracle Database Administrator,"Over 4+ years of Experience as Architecture, E..."
3,4,Amazon Redshift Administrator and ETL Develope...,"SQL management, PostgresSQL, Oracle, MySQL, mi..."
4,5,Scrum Master Scrum Master Scrum Master,"Scrum Master, Agile software development, Prod..."
...,...,...,...
54928,54929,Lead Python Developer,"Qualified Python Developer, Web Application De..."
54929,54930,Full Stack Python Developer,"Over 7 years of IT Experience, Designing, deve..."
54930,54931,Eli Lilly,"Developing and designing Web Based, Multi-tier..."
54931,54932,Python Developer,"Python Developer, Web/Application Developer, A..."


In [26]:
df_person_skills = df_person_skills.dropna(subset=['skill'])
df_person_skills

Unnamed: 0,person_id,skill
0,1,Database administration
1,1,Database
2,1,Ms sql server
3,1,Ms sql server 2005
4,1,Sql server
...,...,...
2483371,54933,Designed APIs to retrieve the list of new tasks
2483372,54933,Creating a Data models for the app
2483373,54933,Developing the static pages using Bootstrap an...
2483374,54933,Integrating modules and performing unit testing


In [27]:
df_agg_skills = df_person_skills.groupby('person_id', as_index=False)['skill'].agg(', '.join)
df_agg_skills

Unnamed: 0,person_id,skill
0,1,"Database administration, Database, Ms sql serv..."
1,2,"sql server management studio, visual studio, s..."
2,3,"DATABASES, ORACLE (4 years), ORACLE 10G, SQL, ..."
3,4,Maintain multiple database environments (Redsh...
4,5,"Scrum, Agile software development, Product bac..."
...,...,...
54853,54929,"Django, Angular JS, JavaScript, JQuery, Node.j..."
54854,54930,"Python, Django, AWS, AngularJS, Bootstrap, Jav..."
54855,54931,"Python 2.7, HTML5, CSS3, AJAX, JSON, JQuery, A..."
54856,54932,"Python 3.1x, PyQuery, PyQt, Django, Angular.js..."


In [28]:
df_people = pd.merge(df_people, df_agg_skills, on='person_id', how='left')
df_people

Unnamed: 0,person_id,name,ability,skill
0,1,Database Administrator,"Installation and Building Server, Running Back...","Database administration, Database, Ms sql serv..."
1,2,Database Administrator,"database management systems administration, de...","sql server management studio, visual studio, s..."
2,3,Oracle Database Administrator,"Over 4+ years of Experience as Architecture, E...","DATABASES, ORACLE (4 years), ORACLE 10G, SQL, ..."
3,4,Amazon Redshift Administrator and ETL Develope...,"SQL management, PostgresSQL, Oracle, MySQL, mi...",Maintain multiple database environments (Redsh...
4,5,Scrum Master Scrum Master Scrum Master,"Scrum Master, Agile software development, Prod...","Scrum, Agile software development, Product bac..."
...,...,...,...,...
54928,54929,Lead Python Developer,"Qualified Python Developer, Web Application De...","Django, Angular JS, JavaScript, JQuery, Node.j..."
54929,54930,Full Stack Python Developer,"Over 7 years of IT Experience, Designing, deve...","Python, Django, AWS, AngularJS, Bootstrap, Jav..."
54930,54931,Eli Lilly,"Developing and designing Web Based, Multi-tier...","Python 2.7, HTML5, CSS3, AJAX, JSON, JQuery, A..."
54931,54932,Python Developer,"Python Developer, Web/Application Developer, A...","Python 3.1x, PyQuery, PyQt, Django, Angular.js..."


In [29]:
df_experience.head(10)

Unnamed: 0,person_id,title,firm,start_date,end_date,location
0,1,Database Administrator,Family Private Care LLC,04/2017,Present,"Roswell, GA"
1,1,Database Administrator,Incomm,01/2014,02/2017,"Alpharetta, GA"
2,2,Database Administrator,Intercontinental Registry,12/2008,08/2011,"Lagos, GU"
3,3,Oracle Database Administrator,Cognizant,06/2016,Present,"Hyderabad, Telangana"
4,3,Oracle Database Administrator,Convergys,06/2014,06/2016,"Hyderabad, Telangana"
5,4,Amazon Redshift Administrator and ETL Develope...,"MSP Recovery - Fort Lauderdale, FL",02/18,Present,"Plantation, FL"
6,4,Database Administrator,"CEAACES - Quito, Provincia de Pichincha",11/14,12/15,QUITO ECUADOR
7,4,Database Administrator,"Ceige - Havana, IL",09/07,10/14,"CEIGE, LA HAVANA CUBA"
8,5,Scrum Master,Quest Technologies,10/2015,04/2019,"Washington, DC"
9,5,Oracle Database Administrator/ Scrum Master,Prudential,06/2013,10/2015,"Richmond, VA"


In [30]:
df_experience = df_experience.dropna(subset=['title'])

In [31]:
# df_agg_experience = (
#     df_experience
#         .groupby('person_id', as_index=False)['title']
#         .reset_index()
# )
# df_agg_experience

In [32]:
df_unique_experience = df_experience.drop_duplicates(subset=['person_id', 'title'])
df_unique_experience = df_unique_experience[['person_id', 'title']]
df_unique_experience

Unnamed: 0,person_id,title
0,1,Database Administrator
2,2,Database Administrator
3,3,Oracle Database Administrator
5,4,Amazon Redshift Administrator and ETL Develope...
6,4,Database Administrator
...,...,...
265395,54932,Python Developer
265400,54932,Software Developer
265401,54933,MetroBikes
265402,54933,Python/Flask Developer


In [33]:
df_people = pd.merge(df_people, df_unique_experience, on='person_id', how='left')
df_people

Unnamed: 0,person_id,name,ability,skill,title
0,1,Database Administrator,"Installation and Building Server, Running Back...","Database administration, Database, Ms sql serv...",Database Administrator
1,2,Database Administrator,"database management systems administration, de...","sql server management studio, visual studio, s...",Database Administrator
2,3,Oracle Database Administrator,"Over 4+ years of Experience as Architecture, E...","DATABASES, ORACLE (4 years), ORACLE 10G, SQL, ...",Oracle Database Administrator
3,4,Amazon Redshift Administrator and ETL Develope...,"SQL management, PostgresSQL, Oracle, MySQL, mi...",Maintain multiple database environments (Redsh...,Amazon Redshift Administrator and ETL Develope...
4,4,Amazon Redshift Administrator and ETL Develope...,"SQL management, PostgresSQL, Oracle, MySQL, mi...",Maintain multiple database environments (Redsh...,Database Administrator
...,...,...,...,...,...
228712,54932,Python Developer,"Python Developer, Web/Application Developer, A...","Python 3.1x, PyQuery, PyQt, Django, Angular.js...",Python Developer
228713,54932,Python Developer,"Python Developer, Web/Application Developer, A...","Python 3.1x, PyQuery, PyQt, Django, Angular.js...",Software Developer
228714,54933,Job Seeker,"Analyzed and gathered business requirements, D...","Analyzed and gathered business requirements, D...",MetroBikes
228715,54933,Job Seeker,"Analyzed and gathered business requirements, D...","Analyzed and gathered business requirements, D...",Python/Flask Developer


In [34]:
df_education = df_education.dropna(subset=['program'])

In [35]:
df_agg_education = df_education.groupby('person_id', as_index=False)['program'].agg(', '.join)
df_agg_education

Unnamed: 0,person_id,program
0,1,Bachelor of Science
1,2,bsc in computer science
2,3,Master of Computer Applications in Science and...
3,4,Bachelor in Computer Science
4,6,General/Business/Science Courses
...,...,...
43828,54920,Bachelor of Engineering in Computer Science
43829,54923,Master's in Information technology
43830,54924,Bachelor's Degree in Computer Science
43831,54925,Masters in Electrical and Electronics Engineering


In [36]:
df_people = pd.merge(df_people, df_agg_education, on='person_id', how='left')
df_people

Unnamed: 0,person_id,name,ability,skill,title,program
0,1,Database Administrator,"Installation and Building Server, Running Back...","Database administration, Database, Ms sql serv...",Database Administrator,Bachelor of Science
1,2,Database Administrator,"database management systems administration, de...","sql server management studio, visual studio, s...",Database Administrator,bsc in computer science
2,3,Oracle Database Administrator,"Over 4+ years of Experience as Architecture, E...","DATABASES, ORACLE (4 years), ORACLE 10G, SQL, ...",Oracle Database Administrator,Master of Computer Applications in Science and...
3,4,Amazon Redshift Administrator and ETL Develope...,"SQL management, PostgresSQL, Oracle, MySQL, mi...",Maintain multiple database environments (Redsh...,Amazon Redshift Administrator and ETL Develope...,Bachelor in Computer Science
4,4,Amazon Redshift Administrator and ETL Develope...,"SQL management, PostgresSQL, Oracle, MySQL, mi...",Maintain multiple database environments (Redsh...,Database Administrator,Bachelor in Computer Science
...,...,...,...,...,...,...
228712,54932,Python Developer,"Python Developer, Web/Application Developer, A...","Python 3.1x, PyQuery, PyQt, Django, Angular.js...",Python Developer,
228713,54932,Python Developer,"Python Developer, Web/Application Developer, A...","Python 3.1x, PyQuery, PyQt, Django, Angular.js...",Software Developer,
228714,54933,Job Seeker,"Analyzed and gathered business requirements, D...","Analyzed and gathered business requirements, D...",MetroBikes,
228715,54933,Job Seeker,"Analyzed and gathered business requirements, D...","Analyzed and gathered business requirements, D...",Python/Flask Developer,


In [37]:
df_people = df_people.dropna(subset=['skill', 'program'])
df_people

Unnamed: 0,person_id,name,ability,skill,title,program
0,1,Database Administrator,"Installation and Building Server, Running Back...","Database administration, Database, Ms sql serv...",Database Administrator,Bachelor of Science
1,2,Database Administrator,"database management systems administration, de...","sql server management studio, visual studio, s...",Database Administrator,bsc in computer science
2,3,Oracle Database Administrator,"Over 4+ years of Experience as Architecture, E...","DATABASES, ORACLE (4 years), ORACLE 10G, SQL, ...",Oracle Database Administrator,Master of Computer Applications in Science and...
3,4,Amazon Redshift Administrator and ETL Develope...,"SQL management, PostgresSQL, Oracle, MySQL, mi...",Maintain multiple database environments (Redsh...,Amazon Redshift Administrator and ETL Develope...,Bachelor in Computer Science
4,4,Amazon Redshift Administrator and ETL Develope...,"SQL management, PostgresSQL, Oracle, MySQL, mi...",Maintain multiple database environments (Redsh...,Database Administrator,Bachelor in Computer Science
...,...,...,...,...,...,...
228688,54925,Verveba Richardson,"Design, Development, Implementation, Python, D...","Python 3.x, Django 1.4, MySQL, Linux, HTML, XH...",Senior Python Developer,Masters in Electrical and Electronics Engineering
228689,54925,Verveba Richardson,"Design, Development, Implementation, Python, D...","Python 3.x, Django 1.4, MySQL, Linux, HTML, XH...",Python Developer,Masters in Electrical and Electronics Engineering
228690,54925,Verveba Richardson,"Design, Development, Implementation, Python, D...","Python 3.x, Django 1.4, MySQL, Linux, HTML, XH...",Java / SQL Developer,Masters in Electrical and Electronics Engineering
228694,54927,Python Developer,"Python Developer, Web/Application Developer, A...","Python, Django, PyQT, Panda API, HTML, CSS, AJ...",Python Developer,Bachelor of Science in Computers


In [38]:
df_ready = df_people.drop(['person_id', 'name'], axis=1)
df_ready

Unnamed: 0,ability,skill,title,program
0,"Installation and Building Server, Running Back...","Database administration, Database, Ms sql serv...",Database Administrator,Bachelor of Science
1,"database management systems administration, de...","sql server management studio, visual studio, s...",Database Administrator,bsc in computer science
2,"Over 4+ years of Experience as Architecture, E...","DATABASES, ORACLE (4 years), ORACLE 10G, SQL, ...",Oracle Database Administrator,Master of Computer Applications in Science and...
3,"SQL management, PostgresSQL, Oracle, MySQL, mi...",Maintain multiple database environments (Redsh...,Amazon Redshift Administrator and ETL Develope...,Bachelor in Computer Science
4,"SQL management, PostgresSQL, Oracle, MySQL, mi...",Maintain multiple database environments (Redsh...,Database Administrator,Bachelor in Computer Science
...,...,...,...,...
228688,"Design, Development, Implementation, Python, D...","Python 3.x, Django 1.4, MySQL, Linux, HTML, XH...",Senior Python Developer,Masters in Electrical and Electronics Engineering
228689,"Design, Development, Implementation, Python, D...","Python 3.x, Django 1.4, MySQL, Linux, HTML, XH...",Python Developer,Masters in Electrical and Electronics Engineering
228690,"Design, Development, Implementation, Python, D...","Python 3.x, Django 1.4, MySQL, Linux, HTML, XH...",Java / SQL Developer,Masters in Electrical and Electronics Engineering
228694,"Python Developer, Web/Application Developer, A...","Python, Django, PyQT, Panda API, HTML, CSS, AJ...",Python Developer,Bachelor of Science in Computers


In [39]:
df_ready = df_ready.dropna(subset=['ability'])
df_ready

Unnamed: 0,ability,skill,title,program
0,"Installation and Building Server, Running Back...","Database administration, Database, Ms sql serv...",Database Administrator,Bachelor of Science
1,"database management systems administration, de...","sql server management studio, visual studio, s...",Database Administrator,bsc in computer science
2,"Over 4+ years of Experience as Architecture, E...","DATABASES, ORACLE (4 years), ORACLE 10G, SQL, ...",Oracle Database Administrator,Master of Computer Applications in Science and...
3,"SQL management, PostgresSQL, Oracle, MySQL, mi...",Maintain multiple database environments (Redsh...,Amazon Redshift Administrator and ETL Develope...,Bachelor in Computer Science
4,"SQL management, PostgresSQL, Oracle, MySQL, mi...",Maintain multiple database environments (Redsh...,Database Administrator,Bachelor in Computer Science
...,...,...,...,...
228688,"Design, Development, Implementation, Python, D...","Python 3.x, Django 1.4, MySQL, Linux, HTML, XH...",Senior Python Developer,Masters in Electrical and Electronics Engineering
228689,"Design, Development, Implementation, Python, D...","Python 3.x, Django 1.4, MySQL, Linux, HTML, XH...",Python Developer,Masters in Electrical and Electronics Engineering
228690,"Design, Development, Implementation, Python, D...","Python 3.x, Django 1.4, MySQL, Linux, HTML, XH...",Java / SQL Developer,Masters in Electrical and Electronics Engineering
228694,"Python Developer, Web/Application Developer, A...","Python, Django, PyQT, Panda API, HTML, CSS, AJ...",Python Developer,Bachelor of Science in Computers


# Modeling

In [40]:
df_ready['combined_text'] = df_ready['ability'] + ' ' + df_ready['skill']
df_ready = df_ready.drop_duplicates(subset=['title', 'combined_text'])

vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 2),
    max_features=1000
)

tfidf_matrix = vectorizer.fit_transform(df_ready['combined_text'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ready['combined_text'] = df_ready['ability'] + ' ' + df_ready['skill']


In [41]:
# Misal new_cv seperti ini (list beberapa CV baru)
new_cv = {
    'ability': [
        'Develop and deploy machine learning models, Data preprocessing, Feature engineering, Model evaluation',
        'Cloud infrastructure management, Serverless architecture, Auto-scaling configuration, Logging and monitoring',
        'Design and build responsive web applications, integrate RESTful APIs, UI/UX best practices',
        'Data visualization, Dashboard creation, KPI reporting, Cross-functional collaboration',
        'Automation scripting using Python and Bash, CI/CD pipeline maintenance, Docker containerization'
    ],
    'skill': [
        'Python, Scikit-learn, Tensorflow, Pandas, Jupyter Notebook',
        'AWS, Azure, Terraform, CloudFormation, Kubernetes',
        'JavaScript, React, HTML/CSS, Git, Figma',
        'Power BI, Tableau, SQL, Business Analysis',
        'GitLab CI, Jenkins, Docker, Bash scripting, YAML'
    ],
    'program': [
        'Master in Artificial Intelligence',
        'Bachelor in Cloud Computing and Infrastructure',
        'Bachelor in Software Engineering',
        'Bachelor of Business Analytics',
        'Diploma in DevOps and Automation'
    ]
}

# Gabungkan per baris jadi satu string lengkap tiap CV baru
new_texts = [
    ability + ' ' + skill + ' ' + program
    for ability, skill, program in zip(new_cv['ability'], new_cv['skill'], new_cv['program'])
]

# Transform semua teks baru sekaligus
new_tfidf = vectorizer.transform(new_texts)

# Hitung cosine similarity setiap CV baru dengan dataset lama
cosine_similarities = cosine_similarity(new_tfidf, tfidf_matrix)

# Untuk tiap CV baru, ambil 5 rekomendasi teratas
top_n = 5
for i, sims in enumerate(cosine_similarities):
    top_indices = sims.argsort()[-top_n:][::-1]
    print(f"\nRekomendasi untuk CV baru ke-{i+1}:")
    for idx in top_indices:
        print(f"  Title: {df_ready.iloc[idx]['title']} (score: {(sims[idx] * 100).round(2)}%)")


Rekomendasi untuk CV baru ke-1:
  Title: Student Researcher (score: 52.11%)
  Title: Python Developer (score: 52.11%)
  Title: Technology Analyst Summer Intern (score: 52.11%)
  Title: Senior Analytics Consultant (score: 45.07%)
  Title: Solution Developer (score: 45.07%)

Rekomendasi untuk CV baru ke-2:
  Title: SR. CLOUD ENGINEER (score: 57.87%)
  Title: IT HELPDESK TECHNICIAN (score: 57.87%)
  Title: DEVOPS ENGINEER\ IT SECURITY ANALYST (score: 57.87%)
  Title: Oracle Database Administrator (score: 52.84%)
  Title: AWS Cloud Engineer (score: 52.84%)

Rekomendasi untuk CV baru ke-3:
  Title: UI Designer (score: 46.52%)
  Title: Web Designer (score: 46.52%)
  Title: Developer (score: 46.52%)
  Title: New Media Designer (score: 46.52%)
  Title: Freelance Front End (score: 46.52%)

Rekomendasi untuk CV baru ke-4:
  Title: IT Business Analyst, Java Developer - Intern (score: 44.86%)
  Title: Data Engineer, Business Risk Analyst (score: 44.86%)
  Title: Enterprise Business Intelligence D

# Save Model

In [42]:
# Simpan TF-IDF vectorizer
joblib.dump(vectorizer, 'models/tfidf_vectorizer.joblib')

# Simpan matrix hasil transformasi
joblib.dump(tfidf_matrix, 'models/tfidf_matrix.joblib')

# Simpan dataframe referensi
df_ready[['title', 'combined_text']].to_csv('dataset/job_reference_data.csv', index=False)