In [1]:
import os
import re
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Fetch the NLTK resources requested by this notebook. `quiet=True` keeps the
# downloader's log (which embeds the local user's data path) out of the saved
# output; the cell still evaluates to the boolean returned by the last call.
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\iliya_pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\iliya_pc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Directory that holds both CSV datasets. It defaults to the original local
# location; set the CANGROW_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
DATA_DIR = Path(os.environ.get('CANGROW_DATA_DIR', r'E:\Python-Repo\Part\cangrow2-raw\DataSet'))

df_resume = pd.read_csv(DATA_DIR / 'Role-Resume-Dataset.csv')
df_jobs = pd.read_csv(DATA_DIR / 'Job-Description-Dataset.csv')

In [4]:
# Preview the first five rows of the resume dataset.
df_resume.head(n=5)

Unnamed: 0.1,Unnamed: 0,job_title,resume
0,0,Business Analyst,"Proficient in Content Marketing, Budget Manage..."
1,1,Business Analyst,"Proficient in Brand Strategy, SEO, Budget Mana..."
2,2,Business Analyst,"Proficient in Forecasting, Budgeting, Market R..."
3,3,Business Analyst,"Proficient in Supply Chain Optimization, Vendo..."
4,4,Business Analyst,"Proficient in Budgeting, Logistics, Procuremen..."


In [5]:
# Preview the first five rows of the job-description dataset.
df_jobs.head(n=5)

Unnamed: 0.1,Unnamed: 0,Experience,Qualifications,Salary Range,Country,Work Type,Company Size,Preference,Job Title,Role,Job Description,skills,Responsibilities
0,0,1 to 15 Years,B.Com,$65K-$130K,Seychelles,Full-Time,129360,Male,Account Director,Business Development Director,Business Development Directors lead efforts to...,Business development Sales strategy Market ana...,Develop and execute business development strat...
1,1,3 to 12 Years,B.Tech,$56K-$102K,UK,Intern,98903,Male,Account Director,Account Strategist,Account Strategists work in advertising or mar...,Account management Client relations Marketing ...,Develop account strategies and marketing plans...
2,2,1 to 10 Years,M.Tech,$55K-$110K,Romania,Full-Time,80235,Male,Account Director,Business Development Director,Business Development Directors lead efforts to...,Business development Sales strategy Market ana...,Develop and execute business development strat...
3,3,3 to 8 Years,MBA,$65K-$90K,West Bank and Gaza,Contract,59873,Female,Account Director,Account Strategist,Account Strategists work in advertising or mar...,Account management Client relations Marketing ...,Develop account strategies and marketing plans...
4,4,1 to 13 Years,B.Com,$65K-$102K,Equatorial Guinea,Intern,52777,Both,Account Director,Account Strategist,Account Strategists work in advertising or mar...,Account management Client relations Marketing ...,Develop account strategies and marketing plans...


In [6]:
# Missing values per column in the resume dataset.
df_resume.isna().sum()

Unnamed: 0    0
job_title     0
resume        0
dtype: int64

In [7]:
# Missing values per column in the job-description dataset.
df_jobs.isna().sum()

Unnamed: 0          0
Experience          0
Qualifications      0
Salary Range        0
Country             0
Work Type           0
Company Size        0
Preference          0
Job Title           0
Role                0
Job Description     0
skills              0
Responsibilities    0
dtype: int64

In [8]:
# Remove the 'Unnamed: 0' column. Re-binding the result (rather than mutating
# with inplace=True) together with errors='ignore' makes the cell safe to
# re-run: a second execution no longer raises KeyError.
df_jobs = df_jobs.drop(columns=['Unnamed: 0'], errors='ignore')

In [9]:
# Remove the 'Unnamed: 0' column. Re-binding the result (rather than mutating
# with inplace=True) together with errors='ignore' makes the cell safe to
# re-run: a second execution no longer raises KeyError.
df_resume = df_resume.drop(columns=['Unnamed: 0'], errors='ignore')

In [10]:
# Show the column labels of the job-description frame.
df_jobs.columns

Index(['Experience', 'Qualifications', 'Salary Range', 'Country', 'Work Type',
       'Company Size', 'Preference', 'Job Title', 'Role', 'Job Description',
       'skills', 'Responsibilities'],
      dtype='object')

In [11]:
# Show the column labels of the resume frame.
df_resume.columns

Index(['job_title', 'resume'], dtype='object')

In [12]:
# Count rows of the job-description frame that duplicate an earlier row.
df_jobs.duplicated().sum()

np.int64(0)

In [13]:
# Count rows of the resume frame that duplicate an earlier row.
df_resume.duplicated().sum()

np.int64(0)

In [14]:
# Number of resumes per job title (this column is later used as the target).
df_resume['job_title'].value_counts()

job_title
Business Analyst                   8
Data Scientist                     8
Construction Engineer              8
Mechanical Engineer                8
Fitness Coach                      8
Lawyer                             8
Artist                             8
Author                             8
Developer                          8
Operations Manager                 8
HR Specialist                      8
Machine Learning Engineer          8
Web Developer                      8
Salesman                           8
Environmental Scientist            8
Electrical Engineer                7
Event Planner                      7
Project Manager                    7
SEO Specialist                     7
Social Worker                      7
Cloud Architect                    7
Database Developer and Analyst     7
Cybersecurity Analyst              7
Customer Service Representative    7
Web Designer and Developer         7
Teacher                            7
Nurse                       

In [15]:
# Number of job postings per job title.
df_jobs['Job Title'].value_counts()

Job Title
Account Director            5
Account Executive           5
Account Manager             5
Accountant                  5
Administrative Assistant    5
                           ..
Urban Planner               5
Veterinarian                5
Web Designer                5
Web Developer               5
Wedding Planner             5
Name: count, Length: 147, dtype: int64

In [16]:
# Normalise the resume text: lower-case it, then replace every character that
# is neither an ASCII letter nor whitespace with a single space.
non_letters = re.compile(r'[^a-zA-Z\s]')
df_resume['clean_resume'] = (
    df_resume['resume']
    .str.lower()
    .apply(lambda text: non_letters.sub(' ', text))
)

In [17]:
# Turn each cleaned resume string into a list of tokens using NLTK's
# Treebank tokenizer; the tokenizer instance is reused by later cells.
tokenizer = TreebankWordTokenizer()
resume_tokens = df_resume['clean_resume'].map(tokenizer.tokenize)
df_resume['clean_resume'] = resume_tokens

In [18]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Use scikit-learn's ready-made English stop-word list, copied into a mutable set.
stop_words = set(ENGLISH_STOP_WORDS)

# # Add extra outlier / noise words that should also be removed
# custom_stopwords = {'like', 'using', 'etc', 'must', 'should', 'will', 'can', 'want', 'get', 'make'}
# stop_words.update(custom_stopwords)


In [19]:

# Keep only the tokens that are not members of the stop-word set.
df_resume['clean_resume'] = df_resume['clean_resume'].map(
    lambda words: [token for token in words if token not in stop_words]
)

In [20]:
# Re-assemble each token list into one space-separated string.
df_resume['clean_resume'] = df_resume['clean_resume'].map(' '.join)

In [21]:
# Build the modelling frame: every column except the raw resume text,
# then preview its first five rows.
df_resume_clean = df_resume.drop('resume', axis=1)
df_resume_clean.head(n=5)

Unnamed: 0,job_title,clean_resume
0,Business Analyst,proficient content marketing budget management...
1,Business Analyst,proficient brand strategy seo budget managemen...
2,Business Analyst,proficient forecasting budgeting market resear...
3,Business Analyst,proficient supply chain optimization vendor ma...
4,Business Analyst,proficient budgeting logistics procurement tra...


In [None]:
# Clean the job descriptions: lower-case, blank out non-letter characters,
# tokenize, drop stop words, and join the remaining tokens back into a string.
job_text = df_jobs['Job Description'].str.lower()
job_text = job_text.map(lambda raw: re.sub(r'[^a-zA-Z\s]', ' ', str(raw)))
job_tokens = job_text.map(tokenizer.tokenize)
job_tokens = job_tokens.map(
    lambda words: [token for token in words if token not in stop_words]
)
df_jobs['clean_description'] = job_tokens.map(' '.join)

In [None]:
# Features are the cleaned resume strings; the target is the job title.
X = df_resume_clean['clean_resume']
y = df_resume_clean['job_title']

# Encode the job titles as integer class ids.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the raw text BEFORE fitting TF-IDF so that the vocabulary and IDF
# weights are learned from training documents only (no test-set leakage).
# The same test_size / random_state select the same rows as before.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Whole corpus projected into the train-fitted feature space; kept so that any
# later cell that refers to X_vect keeps working.
X_vect = vectorizer.transform(X)

In [None]:
# Fit a multinomial logistic-regression classifier on the training split.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = model.predict(X_test)
print("Model accuracy : ", accuracy_score(y_test, y_pred))

# Report per-class metrics with the original job titles instead of encoded
# integers. Only classes that actually occur in y_test / y_pred are listed so
# that `labels` and `target_names` always have matching lengths.
labels_present = np.unique(np.concatenate([y_test, y_pred]))
print("Classification report : ")
# The report is printed on its own line so its column header stays aligned;
# zero_division=0 scores classes with no predicted samples as 0 without warnings.
print(classification_report(
    y_test,
    y_pred,
    labels=labels_present,
    target_names=list(label_encoder.inverse_transform(labels_present)),
    zero_division=0,
))