In [275]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [276]:
def embed_with_sentence_transformers(sentences, model_name='all-MiniLM-L6-v2'):
    print(f"\n--- Generating embeddings with 'sentence-transformers' library ({model_name}) ---")
    model = SentenceTransformer(model_name)
    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)
    print("Embedding generation complete.")
    return sentence_embeddings.cpu()


In [277]:
def remove_stopwords(sentence):
    tokenizer = RegexpTokenizer(r'\w+')  # Matches one or more alphanumeric characters or underscores
    tokens = tokenizer.tokenize(sentence)
    english_stopwords = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in english_stopwords]
    cleaned_text = ' '.join(filtered_tokens)
    return cleaned_text


In [278]:
def preprocessing(data):
    sentences = list(data)
    print("Lowering sentences...")
    lowered_sentences = list(map(lambda X: X.lower(), sentences))
    print("Cleaning sentences...")
    cleaned_sentences = list(map(remove_stopwords, lowered_sentences))
    print("Tokenizing sentences...")
    embeddings = embed_with_sentence_transformers(cleaned_sentences)
    print("Embedding generation complete.")
    return np.array(embeddings)

In [279]:
data = pd.read_csv('job match\\resume_data.csv')

In [280]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9544 entries, 0 to 9543
Data columns (total 35 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   address                              784 non-null    object 
 1   career_objective                     4740 non-null   object 
 2   skills                               9488 non-null   object 
 3   educational_institution_name         9460 non-null   object 
 4   degree_names                         9460 non-null   object 
 5   passing_years                        9460 non-null   object 
 6   educational_results                  9460 non-null   object 
 7   result_types                         9460 non-null   object 
 8   major_field_of_studies               9460 non-null   object 
 9   professional_company_names           9460 non-null   object 
 10  company_urls                         9460 non-null   object 
 11  start_dates                   

In [281]:
text_cols = ['career_objective', 'degree_names', 'major_field_of_studies', 'positions', 'skills', 'positions',
             'responsibilities']
for col in text_cols:
    data[col] = data[col].fillna('')

In [282]:
print(data.columns)

Index(['address', 'career_objective', 'skills', 'educational_institution_name',
       'degree_names', 'passing_years', 'educational_results', 'result_types',
       'major_field_of_studies', 'professional_company_names', 'company_urls',
       'start_dates', 'end_dates', 'related_skils_in_job', 'positions',
       'locations', 'responsibilities', 'extra_curricular_activity_types',
       'extra_curricular_organization_names',
       'extra_curricular_organization_links', 'role_positions', 'languages',
       'proficiency_levels', 'certification_providers', 'certification_skills',
       'online_links', 'issue_dates', 'expiry_dates', '﻿job_position_name',
       'educationaL_requirements', 'experiencere_requirement',
       'age_requirement', 'responsibilities.1', 'skills_required',
       'matched_score'],
      dtype='object')


In [283]:
feature_columns = [
    'skills',
    'career_objective',
    'degree_names',
    'major_field_of_studies',
    'positions',
    'responsibilities'
]

In [284]:
for col in feature_columns:
    data[col] = data[col].astype(str)

In [285]:
new_data = pd.DataFrame(columns=["resume_job", "match_score"])
new_data['resume_job'] = data['skills'] + data['career_objective'] + data['degree_names'] + data[
    'major_field_of_studies'] + data['positions'] + data['responsibilities']
new_data['match_score'] = data['matched_score'].astype(float)

In [286]:
print(new_data.head())
print(new_data.info())

                                          resume_job  match_score
0  ['Big Data', 'Hadoop', 'Hive', 'Python', 'Mapr...     0.850000
1  ['Data Analysis', 'Data Analytics', 'Business ...     0.750000
2  ['Software Development', 'Machine Learning', '...     0.416667
3  ['accounts payables', 'accounts receivables', ...     0.760000
4  ['Analytical reasoning', 'Compliance testing k...     0.650000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9544 entries, 0 to 9543
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   resume_job   9544 non-null   object 
 1   match_score  9544 non-null   float64
dtypes: float64(1), object(1)
memory usage: 149.2+ KB
None


In [287]:
X = preprocessing(new_data['resume_job'])

Lowering sentences...
Cleaning sentences...
Tokenizing sentences...

--- Generating embeddings with 'sentence-transformers' library (all-MiniLM-L6-v2) ---
Embedding generation complete.
Embedding generation complete.


In [288]:
Y = np.array(new_data['match_score'])

In [289]:
print(X.shape)
print(Y.shape)

(9544, 384)
(9544,)


In [290]:
x, x_test, y, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [291]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.25, random_state=42)

In [292]:
print(x_train.shape, x_val.shape, x_test.shape, y_train.shape, y_val.shape, y_test.shape)

(5726, 384) (1909, 384) (1909, 384) (5726,) (1909,) (1909,)


In [293]:
model = Ridge(alpha=0.04, max_iter=1000, random_state=42)
model.fit(x_train, y_train)

In [294]:
print(mean_squared_error(y_val, model.predict(x_val)))

0.015410870766941845


In [295]:
print(mean_squared_error(y_test, model.predict(x_test)))

0.013800027724212129


In [296]:
random_resumes_idxs = np.random.choice(len(data), 20, replace=False)
print(random_resumes_idxs)

[5996 6531 7550  425 1234 7418 8983 2582 8306 8812 3895 2575 4763 8108
 3043 3831 2327 2955  857 6781]


In [297]:
random_resumes = pd.DataFrame()
for i in ['skills',
          'career_objective',
          'degree_names',
          'major_field_of_studies']:
    random_resumes[i] = data.loc[random_resumes_idxs][i]

In [298]:
random_resumes_trial_1 = pd.DataFrame(random_resumes)
random_resumes_trial_2 = pd.DataFrame(random_resumes)


In [299]:
trial_1 = {
    "positions": "Data Scientist"
    ,
    "responsibilities":
        "Develop and implement machine learning models to solve business problems." +
        "Perform exploratory data analysis to identify trends, patterns, and insights." +
        "Clean, preprocess, and wrangle large datasets from various sources." +
        "Design and conduct A/B tests to evaluate product features and marketing strategies." +
        "Communicate findings and recommendations to stakeholders through visualizations and reports." +
        "Collaborate with engineering teams to deploy models into production environments."

}
trial_2 = {
    "positions": "Software Engineer",
    "responsibilities":

        "Design, develop, and maintain scalable and robust software applications." +
        "Write clean, efficient, and well-documented code in languages such as Python, Java, or C++." +
        "Collaborate with cross-functional teams including product managers and designers to deliver high-quality software." +
        "Participate in code reviews to maintain code quality and share knowledge." +
        "Troubleshoot, debug, and upgrade existing software systems." +
        "Implement and maintain CI/CD pipelines for automated testing and deployment."

}

In [300]:
random_resumes_trial_1[['positions', 'responsibilities']] = pd.DataFrame(columns=['positions', 'responsibilities'],
                                                                         dtype=str)
random_resumes_trial_2[['positions', 'responsibilities']] = pd.DataFrame(columns=['positions', 'responsibilities'],
                                                                         dtype=str)
random_resumes_trial_1['positions'].fillna(trial_1['positions'], inplace=True)
random_resumes_trial_2['positions'].fillna(trial_2['positions'], inplace=True)
random_resumes_trial_1['responsibilities'].fillna(trial_1['responsibilities'], inplace=True)
random_resumes_trial_2['responsibilities'].fillna(trial_2['responsibilities'], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  random_resumes_trial_1['positions'].fillna(trial_1['positions'], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  random_resumes_trial_2['positions'].fillna(trial_2['positions'], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never wor

In [301]:
print(random_resumes_trial_1.columns)
print(random_resumes_trial_2.columns)

Index(['skills', 'career_objective', 'degree_names', 'major_field_of_studies',
       'positions', 'responsibilities'],
      dtype='object')
Index(['skills', 'career_objective', 'degree_names', 'major_field_of_studies',
       'positions', 'responsibilities'],
      dtype='object')


In [302]:
new_trial_1 = pd.DataFrame(columns=["resume_job"])
new_trial_2 = pd.DataFrame(columns=["resume_job"])
new_trial_1['resume_job'] = random_resumes_trial_1['skills'] + random_resumes_trial_1['career_objective'] + \
                            random_resumes_trial_1['degree_names'] + random_resumes_trial_1[
                                'major_field_of_studies'] + random_resumes_trial_1['positions'] + \
                            random_resumes_trial_1['responsibilities']
new_trial_2['resume_job'] = random_resumes_trial_2['skills'] + random_resumes_trial_2['career_objective'] + \
                            random_resumes_trial_2['degree_names'] + random_resumes_trial_2[
                                'major_field_of_studies'] + random_resumes_trial_2['positions'] + \
                            random_resumes_trial_2['responsibilities']

In [303]:
for col in new_trial_1.columns:
    new_trial_1[col]=new_trial_1[col].astype(str)
    new_trial_2[col]=new_trial_2[col].astype(str)

In [304]:
x_trial_1=preprocessing(new_trial_1['resume_job'])
x_trial_2=preprocessing(new_trial_2['resume_job'])

Lowering sentences...
Cleaning sentences...
Tokenizing sentences...

--- Generating embeddings with 'sentence-transformers' library (all-MiniLM-L6-v2) ---
Embedding generation complete.
Embedding generation complete.
Lowering sentences...
Cleaning sentences...
Tokenizing sentences...

--- Generating embeddings with 'sentence-transformers' library (all-MiniLM-L6-v2) ---
Embedding generation complete.
Embedding generation complete.


In [305]:
trial_1_predictions = np.array(model.predict(x_trial_1))
trial_2_predictions = np.array(model.predict(x_trial_2))

In [306]:
print(trial_1_predictions)
print(trial_2_predictions)

[0.6164988  0.62092155 0.6840404  0.49399412 0.52638066 0.74659884
 0.5584202  0.6060369  0.5910728  0.61839306 0.4997305  0.60806876
 0.5512189  0.5609694  0.5584202  0.54724205 0.6210226  0.58063805
 0.39846358 0.5853854 ]
[0.697981   0.6696443  0.7505527  0.45525247 0.58501995 0.7160052
 0.6030106  0.6879548  0.608755   0.55714357 0.5207721  0.6276414
 0.6052947  0.5785549  0.6030106  0.5694086  0.63848734 0.5422159
 0.38295513 0.51088583]


In [307]:
sorted_trial_1=np.argsort(trial_1_predictions)
sorted_trial_2=np.argsort(trial_2_predictions)

In [308]:
for i in sorted_trial_1[::-1]:
    print(random_resumes_trial_1.iloc[i])
    print(f"match score: {trial_1_predictions[i]}")

skills                    ['Operations management', 'Change management',...
career_objective                                                           
degree_names                                      ['B.S', 'M.S', 'B.S.I.T']
major_field_of_studies    ['Mechanical Engineering', 'Electrical Enginee...
positions                                                    Data Scientist
responsibilities          Develop and implement machine learning models ...
Name: 7418, dtype: object
match score: 0.7465988397598267
skills                    ['Software Developer', 'Python', 'SQL', 'Machi...
career_objective          Understanding and solving problems has been on...
degree_names                                                        ['BCA']
major_field_of_studies                                              ['N/A']
positions                                                    Data Scientist
responsibilities          Develop and implement machine learning models ...
Name: 7550, dtype: object
matc

In [309]:
for i in sorted_trial_2[::-1]:
    print(random_resumes_trial_2.iloc[i])
    print(f"match score: {trial_2_predictions[i]}")

skills                    ['Software Developer', 'Python', 'SQL', 'Machi...
career_objective          Understanding and solving problems has been on...
degree_names                                                        ['BCA']
major_field_of_studies                                              ['N/A']
positions                                                 Software Engineer
responsibilities          Design, develop, and maintain scalable and rob...
Name: 7550, dtype: object
match score: 0.7505527138710022
skills                    ['Operations management', 'Change management',...
career_objective                                                           
degree_names                                      ['B.S', 'M.S', 'B.S.I.T']
major_field_of_studies    ['Mechanical Engineering', 'Electrical Enginee...
positions                                                 Software Engineer
responsibilities          Design, develop, and maintain scalable and rob...
Name: 7418, dtype: object
matc