**Hammad Latif**


In [None]:
import numpy as np
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords, wordnet
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag



In [None]:
# Download NLTK data (if not already downloaded).
# Each resource is listed exactly once; the tagger was previously requested twice.
NLTK_PACKAGES = (
    'punkt',
    'stopwords',
    'wordnet',
    'omw-1.4',                     # For additional wordnet data
    'averaged_perceptron_tagger',  # For POS tagging
)
for package in NLTK_PACKAGES:
    nltk.download(package)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
# Load the job postings CSV into a DataFrame and show its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer="/content/sample_data/job_descriptions.csv")
df.shape

(88418, 23)

In [None]:
# Reload the job postings CSV (same path as the load in the previous cell).
# NOTE(review): the shape displayed after this reload differs from the one shown
# after the earlier load — presumably the file was incomplete at that time;
# confirm, then keep a single load cell.
df = pd.read_csv("/content/sample_data/job_descriptions.csv")


In [None]:
# Shape of the loaded DataFrame as (rows, columns).
df.shape

(200180, 23)

In [None]:
# Number of missing values in each column.
df.isnull().sum()

Unnamed: 0,0
Job Id,0
Experience,0
Qualifications,0
Salary Range,0
location,0
Country,0
latitude,0
longitude,0
Work Type,0
Company Size,0


In [None]:
# Remove the "Company Profile" column.
# Reassigning instead of mutating in place, together with errors="ignore",
# keeps this cell safe to re-run: a second run no longer raises KeyError
# for the already-removed column.
df = df.drop(columns=["Company Profile"], errors="ignore")

In [None]:
# Drop rows that contain any missing value, without mutating in place.
# The index is renumbered so that row labels equal row positions: later cells
# select postings by position (argsort indices used with iloc), and without
# the reset the printed row label can differ from the reported job index.
df = df.dropna().reset_index(drop=True)
df.shape

(200179, 22)

In [None]:
# Take an independent copy of the free-text columns used to describe a posting.
text_columns = ["Job Title", "Role", "Job Description", "skills", "Responsibilities"]
textual_features = df.loc[:, text_columns].copy()
textual_features.head()

Unnamed: 0,Job Title,Role,Job Description,skills,Responsibilities
0,Digital Marketing Specialist,Social Media Manager,Social Media Managers oversee an organizations...,"Social media platforms (e.g., Facebook, Twitte...","Manage and grow social media accounts, create ..."
1,Web Developer,Frontend Web Developer,Frontend Web Developers design and implement u...,"HTML, CSS, JavaScript Frontend frameworks (e.g...","Design and code user interfaces for websites, ..."
2,Operations Manager,Quality Control Manager,Quality Control Managers establish and enforce...,Quality control processes and methodologies St...,Establish and enforce quality control standard...
3,Network Engineer,Wireless Network Engineer,"Wireless Network Engineers design, implement, ...",Wireless network design and architecture Wi-Fi...,"Design, configure, and optimize wireless netwo..."
4,Event Manager,Conference Manager,A Conference Manager coordinates and manages c...,Event planning Conference logistics Budget man...,Specialize in conference and convention planni...


In [None]:
def get_wordnet_pos(treebank_tag):
    """
    Translate a POS tag produced by NLTK's tagger into the WordNet POS
    constant used for lemmatization.

    The tag's leading letter selects the category: 'J' adjective, 'V' verb,
    'N' noun, 'R' adverb. Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # No known prefix matched: default to noun.
    return wordnet.NOUN

# Preprocessing for the textual columns.
# English stop words, held in a set; preprocess_text filters tokens against it.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """
    Normalize raw text into a space-separated string of lemmatized tokens.

    Steps, in order: strip HTML tags, remove every character that is not an
    ASCII letter or whitespace, lowercase, tokenize, drop English stop words,
    POS-tag the remaining tokens and lemmatize each one using its WordNet POS.

    Args:
        text: Raw input string.

    Returns:
        The processed tokens joined by single spaces.
    """
    # Remove HTML tags. The result must be bound back to `text`: it used to be
    # assigned to a differently-cased name, so the tags were never stripped
    # and their names (e.g. "b", "div") leaked into the tokens.
    text = re.sub(r'<.*?>', '', text)
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word not in stop_words]

    lemmatizer = WordNetLemmatizer()

    # Get POS tags for the tokens
    tagged_tokens = pos_tag(filtered_tokens)

    # Lemmatize tokens with POS tags. The loop variable is named `tag` so it
    # does not shadow the imported pos_tag function.
    lemmatized_tokens = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in tagged_tokens
    ]

    return ' '.join(lemmatized_tokens)


In [None]:
# Run preprocess_text over every value of every textual column.
for column_name in list(textual_features.columns):
    cleaned_column = textual_features[column_name].map(preprocess_text)
    textual_features[column_name] = cleaned_column

In [None]:
# Join the textual columns row by row into a single "Combined_Text" column,
# then drop the individual source columns.
source_columns = ["Job Title", "Role", "Job Description", "skills", "Responsibilities"]
textual_features["Combined_Text"] = textual_features[source_columns].apply(' '.join, axis=1)
textual_features.drop(columns=source_columns, inplace=True)
textual_features.head()

In [None]:
# Preview the first rows of the combined text column.
textual_features["Combined_Text"].head()

Unnamed: 0,Combined_Text
0,digital marketing specialist social medium man...
1,web developer frontend web developer frontend ...
2,operation manager quality control manager qual...
3,network engineer wireless network engineer wir...
4,event manager conference manager conference ma...


In [None]:
# Fit a TF-IDF vectorizer on the combined text; tfidf_scores holds the
# resulting scores, one row per posting.
vectorizer = TfidfVectorizer()
combined_text = textual_features["Combined_Text"]
tfidf_scores = vectorizer.fit_transform(combined_text)

# Name each column of the DataFrame after its vocabulary term.
feature_names = vectorizer.get_feature_names_out()

# NOTE(review): toarray() materializes the whole matrix densely, which is
# memory-heavy for a corpus of this size — consider keeping it sparse.
dense_scores = tfidf_scores.toarray()
tfidf_df = pd.DataFrame(data=dense_scores, columns=feature_names)
tfidf_df.head()

Unnamed: 0,ab,ability,abnormality,abuse,academic,accept,acceptable,access,accessibility,accessible,...,wound,write,writer,writing,xd,xml,yard,young,zendesk,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# (number of postings, vocabulary size) of the TF-IDF table.
tfidf_df.shape

(200179, 1951)

<!-- -------------------------------------------------------------------------------- -->

**User Profile Part**

For convenience, I will use skills and experience (job titles and roles) as the user profile.

In [None]:
# Example user profile.
# Several titles / roles are merged into one space-separated string per field.
textual = dict(
    titles="Data Analyst Machine Learning Engineer",
    roles="Analyzed business data and generated reports Built machine learning models for predictive analytics",
    skills="Sklearn Pandas NumPy Matplotlib Seaborn Python SQL",
)

**NOTE:** Need ordinal_encoders

In [None]:
# Wrap the profile dict in a one-row DataFrame (one column per profile field).
profile_rows = [textual]
textual_df = pd.DataFrame(data=profile_rows)
textual_df.head()

Unnamed: 0,titles,roles,skills
0,Data Analyst Machine Learning Engineer,Analyzed business data and generated reports B...,Sklearn Pandas NumPy Matplotlib Seaborn Python...


In [None]:
# Merge the profile fields into one "text_combined" column and drop the originals.
profile_fields = ["titles", "roles", "skills"]
textual_df["text_combined"] = textual_df[profile_fields].apply(' '.join, axis=1)
textual_df.drop(columns=profile_fields, inplace=True)
textual_df.head()

Unnamed: 0,text_combined
0,Data Analyst Machine Learning Engineer Analyze...


In [None]:
# Apply the same preprocess_text cleaning used for the job postings to the user text.
cleaned_profile_text = textual_df["text_combined"].map(preprocess_text)
textual_df["text_combined"] = cleaned_profile_text

In [None]:
# Preview the preprocessed user text.
textual_df.head()

Unnamed: 0,text_combined
0,data analyst machine learn engineer analyzed b...


In [None]:
# Transform the user text into TF-IDF scores with the already-fitted vectorizer
# (transform, not fit_transform), so the columns follow the job-posting vocabulary.
user_profile = vectorizer.transform(textual_df["text_combined"] )

In [None]:
# (rows, vocabulary size) of the user profile vector.
user_profile.shape

(1, 1951)

In [None]:
# Build the user profile as a one-row DataFrame with one column per vocabulary term.
# The TF-IDF vector is recomputed from textual_df rather than read back from
# `user_profile`: this cell rebinds `user_profile` from a sparse matrix to a
# DataFrame, so reading `.toarray()` from it made a second run of the cell fail.
f_names = vectorizer.get_feature_names_out()

user_profile = pd.DataFrame(
    vectorizer.transform(textual_df["text_combined"]).toarray(),
    columns=f_names,
)
user_profile.head()

Unnamed: 0,ab,ability,abnormality,abuse,academic,accept,acceptable,access,accessibility,accessible,...,wound,write,writer,writing,xd,xml,yard,young,zendesk,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Number of best-matching postings to keep.
N = 20

# Cosine similarity of the user profile against every job posting,
# flattened to a 1-D array with one score per posting.
pairwise_scores = cosine_similarity(user_profile, tfidf_df)
similarity_scores = pairwise_scores.flatten()

# argsort is ascending: reverse it and keep the first N to get the
# N highest-scoring positions, best match first.
descending_order = similarity_scores.argsort()[::-1]
top_n_indices = descending_order[:N]




[ 89147  93519  33781 139543  69995  70035 177622  26197  70070 185270
 105111  99012  26000 170836 157159 157284 138821  49658  49643 178063]


In [None]:
# Print the score and key fields of each top-N posting, best match first.
display_columns = ['Job Title', 'Job Description', 'Company', 'location', 'Country']
for idx in top_n_indices:
    print(f"Job Index: {idx}, Similarity Score: {similarity_scores[idx]:.4f}")
    # idx is a row position (it comes from argsort), hence iloc.
    posting = df.iloc[idx]
    print(posting[display_columns])
    print('---')

Job Index: 89147, Similarity Score: 0.6425
Job Title                                             Data Scientist
Job Description    Machine Learning Engineers develop machine lea...
Company                                                        Chewy
location                                              Guatemala City
Country                                                    Guatemala
Name: 89147, dtype: object
---
Job Index: 93519, Similarity Score: 0.6425
Job Title                                             Data Scientist
Job Description    Machine Learning Engineers develop machine lea...
Company                                           Berry Global Group
location                                                     Papeete
Country                                             French Polynesia
Name: 93519, dtype: object
---
Job Index: 33781, Similarity Score: 0.6425
Job Title                                             Data Scientist
Job Description    Machine Learning Engineers deve

**Report**

1. I downloaded the dataset from Kaggle and selected the "Job Title", "Role", "Job Description", "skills", and "Responsibilities" columns, as they are the ones relevant to finding a job.

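2. Preprocessed these columns: removed HTML tags, special characters, and numbers with regular expressions, lowercased the text, and then used NLTK to tokenize it, remove stop words, and lemmatize the tokens.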

3. Combined these columns into a single column and then converted it into TF-IDF values.

4. Then took the user's data for "Job Title", "Role", and "skills"; the user may enter more than one value for each. Combined them into one column and applied the same preprocessing. After that, computed its TF-IDF vector and compared it with the job postings' TF-IDF scores using cosine similarity to find the most relevant jobs.

5. Displayed the top 20 most relevant jobs.