In [3]:
import numpy as np
import pandas as pd
import nltk
# Fetch every NLTK resource this notebook uses in one place, so a fresh
# kernel can run top to bottom without relying on downloads in later cells.
# "punkt_tab" and "averaged_perceptron_tagger_eng" are the resource names
# that newer NLTK releases look up for word_tokenize / pos_tag; the older
# names are kept so existing environments keep working.
NLTK_RESOURCES = (
    "punkt",
    "punkt_tab",
    "stopwords",
    "wordnet",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
    "omw-1.4",
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:

# Location of the raw export and how many leading rows to read from it.
RAW_CSV_PATH = "/content/raw.csv"
N_ROWS = 60000

# Read only the first N_ROWS rows of the CSV as UTF-8 text.
data = pd.read_csv(RAW_CSV_PATH, encoding="utf-8", nrows=N_ROWS)

In [5]:
# Show the (rows, columns) size of the loaded frame.
data.shape

(60000, 4)

In [6]:
# `data` is already a DataFrame, so take an explicit copy rather than
# re-wrapping it: changes made to `df` later cannot leak back into the raw
# frame, and the raw data stays available for comparison.
df = data.copy()

In [7]:
# List the column labels of the working frame.
df.columns

Index(['company_name', 'title', 'description', 'location'], dtype='object')

In [8]:
# Preview the first rows of the working frame.
df.head()

Unnamed: 0,company_name,title,description,location
0,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in N...,"Princeton, NJ"
1,,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committ...","Fort Collins, CO"
2,The National Exemplar,Assitant Restaurant Manager,The National Exemplar is accepting application...,"Cincinnati, OH"
3,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associat...,Senior Associate Attorney - Elder Law / Trusts...,"New Hyde Park, NY"
4,,Service Technician,Looking for HVAC service tech with experience ...,"Burlington, IA"


In [9]:
# Columns kept for the rest of the notebook; any other column gets dropped.
use_cols = ["title", "description", "location", "company_name"]

In [10]:
# Every column of df that is not listed in use_cols, in df's column order.
drop_cols = [name for name in df.columns if name not in use_cols]

In [11]:
# Show which columns were selected for removal.
drop_cols

[]

In [12]:
# `columns=` already says which axis to drop from, so the contradictory
# `axis=0` argument is removed. Rebinding the result instead of using
# `inplace=True` keeps the cell safe to re-run.
df = df.drop(columns=drop_cols)

In [13]:
# Count missing values in each column.
df.isnull().sum()

Unnamed: 0,0
company_name,1034
title,0
description,0
location,0


In [14]:
# Inspect the free-text column that the later cells tokenize.
df['description']

Unnamed: 0,description
0,Job descriptionA leading real estate firm in N...
1,"At Aspen Therapy and Wellness , we are committ..."
2,The National Exemplar is accepting application...
3,Senior Associate Attorney - Elder Law / Trusts...
4,Looking for HVAC service tech with experience ...
...,...
59995,Company Description\n\nHeadquartered in Southe...
59996,InterDent is a well-established and growing or...
59997,PRIMARY FUNCTION:\n\nSHIFT: FIRST SHIFT\n\nPri...
59998,About Us:LTIMindtree is a global technology co...


In [15]:
# Discard rows that have no description; the filtered frame replaces df.
df = df.dropna(subset=["description"])


In [16]:
# Display the frame as it stands after the missing-description filter.
df

Unnamed: 0,company_name,title,description,location
0,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in N...,"Princeton, NJ"
1,,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committ...","Fort Collins, CO"
2,The National Exemplar,Assitant Restaurant Manager,The National Exemplar is accepting application...,"Cincinnati, OH"
3,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associat...,Senior Associate Attorney - Elder Law / Trusts...,"New Hyde Park, NY"
4,,Service Technician,Looking for HVAC service tech with experience ...,"Burlington, IA"
...,...,...,...,...
59995,Skechers,Retail Product Specialist,Company Description\n\nHeadquartered in Southe...,"Fort Lauderdale, FL"
59996,InterDent Service Corporation,System Engineer,InterDent is a well-established and growing or...,"Vancouver, WA"
59997,Wayne-Sanderson Farms,Utility Associate,PRIMARY FUNCTION:\n\nSHIFT: FIRST SHIFT\n\nPri...,"Fernwood, MS"
59998,LTIMindtree,Quality Management Specialist,About Us:LTIMindtree is a global technology co...,"Houston, TX"


In [18]:
import nltk
from nltk import word_tokenize, sent_tokenize

# Tokenizer data that must be present before word_tokenize is called.
nltk.download('punkt_tab')

# Turn each description string into a list of tokens.
df["tokens"] = df["description"].map(word_tokenize)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [19]:
from nltk.corpus import stopwords

# A set gives constant-time membership checks for the filter below.
stop_words = set(stopwords.words("english"))


def _drop_stopwords(tokens):
    """Return the tokens whose lowercase form is not an English stop word."""
    return [tok for tok in tokens if tok.lower() not in stop_words]


df["tokens"] = df["tokens"].apply(_drop_stopwords)

In [22]:
from functools import lru_cache

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk
nltk.download('averaged_perceptron_tagger_eng')

lemmatizer = WordNetLemmatizer()

# First letter of the tag returned by nltk.pos_tag -> WordNet part of speech.
# "J" (adjective tags) was missing before, so adjectives fell back to NOUN.
_TAG_MAP = {
    "J": wordnet.ADJ,
    "V": wordnet.VERB,
    "N": wordnet.NOUN,
    "R": wordnet.ADV,
}


@lru_cache(maxsize=None)
def get_wordnet_pos(word):
    """Return the WordNet POS constant to use when lemmatizing ``word``.

    The word is tagged on its own (no sentence context) with ``nltk.pos_tag``
    and the first letter of the tag is looked up in ``_TAG_MAP``; any other
    tag falls back to ``wordnet.NOUN``.

    Results are memoised per distinct word: the tagger is otherwise invoked
    once for every token occurrence in the corpus, which dominates runtime.
    """
    tag = nltk.pos_tag([word])[0][1][0].upper()
    return _TAG_MAP.get(tag, wordnet.NOUN)


def _lemmatize_tokens(tokens):
    """Lemmatize each token using the POS guessed by get_wordnet_pos."""
    return [lemmatizer.lemmatize(tok, get_wordnet_pos(tok)) for tok in tokens]


df["tokens"] = df["tokens"].apply(_lemmatize_tokens)

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


In [None]:
# Display the current state of the working frame.
df

In [23]:
import re

def clean_tokens(tokens):
    """Normalise tokens to lowercase a-z letters and drop very short ones.

    Each token is lowercased and every character outside ``a``-``z`` is
    removed. A token is kept only if at least two letters remain, so empty
    strings and single-letter leftovers are both discarded.

    Args:
        tokens: iterable of token strings.

    Returns:
        A new list of cleaned tokens in their original order.
    """
    letters_only = (re.sub(r"[^a-z]", "", tok.lower()) for tok in tokens)
    return [tok for tok in letters_only if len(tok) > 1]


# Run the cleaner over every row's token list and store the result back.
df["tokens"] = df["tokens"].map(clean_tokens)


In [24]:
# Write the cleaned frame to disk without the index column.
# NOTE(review): "tokens" holds Python lists at this point; CSV presumably
# stores them as their string form, so a reader would need to parse them
# back into lists — confirm against whatever consumes this file.
df.to_csv("cleaned_data.csv", index=False)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The vectorizer is fed strings, so join each token list with spaces.
df["text"] = df["tokens"].map(" ".join)

# Vocabulary limits and n-gram range for the TF-IDF model.
tfidf_params = dict(
    max_features=10000,
    min_df=5,
    max_df=0.8,
    ngram_range=(1, 2),
)
vectorizer = TfidfVectorizer(**tfidf_params)

# Learn the vocabulary and build the document-term matrix in one pass.
X_tfidf = vectorizer.fit_transform(df["text"])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# cosine_similarity(X_tfidf) would build a dense n_docs x n_docs matrix; for
# the 60000 rows loaded above that is about 29 GB of float64 and will not fit
# in a typical notebook runtime. Only one document's similarities are
# inspected below, so compare just that row against the whole corpus.
# `sim` therefore has shape (1, n_docs); widen the row slice to query more
# documents at once.
QUERY_ROW = 0
sim = cosine_similarity(X_tfidf[QUERY_ROW:QUERY_ROW + 1], X_tfidf)

# Similarity of the query document to the first ten documents.
sim[0][:10]