In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

In [5]:
import pandas as pd

# Create a mock DataFrame with 5 example jobs
jobs_data = {
    'id': ['job1', 'job2', 'job3', 'job4', 'job5'],
    'title': [
        'Fix Kitchen Sink',
        'Help Moving Furniture',
        'Gardening Work',
        'Electrical Wiring Repair',
        'Clean Office Space'
    ],
    'description': [
        'Repair broken pipes under kitchen sink, must have plumbing tools',
        'Carry boxes and furniture to moving truck, heavy lifting required',
        'Mow lawn, trim hedges, and plant flowers in backyard',
        'Fix faulty wiring in residential building, licensed electricians only',
        'Deep clean office desks, floors, and windows twice a week'
    ],
    'category': [
        'Plumbing',
        'Moving',
        'Gardening',
        'Electrical',
        'Cleaning'
    ]
}

jobs_df = pd.DataFrame(jobs_data)

# Display the mock data
print("Sample DataFrame:")
print(jobs_df)

Sample DataFrame:
     id                     title  \
0  job1          Fix Kitchen Sink   
1  job2     Help Moving Furniture   
2  job3            Gardening Work   
3  job4  Electrical Wiring Repair   
4  job5        Clean Office Space   

                                         description    category  
0  Repair broken pipes under kitchen sink, must h...    Plumbing  
1  Carry boxes and furniture to moving truck, hea...      Moving  
2  Mow lawn, trim hedges, and plant flowers in ba...   Gardening  
3  Fix faulty wiring in residential building, lic...  Electrical  
4  Deep clean office desks, floors, and windows t...    Cleaning  


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
tfidf_matrix = None
job_indices = None

def compute_tfidf(jobs_df):
    global tfidf_matrix, job_indices
    jobs_df['text_features'] = jobs_df['title'] + ' ' + jobs_df['description'] + ' ' + jobs_df['category']
    tfidf_matrix = vectorizer.fit_transform(jobs_df['text_features'])
    job_indices = pd.Series(jobs_df.index, index=jobs_df['id'])
    print("TF-IDF matrix shape:", tfidf_matrix.shape)

# Execute
compute_tfidf(jobs_df)

TF-IDF matrix shape: (5, 92)


In [13]:
# See the combined text features
print("\nCombined text features:")
print(jobs_df['text_features'])

# See the TF-IDF matrix (first 2 jobs)
print("\nTF-IDF matrix samples:")
print(tfidf_matrix[:2].toarray())  # Convert sparse matrix to dense for viewing

# See feature names (words/phrases)
print("\nFeature names (first 20):")
print(vectorizer.get_feature_names_out()[:20])

# Verify job indices mapping
print("\nJob ID to matrix row mapping:")
print(job_indices)


Combined text features:
0    Fix Kitchen Sink Repair broken pipes under kit...
1    Help Moving Furniture Carry boxes and furnitur...
2    Gardening Work Mow lawn, trim hedges, and plan...
3    Electrical Wiring Repair Fix faulty wiring in ...
4    Clean Office Space Deep clean office desks, fl...
Name: text_features, dtype: object

TF-IDF matrix samples:
[[0.         0.         0.         0.         0.18797181 0.18797181
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.15165447
  0.         0.18797181 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.37594362
  0.37594362 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.18797181 0.18797181 0.
  0.         0.37

In [14]:
print(jobs_df)

     id                     title  \
0  job1          Fix Kitchen Sink   
1  job2     Help Moving Furniture   
2  job3            Gardening Work   
3  job4  Electrical Wiring Repair   
4  job5        Clean Office Space   

                                         description    category  \
0  Repair broken pipes under kitchen sink, must h...    Plumbing   
1  Carry boxes and furniture to moving truck, hea...      Moving   
2  Mow lawn, trim hedges, and plant flowers in ba...   Gardening   
3  Fix faulty wiring in residential building, lic...  Electrical   
4  Deep clean office desks, floors, and windows t...    Cleaning   

                                       text_features  
0  Fix Kitchen Sink Repair broken pipes under kit...  
1  Help Moving Furniture Carry boxes and furnitur...  
2  Gardening Work Mow lawn, trim hedges, and plan...  
3  Electrical Wiring Repair Fix faulty wiring in ...  
4  Clean Office Space Deep clean office desks, fl...  


In [21]:
def get_similar_jobs(job_id, top_n=5):
    global tfidf_matrix, job_indices
    if tfidf_matrix is None or job_id not in job_indices:
        return [] # Not ready or job not found

    idx = job_indices[job_id] # Get the matrix row index for the job_id
    # Compute cosine similarity between this job (idx) and all jobs
    cosine_sim = cosine_similarity(tfidf_matrix[idx], tfidf_matrix)

    # Get similarity scores for all jobs, sort them
    sim_scores = list(enumerate(cosine_sim[0])) # [(idx, score), ...]
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get scores of the top_n most similar jobs (excluding itself)
    sim_scores = sim_scores[1:top_n+1] # Skip the first one (itself)

    # Get the job indices from the scores
    job_idx_matches = [i[0] for i in sim_scores]

    # Convert matrix indices back to original job IDs
    similar_job_ids = job_indices.iloc[job_idx_matches].index.tolist()
    return similar_job_ids

get_similar_jobs(2, top_n=5)

[]