In [11]:
import pandas as pd

# Read the job postings into a DataFrame
df_jobs = pd.read_csv('job_descriptions.csv')

# Preview: column names followed by the leading rows
print("Columns:", list(df_jobs.columns))
print(df_jobs.head())

# Count the null entries in each column
print("\nMissing values per column:")
print(df_jobs.isna().sum())


Columns: ['job_title', 'company_name', 'job_description', 'clean_text']
                                       job_title  \
0  HyresadministratÃ¶r till uppdrag i GÃ¶teborg!   
1                   Registered Nurse - RGN / RMN   
2                               Specialist Nurse   
3           Care Worker - Wells & Shepton Mallet   
4   Kundansvarig redovisningskonsult, Karlskrona   

                           company_name  \
0                          TNG Group AB   
1            Maria Mallaband Care Group   
2  Bromley Healthcare Diabetes Services   
3                               Networx   
4                                 Aspia   

                                     job_description  \
0  Ã„r du extra serviceorienterad och samtidigt Ã...   
1  Registered Nurse - RGN / RMN   CAPTION: Job de...   
2  Diabetes Specialist Nurse - Band 7   Band 7   ...   
3  At Somerset Care, our values and culture is wh...   
4  Ã„r du en driven och engagerad redovisningskon...   

                   

In [12]:
import re
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stopwords corpus (NLTK reports "already up-to-date" when the
# package is already present), then keep the English list as a set so that
# clean_text can do fast membership tests against it.
nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text: str) -> str:
    """Normalize free text for TF-IDF: lowercase, strip symbols, drop stopwords.

    Parameters
    ----------
    text : str
        Raw text. Anything that is not a ``str`` (for example the ``None`` or
        float NaN that pandas passes for a missing cell) is treated as an
        empty document instead of raising ``AttributeError``.

    Returns
    -------
    str
        Space-separated lowercase tokens made only of ``a-z`` and ``0-9``,
        with every token found in ``STOPWORDS`` removed. Returns ``''`` when
        nothing is left or the input is not a string.

    Notes
    -----
    The character filter is ASCII-only, so accented and other non-ASCII
    letters are replaced by spaces rather than transliterated.
    """
    # Missing cells reach us as None/NaN when this is used with Series.apply.
    if not isinstance(text, str):
        return ''

    lowered = text.lower()
    # Replace everything except ASCII letters, digits and whitespace by a
    # space so that punctuation acts as a token separator.
    ascii_only = re.sub(r'[^a-z0-9\s]', ' ', lowered)
    # split() with no argument also collapses runs of whitespace.
    kept = [tok for tok in ascii_only.split() if tok not in STOPWORDS]
    return ' '.join(kept)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
import pandas as pd

# Job postings: the CSV is expected to already carry a 'clean_text' column
df_jobs = pd.read_csv('job_descriptions.csv')

# Resumes: read the raw file, then derive 'clean_text' with the shared cleaner
df_resumes = pd.read_csv('Resume.csv')
df_resumes['clean_text'] = df_resumes['Resume_str'].map(clean_text)

# Sanity checks: frame sizes plus the first 200 characters of one document each
print("Jobs:", df_jobs.shape, "Resumes:", df_resumes.shape)
first_job_text = df_jobs['clean_text'].iloc[0]
print("Sample job clean text:", first_job_text[:200])
first_resume_text = df_resumes['clean_text'].iloc[0]
print("Sample resume clean text:", first_resume_text[:200])


Jobs: (159589, 4) Resumes: (2484, 5)
Sample job clean text: r du extra serviceorienterad och samtidigt lskar administration ? sk jobb som hyresadministratr hos vr kund gteborg ! r du en strukturerad och ordningsam person med stark knsla fr service ? nu sker vi
Sample resume clean text: hr administrator marketing associate hr administrator summary dedicated customer service manager 15 years experience hospitality customer service management respected builder leader customer focused t


In [14]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF vectorizer; the vocabulary is capped at 5000 terms for speed
vectorizer = TfidfVectorizer(max_features=5000)

# Learn the vocabulary/IDF weights from the job descriptions and vectorize them
# in a single pass. fit_transform is equivalent to fit(...) followed by
# transform(...) on the same data, but avoids tokenizing every job twice.
# Missing documents are replaced by '' because the vectorizer rejects NaN;
# an empty document simply becomes an all-zero row.
X_jobs = vectorizer.fit_transform(df_jobs['clean_text'].fillna(''))

# Project the resumes into the same (job-derived) vocabulary space
X_resumes = vectorizer.transform(df_resumes['clean_text'].fillna(''))

print("TF-IDF matrix shapes:")
print("  Jobs:", X_jobs.shape)
print("  Resumes:", X_resumes.shape)


TF-IDF matrix shapes:
  Jobs: (159589, 5000)
  Resumes: (2484, 5000)


In [15]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def get_top_job_matches(resume_idx: int, top_n: int = 5):
    """
    Rank every job posting by cosine similarity to a single resume.

    Parameters
    ----------
    resume_idx : int
        Row position of the resume in `X_resumes`.
    top_n : int, default 5
        Number of best matches to return. Must be non-negative; 0 yields an
        empty frame, and values above the number of jobs return all jobs.

    Returns
    -------
    pandas.DataFrame
        One row per match, best first, with columns `job_title`,
        `company_name` and `score` (cosine similarity rounded to 3 decimals)
        and a fresh 0..k-1 index.

    Raises
    ------
    ValueError
        If `top_n` is negative. Used as a slice bound, a negative value would
        silently return "all jobs except the last |top_n|" instead of failing.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # 1. Grab the TF-IDF row for that resume (1 x n_features)
    v_res = X_resumes[resume_idx]

    # 2. Compute cosine similarity to every job and flatten to 1-D (n_jobs,)
    sims = cosine_similarity(v_res, X_jobs).flatten()

    # 3. argsort is ascending, so reverse it and keep the leading top_n positions
    top_idxs = np.argsort(sims)[::-1][:top_n]
    top_scores = sims[top_idxs]

    # 4. Look the jobs up by position (iloc) and attach the rounded scores
    results = df_jobs.iloc[top_idxs][['job_title', 'company_name']]
    results = results.assign(score=np.round(top_scores, 3)).reset_index(drop=True)
    return results

# --- Try the matcher on the resume stored at label 0 ---
print("Resume sample:")
demo_resume = df_resumes.loc[0, ['ID', 'Category']]
print(demo_resume)
print("\nTop 5 Job Matches:")
# Leave the call as the cell's last expression so the notebook renders the frame
get_top_job_matches(0, top_n=5)


Resume sample:
ID          16852973
Category          HR
Name: 0, dtype: object

Top 5 Job Matches:


Unnamed: 0,job_title,company_name,score
0,Marketing Manager,Bluebird Network,0.319
1,Sales and Marketing Manager,Intelek Technologies,0.312
2,Public Relations and Communications Assistant ...,Vertical Solutions,0.31
3,Marketing Representative,NJ AEY,0.307
4,Marketing Coordinator / Administrative Assistant,Intelligent IT Designs,0.297


In [16]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fastapi import FastAPI
from pydantic import BaseModel
from fastapi.middleware.cors import CORSMiddleware


# FastAPI application that exposes the resume-to-job matcher over HTTP.
app = FastAPI(title="Resume-to-Job Matcher")

# CORS is left wide open here: every origin, method and header is allowed.
# NOTE(review): acceptable for a throwaway demo, but allow_origins should be
# narrowed to the real front-end origin(s) before this is exposed more widely.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

class MatchRequest(BaseModel):
    """Request body accepted by the POST /match endpoint."""
    # Raw resume text; the endpoint cleans and vectorizes it before scoring.
    text: str
    # How many of the best-scoring jobs to return (defaults to 5).
    top_n: int = 5

class MatchResult(BaseModel):
    """One ranked job match returned by the POST /match endpoint."""
    # Title of the matched job posting.
    job_title: str
    # Company that published the posting.
    company_name: str
    # Cosine similarity between the resume and the posting, rounded to 3 decimals.
    score: float

@app.post("/match", response_model=list[MatchResult])
def match_resume(req: MatchRequest):
    """Return the job postings most similar to the submitted resume text.

    Parameters
    ----------
    req : MatchRequest
        `req.text` is the raw resume text; `req.top_n` is the number of
        matches wanted. A negative `top_n` is treated as 0.

    Returns
    -------
    list[MatchResult]
        Matches ordered from highest to lowest cosine similarity. Missing
        job titles or company names are reported as empty strings.
    """
    # top_n comes from the client. Used directly as a slice bound, a negative
    # value would return almost the entire job table, so clamp it at zero.
    top_n = max(req.top_n, 0)

    # 1. Clean & vectorize the incoming resume text with the fitted vectorizer
    v_res = vectorizer.transform([clean_text(req.text)])

    # 2. Score against every job; argsort is ascending, so reverse before slicing
    sims = cosine_similarity(v_res, X_jobs).flatten()
    top_idxs = sims.argsort()[::-1][:top_n]

    # 3. Build the response. top_idxs are row *positions* in X_jobs, so the
    #    metadata is fetched with iloc rather than a label-based lookup, which
    #    would only be correct while df_jobs keeps its default RangeIndex.
    matched = df_jobs.iloc[top_idxs]
    return [
        MatchResult(
            # NaN metadata would fail the `str` fields, so map it to ''.
            job_title='' if pd.isna(title) else str(title),
            company_name='' if pd.isna(company) else str(company),
            score=round(float(sims[idx]), 3),
        )
        for idx, title, company in zip(
            top_idxs, matched['job_title'], matched['company_name']
        )
    ]


In [None]:
import os
from getpass import getpass

from pyngrok import ngrok
import nest_asyncio
import uvicorn
from fastapi import FastAPI

# Single source of truth for the port shared by the tunnel and the server.
PORT = 8000

# Patch asyncio so uvicorn.run() can be started from inside the notebook,
# where an event loop is already running.
nest_asyncio.apply()

# SECURITY: never commit the ngrok auth token to the notebook. Read it from the
# NGROK_AUTHTOKEN environment variable, or prompt for it without echoing.
# Any token that was previously hardcoded here must be considered leaked and
# should be revoked in the ngrok dashboard.
ngrok_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok auth token: ")
if not ngrok_token:
    raise RuntimeError(
        "No ngrok auth token provided; set NGROK_AUTHTOKEN or enter it when prompted."
    )
ngrok.set_auth_token(ngrok_token)

# Open the public tunnel first so the URL is printed before the blocking call.
public_url = ngrok.connect(PORT)
print(f"Your app is live at: {public_url}")

# Blocks the cell until the server is interrupted.
uvicorn.run(app, host="0.0.0.0", port=PORT)

Your app is live at: NgrokTunnel: "https://387b-34-106-97-71.ngrok-free.app" -> "http://localhost:8000"


INFO:     Started server process [242]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
