In [1]:
# Step 1: Install Required Libraries
!pip install faiss-cpu transformers datasets accelerate diffusers streamlit openai torch torchvision torchaudio




In [2]:
import pandas as pd

# Load dataset
df = pd.read_csv("/content/job_postings.csv")  # Update path if needed

# Check available columns
print("Columns in dataset:", df.columns)

# Select relevant columns dynamically
available_columns = list(df.columns)
relevant_columns = [col for col in ['job_title', 'company_name', 'job_description'] if col in available_columns]

# Ensure relevant text exists
if not relevant_columns:
    raise ValueError("⚠️ No relevant text columns found in the dataset!")

# Merge selected columns
df['text'] = df[relevant_columns].astype(str).agg(' '.join, axis=1)

# Display sample data
print(df.head())


Columns in dataset: Index(['job_link', 'last_processed_time', 'last_status', 'got_summary',
       'got_ner', 'is_being_worked', 'job_title', 'company', 'job_location',
       'first_seen', 'search_city', 'search_country', 'search_position',
       'job_level', 'job_type'],
      dtype='object')
                                            job_link  \
0  https://www.linkedin.com/jobs/view/senior-mach...   
1  https://www.linkedin.com/jobs/view/principal-s...   
2  https://www.linkedin.com/jobs/view/senior-etl-...   
3  https://www.linkedin.com/jobs/view/senior-data...   
4  https://www.linkedin.com/jobs/view/lead-data-e...   

             last_processed_time   last_status got_summary got_ner  \
0  2024-01-21 08:08:48.031964+00  Finished NER           t       t   
1  2024-01-20 04:02:12.331406+00  Finished NER           t       t   
2  2024-01-21 08:08:31.941595+00  Finished NER           t       t   
3  2024-01-20 15:30:55.796572+00  Finished NER           t       t   
4  2024-01-21 08

In [26]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Load FAISS index
index = faiss.read_index("faiss_index.bin")

# Load dataset
df = pd.read_csv("/content/job_postings.csv")

# Ensure we use the right columns
available_columns = list(df.columns)
relevant_columns = [col for col in ['job_title', 'company_name', 'job_description', 'location'] if col in available_columns]

def retrieve_jobs(query, top_k=5):
    """Retrieve top-k most relevant job postings"""
    embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    query_embedding = embed_model.encode([query]).astype('float32')

    distances, indices = index.search(query_embedding, top_k)
    results = []

    for idx in indices[0]:  # Loop through retrieved job postings
        job_details = df.iloc[idx][relevant_columns].to_dict()
        job_text = f"**{job_details.get('job_title', 'N/A')}** at {job_details.get('company_name', 'N/A')}\n"
        job_text += f"{job_details.get('job_description', 'No description available')}\n"
        job_text += f"Location: {job_details.get('location', 'Unknown')}\n"
        results.append(job_text)

    return results


In [25]:
'''import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Initialize embedding model
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Compute embeddings for all job postings
embeddings = embed_model.encode(df['text'].tolist(), show_progress_bar=True)

# Convert to NumPy
embeddings_np = np.array(embeddings).astype('float32')

# Create FAISS index
index = faiss.IndexFlatL2(embeddings_np.shape[1])
index.add(embeddings_np)

# Save FAISS index
faiss.write_index(index, "faiss_index.bin")

# Function to retrieve top-k relevant documents
def retrieve_docs(query, top_k=3):
    """Retrieve top-k most relevant job descriptions"""
    query_embedding = embed_model.encode([query]).astype('float32')
    distances, indices = index.search(query_embedding, top_k)
    return [df.iloc[idx]['text'] for idx in indices[0]]'''


Batches:   0%|          | 0/382 [00:00<?, ?it/s]

In [27]:
%%writefile app.py
import streamlit as st
from transformers import T5ForConditionalGeneration, T5Tokenizer
import faiss
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

# Load dataset
df = pd.read_csv("/content/job_postings.csv")  # Ensure correct path

# Load FAISS index
index = faiss.read_index("faiss_index.bin")

# Load T5 model
model = T5ForConditionalGeneration.from_pretrained("t5_finetuned")
tokenizer = T5Tokenizer.from_pretrained("t5_finetuned")

# Load embedding model
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Ensure we use the correct columns
available_columns = list(df.columns)
relevant_columns = [col for col in ['job_title', 'company_name', 'job_description', 'location'] if col in available_columns]

def retrieve_jobs(query, top_k=5):
    """Retrieve top-k most relevant job postings"""
    query_embedding = embed_model.encode([query]).astype('float32')

    distances, indices = index.search(query_embedding, top_k)
    results = []

    for idx in indices[0]:  # Loop through retrieved job postings
        job_details = df.iloc[idx][relevant_columns].to_dict()
        job_text = f"**{job_details.get('job_title', 'N/A')}** at {job_details.get('company_name', 'N/A')}\n"
        job_text += f"{job_details.get('job_description', 'No description available')}\n"
        job_text += f"Location: {job_details.get('location', 'Unknown')}\n"
        results.append(job_text)

    return results

def generate_answer(query):
    """Retrieve relevant job postings and generate a response using T5"""
    retrieved_jobs = retrieve_jobs(query)

    if not retrieved_jobs:
        return "No relevant job postings found."

    context = "\n".join(retrieved_jobs)  # Combine retrieved job postings

    input_text = f"question: {query} context: {context}"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids
    output = model.generate(input_ids)

    return tokenizer.decode(output[0], skip_special_tokens=True)

# Streamlit UI
st.title("Remote AI Job Finder 🤖")
st.write("Ask about job opportunities!")

query = st.text_input("Enter your job search query (e.g., 'Remote AI Engineer jobs'):")

if query:
    response = generate_answer(query)
    st.write("### **Job Search Results:**")
    st.write(response)


Overwriting app.py


In [4]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

# Load pre-trained T5 model & tokenizer
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Sample fine-tuning dataset
train_data = [
    {"question": "What are the requirements for a software engineer?", "answer": "Strong coding skills, problem-solving, and experience in programming languages."},
    {"question": "What is the salary range for a data scientist?", "answer": "The average salary for a data scientist is $100,000 to $150,000 per year."}
]

# Tokenize inputs and labels
input_texts = [f"question: {d['question']} context: {d['answer']}" for d in train_data]
labels = [d['answer'] for d in train_data]

# Encode training data
train_encodings = tokenizer(input_texts, padding=True, truncation=True, return_tensors="pt")
label_encodings = tokenizer(labels, padding=True, truncation=True, return_tensors="pt")

# Create PyTorch Dataset
class JobQADataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels["input_ids"][idx])
        return item

# Load dataset into Trainer
train_dataset = JobQADataset(train_encodings, label_encodings)

# Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_dir="./logs"
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

# Train model
trainer.train()

# Save fine-tuned model
model.save_pretrained("t5_finetuned")
tokenizer.save_pretrained("t5_finetuned")


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmaheshwarraobandi14[0m ([33mmaheshwarraobandi14-university-of-missouri-kansas-city[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item["labels"] = torch.tensor(self.labels["input_ids"][idx])
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss


('t5_finetuned/tokenizer_config.json',
 't5_finetuned/special_tokens_map.json',
 't5_finetuned/spiece.model',
 't5_finetuned/added_tokens.json')

In [21]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load fine-tuned model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("t5_finetuned")
tokenizer = T5Tokenizer.from_pretrained("t5_finetuned")

# Test a sample query
query = "What are the requirements for a software engineer?"
input_text = f"question: {query}"

# Tokenize input
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

# Generate response
output = model.generate(input_ids)
response = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated Response:", response)


Generated Response: a software engineer?


In [22]:
# Save fine-tuned model properly
model.save_pretrained("t5_finetuned")
tokenizer.save_pretrained("t5_finetuned")


('t5_finetuned/tokenizer_config.json',
 't5_finetuned/special_tokens_map.json',
 't5_finetuned/spiece.model',
 't5_finetuned/added_tokens.json')

In [6]:
from diffusers import StableDiffusionPipeline
import torch

# Load Stable Diffusion model
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
pipe.to("cuda" if torch.cuda.is_available() else "cpu")

def generate_image(prompt):
    """Generate an image using Stable Diffusion"""
    image = pipe(prompt).images[0]
    image.save("generated_image.png")

# Example usage
generate_image("A futuristic AI chatbot assisting job seekers")


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

In [23]:
%%writefile app.py
import streamlit as st
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load fine-tuned model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("t5_finetuned")
tokenizer = T5Tokenizer.from_pretrained("t5_finetuned")

st.title("Job Assistant Chatbot 🤖")
st.write("Ask any job-related questions!")

def generate_answer(query):
    """Retrieve relevant documents and generate an answer"""
    input_text = f"question: {query}"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids
    output = model.generate(input_ids)
    return tokenizer.decode(output[0], skip_special_tokens=True)

query = st.text_input("Enter your question:")

if query:
    response = generate_answer(query)
    st.write("**Chatbot Response:**", response)


Overwriting app.py


In [18]:
!wget -O cloudflared https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64
!chmod +x cloudflared
!mv cloudflared /usr/local/bin/


--2025-03-14 02:09:29--  https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64
Resolving github.com (github.com)... 140.82.116.4
Connecting to github.com (github.com)|140.82.116.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/cloudflare/cloudflared/releases/download/2025.2.1/cloudflared-linux-amd64 [following]
--2025-03-14 02:09:29--  https://github.com/cloudflare/cloudflared/releases/download/2025.2.1/cloudflared-linux-amd64
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/106867604/eac8237f-c554-46b5-95ea-f2f5873e69a5?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20250314%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250314T020929Z&X-Amz-Expires=300&X-Amz-Signature=1298112b1420a665138583cde1fd6c71b36af6f3b0605370618921231c9904b3&X-Amz-S

In [19]:
import os
os.system("streamlit run app.py &")


0

In [None]:
import os
os.system("streamlit run app.py &")
!cloudflared tunnel --url http://localhost:8501


[90m2025-03-14T02:26:04Z[0m [32mINF[0m Thank you for trying Cloudflare Tunnel. Doing so, without a Cloudflare account, is a quick way to experiment and try it out. However, be aware that these account-less Tunnels have no uptime guarantee, are subject to the Cloudflare Online Services Terms of Use (https://www.cloudflare.com/website-terms/), and Cloudflare reserves the right to investigate your use of Tunnels for violations of such terms. If you intend to use Tunnels in production you should use a pre-created named tunnel by following: https://developers.cloudflare.com/cloudflare-one/connections/connect-apps
[90m2025-03-14T02:26:04Z[0m [32mINF[0m Requesting new quick Tunnel on trycloudflare.com...
[90m2025-03-14T02:26:08Z[0m [32mINF[0m +--------------------------------------------------------------------------------------------+
[90m2025-03-14T02:26:08Z[0m [32mINF[0m |  Your quick Tunnel has been created! Visit it at (it may take some time to be reachable):  |
[90m2025