Setting up environment

In [None]:
!pip install pandas numpy scikit-learn sentence-transformers rapidfuzz faker nltk
import nltk
nltk.download('punkt')




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Uploading datasets

In [None]:
import pandas as pd
# Read the internship postings from the CSV that sits next to the notebook,
# then preview the first five rows.
df = pd.read_csv(filepath_or_buffer='internship companies.csv')
df.head(n=5)


Unnamed: 0.1,Unnamed: 0,company,internship,location,start date,duration,stipend,posted on,apply by
0,0,Internshala,Web Development,Gurgaon,Immediately,6 Months,20000 /month,29 Feb'20,18 Apr'20
1,1,Delhi Technological University - Karyon,Campus Ambassador,Work From Home,Immediately,1 Month,Performance Based,18 Feb'20,19 Apr'20
2,2,Internshala,Operations,Gurgaon,15 Apr'20,6 Months,20000 /month,18 Mar'20,29 Apr'20
3,3,Your Digital Boat,Content Writing,Work From Home,Immediately,2 Months,5000 /month,10 Apr'20,8 May'20
4,4,American Institute Of Big Data Professionals,Web Development,Work From Home,Immediately,1 Month,2000-5000 /month,10 Apr'20,8 May'20


Cleaning the data

In [None]:
# Normalise the header names: trim surrounding whitespace and lowercase them.
df.columns = df.columns.str.strip().str.lower()
# Remove fully identical rows and renumber the index from 0.
df = df.drop_duplicates().reset_index(drop=True)
# Normalise titles through the .str accessor only. Casting with astype(str)
# first would turn missing titles into the literal string 'nan', which hides
# them from the isna() report below.
df['internship'] = df['internship'].str.lower().str.strip()
# Missing-value count per column.
df.isna().sum()

Unnamed: 0,0
unnamed: 0,0
company,2
internship,0
location,4
start date,2
duration,2
stipend,2
posted on,2
apply by,2


In [None]:
!pip install pandas numpy scikit-learn sentence-transformers rapidfuzz faker

import pandas as pd
df = pd.read_csv("internship companies.csv")
df = df.drop(columns=['Unnamed: 0'])  # Drop index column
df = df.dropna(subset=['internship'])  # Drop rows with missing internship title
df.head()




Unnamed: 0,company,internship,location,start date,duration,stipend,posted on,apply by
0,Internshala,Web Development,Gurgaon,Immediately,6 Months,20000 /month,29 Feb'20,18 Apr'20
1,Delhi Technological University - Karyon,Campus Ambassador,Work From Home,Immediately,1 Month,Performance Based,18 Feb'20,19 Apr'20
2,Internshala,Operations,Gurgaon,15 Apr'20,6 Months,20000 /month,18 Mar'20,29 Apr'20
3,Your Digital Boat,Content Writing,Work From Home,Immediately,2 Months,5000 /month,10 Apr'20,8 May'20
4,American Institute Of Big Data Professionals,Web Development,Work From Home,Immediately,1 Month,2000-5000 /month,10 Apr'20,8 May'20


**Clean & Preprocess**

--> Convert text to lowercase

--> Handle missing values

--> Standardize location field

In [None]:
# Build the normalised text columns first, then write them back in one go.
title_clean = df['internship'].str.lower().str.strip()
# Missing companies / locations get a placeholder before lowercasing.
company_clean = df['company'].fillna('Unknown').str.lower()
location_clean = df['location'].fillna('unspecified').str.lower()

df['internship'] = title_clean
df['company'] = company_clean
df['location'] = location_clean
# Flag postings whose location text mentions working from home.
df['remote'] = location_clean.str.contains('work from home')


In [None]:
import re

# Keyword looked for in an internship title -> skills attributed to that title.
skill_keywords = {
    'web development': ['html','css','javascript','react'],
    'data science': ['python','pandas','machine learning'],
    'content writing': ['writing','seo'],
    'graphic design': ['photoshop','illustrator'],
    'marketing': ['social media','communication']
}

def extract_skills(title):
    """Return the skills attributed to an internship title.

    Every key of ``skill_keywords`` that occurs as a substring of the
    lowercased title contributes its skills, in dictionary order.

    Parameters
    ----------
    title : str
        Internship title. Any non-string value (e.g. ``None`` or a NaN left
        by a missing cell) yields an empty list instead of raising.

    Returns
    -------
    list of str
        Skills without repeats; empty when no keyword matches.
    """
    if not isinstance(title, str):
        return []
    lowered = title.lower()
    matched = []
    for key, skills in skill_keywords.items():
        # Plain substring test: keys are literal phrases, not regex patterns.
        if key in lowered:
            # Skip skills already collected so overlapping keywords cannot
            # inflate the list (its length is later used as a denominator).
            matched.extend(s for s in skills if s not in matched)
    return matched

# Attach the keyword-derived skill list to every posting, then preview the
# title -> skills mapping for the first rows.
df['skills'] = df['internship'].map(extract_skills)
df.loc[:, ['internship', 'skills']].head()


Unnamed: 0,internship,skills
0,web development,"[html, css, javascript, react]"
1,campus ambassador,[]
2,operations,[]
3,content writing,"[writing, seo]"
4,web development,"[html, css, javascript, react]"


Creating candidate profile

In [None]:
# Example candidate used by the matching cells: the skills they have and the
# location text they want to find in a posting.
candidate_profile = dict(
    skills=["python", "machine learning"],
    location="work from home",
)


Matching logic

In [None]:
from sentence_transformers import SentenceTransformer, util
# Sentence-embedding model used to compare the candidate with internship titles.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed every internship title, then the candidate's skills joined into one query string.
intern_emb = model.encode(df['internship'].tolist(), convert_to_tensor=True)
candidate_text = " ".join(candidate_profile['skills'])
candidate_emb = model.encode(candidate_text, convert_to_tensor=True)

# Cosine similarity of the single candidate query against all titles; entry 0
# is taken and moved to a NumPy array for use as a DataFrame column.
semantic_scores = (
    util.cos_sim(candidate_emb, intern_emb)[0]
    .cpu()
    .numpy()
)

# Skill match score
def skill_score(candidate_skills, internship_skills):
    """Return the share of an internship's skills that the candidate also has.

    The number of distinct skills common to both collections is divided by
    the length of ``internship_skills``. An empty ``internship_skills``
    gives 0.
    """
    if internship_skills:
        shared = set(internship_skills).intersection(candidate_skills)
        return len(shared) / len(internship_skills)
    return 0

# Per-posting score components.
df['skill_score'] = df['skills'].map(
    lambda listed: skill_score(candidate_profile['skills'], listed)
)
df['semantic_score'] = semantic_scores

# Location match boost: 1 when the preferred location occurs in the posting's
# location text, otherwise 0.
df['location_score'] = (
    df['location']
    .str.contains(candidate_profile['location'], regex=False)
    .astype('int64')
)

# Final score: weighted blend of semantic similarity, skill overlap and location.
df['final_score'] = (
    df['semantic_score'] * 0.5
    + df['skill_score'] * 0.3
    + df['location_score'] * 0.2
)
top_recommendations = df.sort_values(by='final_score', ascending=False).iloc[:5]
top_recommendations.loc[:, ['company', 'internship', 'location', 'skills', 'final_score']]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Unnamed: 0,company,internship,location,skills,final_score
2311,challenge katta,data science,work from home,"[python, pandas, machine learning]",0.606982
1194,shyena tech yarns private limited,data science,work from home,"[python, pandas, machine learning]",0.606982
2534,360digitmg,data science,work from home,"[python, pandas, machine learning]",0.606982
8251,kapwise technologies,data science,work from home,"[python, pandas, machine learning]",0.606982
4597,skillbit,data science,work from home,"[python, pandas, machine learning]",0.606982


In [None]:
# Print one four-line card per recommended posting, followed by a blank line.
for _, rec in top_recommendations.iterrows():
    skills_text = ', '.join(rec['skills']) if rec['skills'] else 'N/A'
    card = (
        f" {rec['company'].title()} | 📍 {rec['location']}",
        f" {rec['internship'].title()}",
        f" Matched Skills: {skills_text}",
        f" Score: {rec['final_score']:.2f}\n",
    )
    print("\n".join(card))


 Challenge Katta | 📍 work from home
 Data Science
 Matched Skills: python, pandas, machine learning
 Score: 0.61

 Shyena Tech Yarns Private Limited | 📍 work from home
 Data Science
 Matched Skills: python, pandas, machine learning
 Score: 0.61

 360Digitmg | 📍 work from home
 Data Science
 Matched Skills: python, pandas, machine learning
 Score: 0.61

 Kapwise Technologies | 📍 work from home
 Data Science
 Matched Skills: python, pandas, machine learning
 Score: 0.61

 Skillbit | 📍 work from home
 Data Science
 Matched Skills: python, pandas, machine learning
 Score: 0.61



In [None]:
import ipywidgets as widgets

# Two free-text boxes for the candidate's skills and preferred location.
skills_input, location_input = (
    widgets.Text(description=label) for label in ("Skills", "Location")
)
display(skills_input, location_input)


Text(value='', description='Skills')

Text(value='', description='Location')

In [None]:
import ipywidgets as widgets
from IPython.display import display, clear_output
from sentence_transformers import SentenceTransformer, util

# Load embedding model (do this once at the top)
model = SentenceTransformer('all-MiniLM-L6-v2')

# The internship titles are the same for every click, so embed them once here
# rather than re-encoding the whole column inside the click handler.
# Re-run this cell if the rows of `df` change.
intern_emb = model.encode(df['internship'].tolist(), convert_to_tensor=True)

# Create widgets
skills_input = widgets.Text(
    description="Skills",
    placeholder="e.g. python, data science, html"
)
location_input = widgets.Text(
    description="Location",
    placeholder="e.g. delhi, work from home"
)
button = widgets.Button(description="Get Recommendations", button_style='success')
output = widgets.Output()


def _rank_internships(candidate_skills, candidate_location, top_n=5):
    """Score every posting for one candidate and return the best ``top_n`` rows.

    Parameters
    ----------
    candidate_skills : list of str
        Lowercased skills entered by the user; may be empty.
    candidate_location : str
        Lowercased, stripped location text; may be empty.
    top_n : int
        Number of rows to return.

    Returns
    -------
    DataFrame
        A scored copy of ``df`` (the shared ``df`` itself is not modified),
        sorted by ``final_score`` in descending order and cut to ``top_n`` rows.
    """
    if candidate_skills:
        candidate_emb = model.encode(" ".join(candidate_skills), convert_to_tensor=True)
        semantic_scores = util.cos_sim(candidate_emb, intern_emb)[0].cpu().numpy()
    else:
        # No skills were entered, so there is no query text to compare against.
        semantic_scores = 0.0

    wanted = set(candidate_skills)
    skill_scores = df['skills'].apply(
        lambda listed: len(wanted & set(listed)) / len(listed) if listed else 0
    )

    if candidate_location:
        location_scores = df['location'].apply(
            lambda place: 1 if candidate_location in place else 0
        )
    else:
        # An empty string is a substring of every location, which would give
        # the location bonus to all postings; treat "no preference" as no bonus.
        location_scores = 0

    scored = df.assign(
        skill_score=skill_scores,
        semantic_score=semantic_scores,
        location_score=location_scores,
    )
    scored['final_score'] = (
        0.5*scored['semantic_score']
        + 0.3*scored['skill_score']
        + 0.2*scored['location_score']
    )
    return scored.sort_values(by='final_score', ascending=False).head(top_n)


# Function to handle button click
def on_button_click(b):
    """Read the two text boxes and print the top recommendations into ``output``.

    ``b`` is the clicked button passed in by ipywidgets; it is not used.
    """
    with output:
        clear_output()  # Clear previous results

        candidate_skills = [s.strip().lower() for s in skills_input.value.split(',') if s.strip()]
        # Strip as well as lowercase so stray spaces do not prevent a match.
        candidate_location = location_input.value.strip().lower()

        if not candidate_skills and not candidate_location:
            print("Enter at least one skill or a location to get recommendations.")
            return

        top_recommendations = _rank_internships(candidate_skills, candidate_location)

        # Display nicely
        for _, row in top_recommendations.iterrows():
            print(f" {row['company'].title()} | 📍 {row['location']}")
            print(f" {row['internship'].title()}")
            print(f" Matched Skills: {', '.join(row['skills']) if row['skills'] else 'N/A'}")
            print(f" Score: {row['final_score']:.2f}\n")

# Link button click to function
button.on_click(on_button_click)

# Display widgets + button + output area
display(skills_input, location_input, button, output)


Text(value='', description='Skills', placeholder='e.g. python, data science, html')

Text(value='', description='Location', placeholder='e.g. delhi, work from home')

Button(button_style='success', description='Get Recommendations', style=ButtonStyle())

Output()

In [None]:
from sentence_transformers import SentenceTransformer, util

# Candidate used for this standalone run of the matching pipeline.
candidate_skills = ["python", "machine learning"]
candidate_location = "work from home"

model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed all internship titles and the candidate's space-joined skills, then
# take the cosine similarities for the single candidate query.
candidate_text = " ".join(candidate_skills)
intern_emb = model.encode(df['internship'].tolist(), convert_to_tensor=True)
candidate_emb = model.encode(candidate_text, convert_to_tensor=True)
semantic_scores = util.cos_sim(candidate_emb, intern_emb)[0].cpu().numpy()

# Share of each posting's skills that the candidate has (0 when none are listed).
df['skill_score'] = df['skills'].map(
    lambda listed: len(set(candidate_skills) & set(listed)) / len(listed) if listed else 0
)
df['semantic_score'] = semantic_scores
# 1 when the preferred location occurs in the posting's location text, else 0.
df['location_score'] = (
    df['location'].str.contains(candidate_location, regex=False).astype('int64')
)

# Weighted blend of the three components, then the five best postings.
df['final_score'] = (
    df['semantic_score'] * 0.5
    + df['skill_score'] * 0.3
    + df['location_score'] * 0.2
)
(
    df.sort_values(by='final_score', ascending=False)
    .loc[:, ['company', 'internship', 'skills', 'location', 'final_score']]
    .head(5)
)


Unnamed: 0,company,internship,skills,location,final_score
2311,challenge katta,data science,"[python, pandas, machine learning]",work from home,0.606982
1194,shyena tech yarns private limited,data science,"[python, pandas, machine learning]",work from home,0.606982
2534,360digitmg,data science,"[python, pandas, machine learning]",work from home,0.606982
8251,kapwise technologies,data science,"[python, pandas, machine learning]",work from home,0.606982
4597,skillbit,data science,"[python, pandas, machine learning]",work from home,0.606982
