### Loading libraries

In [1]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import random
import warnings
import os
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


### Dataset Preparation

In [2]:
skills = ['Python', 'ML', 'Data Analysis', 'Cloud', 'Web Dev', 'Excel', 'Communication']
domains = ['AI', 'Cybersecurity', 'Marketing', 'Cloud Computing', 'Web Development']
locations = ['Delhi', 'Mumbai', 'Remote', 'Bangalore', 'Hyderabad']
tags = ['remote', 'beginner-friendly', 'paid', 'certificate']

internship_data = pd.read_csv("intershala_internship.csv")
user_profiles = pd.DataFrame({
    'user_id': [f'U{i:03d}' for i in range(50)],
    'education_level': [random.choice(['B.Tech', 'Diploma', 'B.Sc']) for _ in range(50)],
    'skills': [random.sample(skills, k=3) for _ in range(50)],
    'interests': [random.sample(domains, k=2) for _ in range(50)],
    'location': [random.choice(locations) for _ in range(50)],
    'preferred_mode': [random.choice(['Remote', 'On-site', 'Hybrid']) for _ in range(50)]
})

print("Internships Sample: \n", internship_data.head())
print("User Profiles Sample: \n", user_profiles.head())

user_profiles.to_csv("user_information.csv", index=False)

Internships Sample: 
    Unnamed: 0  actively_hiring                     Type_of_internship  \
0           0              1.0                      Sales & Marketing   
1           1              1.0                            Fundraising   
2           2              1.0                      Digital Marketing   
3           3              1.0  General Management (Founders' Office)   
4           4              1.0                Social Entrepreneurship   

                                     company_name        location  \
0                                  Paru Creations       Faridabad   
1  Odisha Development Management Programme (ODMP)  Work From Home   
2                          Tare Zameen Foundation  Work From Home   
3       Elation Edtech Private Limited (Tinkerly)          Jaipur   
4                              Hamari Pahchan NGO  Work From Home   

                stipend  duration  
0        ₹ 5,000 /month  3 Months  
1     ₹ 225-5,000 /week    1 Week  
2               

### Vectorization of Data

In [5]:
def combine_user_profile(row): 
    return f"{row['education_level']} | {' '.join(row['skills'])} | {' '.join(row['interests'])} | {row['location']} | {row['preferred_mode']}"

def combine_internship(row):
    return f"{row['Type_of_internship']} | {row['company_name']} | {row['location']} | {row['stipend']} | {row['duration']}"

In [6]:
model = SentenceTransformer('all-MiniLM-L6-v2')

user_profiles['combined_text'] = user_profiles.apply(combine_user_profile, axis=1)
internship_data['combined_text'] = internship_data.apply(combine_internship, axis=1)

user_embeddings = model.encode(user_profiles['combined_text'].tolist(), convert_to_tensor=True)
internship_embeddings = model.encode(internship_data['combined_text'].tolist(), convert_to_tensor=True)

np.save("user_embeddings.npy", user_embeddings.cpu().numpy())
np.save("internship_embeddings.npy", internship_embeddings.cpu().numpy())

In [7]:
print(user_embeddings)

tensor([[-0.0591, -0.0434, -0.0939,  ...,  0.0025,  0.0026,  0.0128],
        [ 0.0240, -0.0209, -0.0527,  ..., -0.0084, -0.0828,  0.0436],
        [-0.0191, -0.0296, -0.0928,  ..., -0.0432, -0.0405, -0.0232],
        ...,
        [ 0.0163, -0.0626, -0.0386,  ..., -0.0140, -0.0065, -0.0386],
        [ 0.0102, -0.1047, -0.0462,  ..., -0.0162, -0.0324, -0.0631],
        [-0.0328, -0.0629, -0.0604,  ..., -0.0296,  0.0169, -0.0053]])


In [8]:
print(internship_embeddings)

tensor([[ 0.0346, -0.0553, -0.1026,  ..., -0.1394, -0.0084,  0.0402],
        [-0.0016,  0.0206, -0.0387,  ..., -0.1361, -0.0472, -0.0010],
        [-0.0237, -0.0270, -0.0182,  ..., -0.1404,  0.0113,  0.0320],
        ...,
        [ 0.0345, -0.0107, -0.0759,  ..., -0.0766, -0.0503,  0.0318],
        [ 0.0111, -0.0454, -0.0697,  ..., -0.0974, -0.0478,  0.0409],
        [ 0.0641, -0.0417, -0.0516,  ..., -0.1846, -0.0540,  0.0228]])


### User Testing

In [9]:
user_embeddings = np.load("user_embeddings.npy")
internship_embeddings = np.load("internship_embeddings.npy")

similarities = cosine_similarity([user_embeddings[0]], internship_embeddings)[0]

In [10]:
sorted(list(enumerate(similarities)), reverse=True, key= lambda x : x[1])

[(6410, np.float32(0.6042242)),
 (3398, np.float32(0.6014023)),
 (3382, np.float32(0.5833106)),
 (894, np.float32(0.58134174)),
 (235, np.float32(0.57735837)),
 (4239, np.float32(0.5723704)),
 (892, np.float32(0.56776536)),
 (579, np.float32(0.5675633)),
 (1072, np.float32(0.56184644)),
 (6188, np.float32(0.5597865)),
 (3583, np.float32(0.55919695)),
 (2807, np.float32(0.5585178)),
 (2195, np.float32(0.5575102)),
 (5612, np.float32(0.5569533)),
 (6670, np.float32(0.55640125)),
 (876, np.float32(0.5555222)),
 (896, np.float32(0.55307716)),
 (5697, np.float32(0.5528846)),
 (3304, np.float32(0.55250216)),
 (4799, np.float32(0.55243206)),
 (3363, np.float32(0.5520997)),
 (769, np.float32(0.55209607)),
 (5579, np.float32(0.55111843)),
 (5605, np.float32(0.55070174)),
 (5837, np.float32(0.54776883)),
 (1708, np.float32(0.5466056)),
 (3918, np.float32(0.5460635)),
 (5343, np.float32(0.54521537)),
 (4248, np.float32(0.5431814)),
 (2914, np.float32(0.5416521)),
 (5069, np.float32(0.5397557)),
 

### User Testing with Input

In [11]:
education = input("Enter your education level (e.g., B.Tech, Diploma): ")
skills = input("Enter your skills (comma-separated): ").split(',')
interests = input("Enter your interests (comma-separated): ").split(',')
location = input("Enter your location: ")
mode = input("Preferred internship mode (Remote, On-site, Hybrid): ")

user_text = f"{education} | {' '.join(skills)} | {' '.join(interests)} | {location} | {mode}"
user_embedding = model.encode([user_text])

similarities = cosine_similarity(user_embedding, internship_embeddings)[0]

top_indices = similarities.argsort()[::-1][:5]
top_matches = internship_data.iloc[top_indices]

In [None]:
print(f"User Data:\nEducation Level: {education}\nSkills: {skills}\nInterests: {interests}\nLocation: {location}\nMode: {mode}\n")

for j, (_, row) in enumerate(top_matches.iterrows(), start=1):
    print(f"{j} -> {row['Type_of_internship']} | {row['company_name']} | {row['location']} | {row['duration']} | Stipend: {row['stipend']}")

User Data:
Education Level: B.Tech
Skills: ['Python', ' C++', ' Java', ' DSA', ' OOPs']
Interests: ['Computer Science', ' Software Engineering']
Location: Delhi
Mode: On-site

1 -> Software Development Engineering (Web) | KYRO | Chennai | 6 Months | Stipend: ₹ 25,000 /month
2 -> Software Development Engineering (Web) | Zonasol Globe Private Limited | Jaipur | 6 Months | Stipend: ₹ 3,500-8,000 /month
3 -> Python Development | Medius Technologies Private Limited | Mumbai | 6 Months | Stipend: ₹ 8,000 /month
4 -> Software Development Engineering (Web) | Track Tech Solution | Erode | 6 Months | Stipend: ₹ 10,000 /month
5 -> Python Development | Angels Virtual World | Delhi | 4 Months | Stipend: ₹ 5,000 /month
