In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import ast
import random

# --- 1. Load and Preprocess Data ---

try:
    df = pd.read_csv('internship_data.csv')
except FileNotFoundError:
    print("Error: The file 'internship_data.csv' was not found.")
    exit()

# Clean and parse string representations of lists
def clean_and_parse_list_string(s):
    try:
        parsed_list = ast.literal_eval(s)
        return [str(item).strip().lower() for item in parsed_list]
    except (ValueError, SyntaxError):
        return []

df['Skills'] = df['Skills'].apply(clean_and_parse_list_string)
# This cleaning step ensures Location is always a list of strings
df['Location'] = df['Location'].apply(lambda s: [loc.strip().lower() for loc in str(s).strip("(),'").split(',')])
df['Role_Keywords'] = df['Role'].apply(lambda r: [word.lower() for word in r.split()])

# --- 2. Create Vocabularies and Mappings ---

all_skills = sorted(list(set(skill for skills_list in df['Skills'] for skill in skills_list)))
all_locations = sorted(list(set(loc for loc_list in df['Location'] for loc in loc_list)))
all_role_keywords = sorted(list(set(word for word_list in df['Role_Keywords'] for word in word_list)))

skill_to_idx = {skill: i+1 for i, skill in enumerate(all_skills)}
location_to_idx = {loc: i+1 for i, loc in enumerate(all_locations)}
role_to_idx = {word: i+1 for i, word in enumerate(all_role_keywords)}
skill_to_idx['<unk>'] = 0
location_to_idx['<unk>'] = 0
role_to_idx['<unk>'] = 0

# --- 3. Generate Training Data ---

positive_samples = []
for idx, row in df.iterrows():
    if row['Skills']:
        positive_samples.append([idx, row['Skills'][0], row['Location'][0], row['Role_Keywords'][0], 1])

negative_samples = []
for i in range(len(positive_samples)):
    random_internship_idx = random.randint(0, len(df)-1)
    random_skill = random.choice(all_skills)
    random_location = random.choice(all_locations)
    random_role = random.choice(all_role_keywords)
    negative_samples.append([random_internship_idx, random_skill, random_location, random_role, 0])

training_data = positive_samples + negative_samples
random.shuffle(training_data)

train_df = pd.DataFrame(training_data, columns=['internship_idx', 'skill', 'location', 'role_keyword', 'label'])

# --- 4. PyTorch Dataset and DataLoader ---

class InternshipDataset(Dataset):
    def __init__(self, dataframe):
        self.df = dataframe

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        skill_idx = skill_to_idx.get(row['skill'], 0)
        location_idx = location_to_idx.get(row['location'], 0)
        role_idx = role_to_idx.get(row['role_keyword'], 0)
        label = row['label']
        return torch.tensor(skill_idx, dtype=torch.long), \
               torch.tensor(location_idx, dtype=torch.long), \
               torch.tensor(role_idx, dtype=torch.long), \
               torch.tensor(label, dtype=torch.float32)

train_dataset = InternshipDataset(train_df)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# --- 5. Define the ANN Model ---

class RecommendationANN(nn.Module):
    def __init__(self, num_skills, num_locations, num_roles, embedding_dim=16):
        super(RecommendationANN, self).__init__()
        self.skill_embedding = nn.Embedding(num_skills, embedding_dim)
        self.location_embedding = nn.Embedding(num_locations, embedding_dim)
        self.role_embedding = nn.Embedding(num_roles, embedding_dim)
        
        self.fc1 = nn.Linear(embedding_dim * 3, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, skill_idx, location_idx, role_idx):
        skill_embed = self.skill_embedding(skill_idx)
        location_embed = self.location_embedding(location_idx)
        role_embed = self.role_embedding(role_idx)
        
        x = torch.cat([skill_embed, location_embed, role_embed], dim=1)
        
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x.squeeze()

model = RecommendationANN(
    num_skills=len(skill_to_idx),
    num_locations=len(location_to_idx),
    num_roles=len(role_to_idx)
)

# --- 6. Training Loop ---

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 5

print("Starting model training...")
model.train()
for epoch in range(num_epochs):
    for skills, locations, roles, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(skills, locations, roles)
        # Ensure outputs and labels have the same shape
        if outputs.dim() == 0:
            outputs = outputs.unsqueeze(0)
        if labels.dim() == 0:
            labels = labels.unsqueeze(0)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")
print("Training finished.")

# --- 7. Inference and Recommendation ---

def get_ann_recommendations(candidate_profile, model, top_n=5):
    model.eval()
    scores = []

    candidate_skill_idx = torch.tensor([skill_to_idx.get(s.lower(), 0) for s in candidate_profile['skills']], dtype=torch.long)
    candidate_loc_idx = torch.tensor([location_to_idx.get(l.lower(), 0) for l in candidate_profile['locations']], dtype=torch.long)
    candidate_interest_idx = torch.tensor([role_to_idx.get(i.lower(), 0) for i in candidate_profile['interests']], dtype=torch.long)

    with torch.no_grad():
        for idx, internship in df.iterrows():
            total_score = 0
            count = 0
            
            internship_skills = torch.tensor([skill_to_idx.get(s, 0) for s in internship['Skills']], dtype=torch.long)
            for s_idx in candidate_skill_idx:
                for i_s_idx in internship_skills:
                    score = model(s_idx.unsqueeze(0), candidate_loc_idx[0].unsqueeze(0), candidate_interest_idx[0].unsqueeze(0))
                    total_score += score.item()
                    count += 1
            
            scores.append((total_score / (count if count > 0 else 1), idx))

    sorted_scores = sorted(scores, key=lambda x: x[0], reverse=True)
    top_indices = [index for score, index in sorted_scores[:top_n]]
    return df.loc[top_indices]

# --- 8. Example Usage ---

candidate = {
    "skills": ["MS-Excel", "Content Writing", "Social Media Marketing"],
    "locations": ["Work from home", "Delhi"],
    "interests": ["Marketing", "Content"]
}

recommendations = get_ann_recommendations(candidate, model)

print("\n======================================================")
print("Top 5 ANN-Based Internship Recommendations For You")
print(f"Based on your profile (Skills: {', '.join(candidate['skills'])}; Interests: {', '.join(candidate['interests'])})")
print("======================================================")

if recommendations.empty:
    print("\nNo matching internships found.")
else:
    for i, row in recommendations.iterrows():
        # **FIXED LINE BELOW**
        location_text = ", ".join(row['Location']).title()
        print(f"\n✅ Role: {row['Role']}")
        print(f"   🏢 Company: {row['Company Name']}")
        print(f"   📍 Location(s): {location_text}")
        print(f"   💰 Stipend: {row['Stipend']}")
        print(f"   ⏳ Duration: {row['Duration']}")
        print("   -------------------------------------------------")

Starting model training...
Epoch [1/5], Loss: 0.4332
Epoch [2/5], Loss: 0.0740
Epoch [3/5], Loss: 0.2539
Epoch [4/5], Loss: 0.0047
Epoch [5/5], Loss: 0.0006
Training finished.

Top 5 ANN-Based Internship Recommendations For You
Based on your profile (Skills: MS-Excel, Content Writing, Social Media Marketing; Interests: Marketing, Content)

✅ Role: Business Development (Sales) Internship
   🏢 Company: Madbrains Technologies LLP
   📍 Location(s): Chandigarh, Mohali
   💰 Stipend: ₹ 5,000-12,000 /month
   ⏳ Duration: 4 Months
   -------------------------------------------------

✅ Role: Human Resources (HR) Internship
   🏢 Company: Jobs Flash Consulting Services
   📍 Location(s): Gurgaon
   💰 Stipend: ₹ 8,000 /month
   ⏳ Duration: 6 Months
   -------------------------------------------------

✅ Role: Content & E-Commerce Management Internship
   🏢 Company: Fall For Flora
   📍 Location(s): Faridabad, Delhi, Gurgaon, Noida
   💰 Stipend: ₹ 7,000 /month
   ⏳ Duration: 4 Months
   -------------