In [13]:
import pandas as pd
import numpy as np
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

class AdvancedInternshipRecommender:
    def __init__(self, random_seed=None):
        """Set up the domain catalogue, a synthetic student cohort and model slots.

        Args:
            random_seed: Optional seed handed to ``random.seed`` before the
                synthetic students are generated, which makes the cohort (and
                everything derived from it) reproducible. With the default
                ``None`` the global ``random`` state is left untouched.
        """
        if random_seed is not None:
            # The student generator draws from the module-level ``random``
            # functions, so the global generator is the one that must be seeded.
            random.seed(random_seed)

        self.domains_data = self._initialize_domains()
        self.students_data = self._generate_synthetic_students()
        self.vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
        self.scaler = StandardScaler()
        self.kmeans_model = None
        self.rf_model = None

        # Populated later by build_tfidf_features() / create_advanced_features();
        # declared here so the full attribute set is visible in one place.
        self.domain_tfidf = None
        self.student_tfidf = None
        self.student_features = None
        self.domain_features = None
        
    def _initialize_domains(self):
        """Initialize comprehensive domain database with enhanced features"""
        domains = {
            'domain': [
                "Web Development", "Data Science", "Digital Marketing", 
                "Mobile App Development", "UI/UX Design", "DevOps Engineering",
                "Cybersecurity", "Artificial Intelligence", "Finance Technology",
                "Business Development", "Cloud Computing", "Game Development",
                "Blockchain Development", "IoT Engineering", "Robotics Engineering"
            ],
            'description': [
                "HTML CSS JavaScript React Node.js Frontend Backend API Development REST GraphQL",
                "Python Pandas NumPy Machine Learning Statistics Analytics Data Visualization Tableau",
                "SEO Content Writing Analytics Social Media Marketing Strategy PPC Google Ads",
                "Android iOS Flutter Swift Kotlin React Native Mobile UI UX App Store",
                "Figma Photoshop User Research Wireframing Prototyping Design Thinking Adobe XD",
                "Docker Kubernetes AWS CI CD Jenkins Terraform Cloud Infrastructure Automation",
                "Ethical Hacking Network Security Encryption Penetration Testing Security Audit CISSP",
                "Deep Learning Neural Networks TensorFlow PyTorch Computer Vision NLP OpenAI",
                "Financial Modeling Excel Bloomberg Python R Risk Management Trading Quantitative",
                "Sales Communication Market Research Strategy Partnership Business Analysis CRM",
                "AWS Azure Google Cloud Serverless Microservices Container Orchestration",
                "Unity Unreal Engine C# C++ Game Design 3D Modeling Animation Gaming",
                "Solidity Ethereum Smart Contracts DeFi Web3 Cryptocurrency Bitcoin NFT",
                "Arduino Raspberry Pi Sensors Embedded Systems Hardware IoT Protocols MQTT",
                "ROS Python C++ Computer Vision Sensor Fusion Autonomous Systems Control Theory"
            ],
            'difficulty_level': [1, 3, 1, 2, 1, 4, 4, 5, 3, 1, 3, 3, 4, 3, 5],
            'market_demand': [4, 5, 3, 4, 4, 5, 5, 5, 3, 2, 4, 3, 4, 3, 3],
            'avg_salary': [75000, 95000, 60000, 85000, 70000, 110000, 105000, 130000, 90000, 65000, 100000, 80000, 120000, 85000, 115000],
            'growth_rate': [15, 22, 10, 18, 13, 25, 31, 40, 8, 5, 20, 12, 35, 15, 18],
            'required_gpa': [3.0, 3.3, 2.8, 3.2, 3.0, 3.5, 3.4, 3.7, 3.5, 2.9, 3.3, 3.1, 3.6, 3.2, 3.8]
        }
        return pd.DataFrame(domains)
    
    def _generate_synthetic_students(self, n_students=100):
        """Generate comprehensive synthetic student dataset"""
        skills_pool = [
            "Python", "JavaScript", "React", "Node.js", "SQL", "SEO", "Analytics",
            "Figma", "Photoshop", "Docker", "Kubernetes", "TensorFlow", "Excel",
            "Sales", "Communication", "AWS", "HTML", "CSS", "Pandas", "C++",
            "Java", "Swift", "Kotlin", "Unity", "Blender", "R", "Tableau",
            "Git", "Linux", "MongoDB", "PostgreSQL", "Redis", "Elasticsearch"
        ]
        
        interests_pool = [
            "AI", "Design", "Marketing", "Security", "Cloud", "Data", "Startups",
            "Finance", "Entrepreneurship", "Web", "Mobile", "Gaming", "Blockchain",
            "IoT", "Robotics", "Machine Learning", "Deep Learning", "Computer Vision"
        ]
        
        experience_pool = [
            "freelance web development", "data analysis projects", "social media management",
            "team lead in hackathons", "mobile app development", "financial modeling",
            "network security research", "UI design for startups", "open source contributions",
            "research assistant", "teaching assistant", "internship at tech company"
        ]
        
        students = []
        for i in range(n_students):
            student = {
                "student_id": f"STU_{i+1:03d}",
                "name": f"Student {i+1}",
                "skills": ", ".join(random.sample(skills_pool, k=random.randint(3, 8))),
                "interests": ", ".join(random.sample(interests_pool, k=random.randint(2, 5))),
                "experience": random.choice(experience_pool),
                "gpa": round(random.uniform(2.5, 4.0), 2),
                "year": random.choice(['freshman', 'sophomore', 'junior', 'senior']),
                "programming_exp": random.randint(0, 5),  # years of programming experience
                "project_count": random.randint(0, 15),
                "preferred_location": random.choice(['Remote', 'San Francisco', 'New York', 'Seattle', 'Austin', 'Boston'])
            }
            students.append(student)
        
        return pd.DataFrame(students)
    
    def build_tfidf_features(self):
        """Build TF-IDF features for domains and students"""
        # Combine domain descriptions
        domain_corpus = self.domains_data['description'].tolist()
        
        # Create student profiles text
        student_corpus = []
        for _, student in self.students_data.iterrows():
            profile_text = f"{student['skills']} {student['interests']} {student['experience']}"
            student_corpus.append(profile_text)
        
        # Fit TF-IDF vectorizer on combined corpus
        combined_corpus = domain_corpus + student_corpus
        tfidf_matrix = self.vectorizer.fit_transform(combined_corpus)
        
        # Split back into domain and student features
        n_domains = len(domain_corpus)
        self.domain_tfidf = tfidf_matrix[:n_domains]
        self.student_tfidf = tfidf_matrix[n_domains:]
        
        return self.domain_tfidf, self.student_tfidf
    
    def create_advanced_features(self):
        """Create advanced numerical features for ML models"""
        # Student features
        student_features = []
        for _, student in self.students_data.iterrows():
            features = [
                student['gpa'],
                student['programming_exp'],
                student['project_count'],
                len(student['skills'].split(', ')),  # skill count
                len(student['interests'].split(', ')),  # interest count
                {'freshman': 1, 'sophomore': 2, 'junior': 3, 'senior': 4}[student['year']]
            ]
            student_features.append(features)
        
        self.student_features = np.array(student_features)
        
        # Domain features
        domain_features = self.domains_data[['difficulty_level', 'market_demand', 'avg_salary', 'growth_rate', 'required_gpa']].values
        self.domain_features = domain_features
        
        return self.student_features, self.domain_features
    
    def perform_clustering_analysis(self, n_clusters=5, n_components=50):
        """Cluster students with K-means on PCA-reduced TF-IDF + numeric features.

        ``build_tfidf_features()`` and ``create_advanced_features()`` must have
        been called first, since this reads ``self.student_tfidf`` and
        ``self.student_features``.

        Args:
            n_clusters: Number of K-means clusters.
            n_components: Upper bound on the number of PCA components. It is
                reduced automatically when the data has fewer samples or
                features than this.

        Returns:
            Array of cluster labels, one per student. The labels are also
            written to ``self.students_data['cluster']`` and a per-cluster
            summary is printed.
        """
        # Combine TF-IDF and (standardised) numerical features side by side.
        tfidf_dense = self.student_tfidf.toarray()
        scaled_numeric = self.scaler.fit_transform(self.student_features)
        combined_features = np.hstack([tfidf_dense, scaled_numeric])

        # PCA cannot extract more components than min(n_samples, n_features);
        # clamp so small cohorts or vocabularies do not raise.
        usable_components = min(n_components, *combined_features.shape)
        pca = PCA(n_components=usable_components)
        features_pca = pca.fit_transform(combined_features)

        # K-means clustering (fixed seed for repeatable assignments).
        self.kmeans_model = KMeans(n_clusters=n_clusters, random_state=42)
        clusters = self.kmeans_model.fit_predict(features_pca)

        self.students_data['cluster'] = clusters

        self._analyze_clusters()

        return clusters
    
    def _analyze_clusters(self):
        """Analyze and describe student clusters"""
        print("\n🔍 STUDENT CLUSTER ANALYSIS")
        print("=" * 50)
        
        for cluster_id in sorted(self.students_data['cluster'].unique()):
            cluster_students = self.students_data[self.students_data['cluster'] == cluster_id]
            
            print(f"\n📊 Cluster {cluster_id} ({len(cluster_students)} students):")
            print(f"   Average GPA: {cluster_students['gpa'].mean():.2f}")
            print(f"   Average Programming Experience: {cluster_students['programming_exp'].mean():.1f} years")
            print(f"   Average Project Count: {cluster_students['project_count'].mean():.1f}")
            
            # Most common skills in cluster
            all_skills = ' '.join(cluster_students['skills']).lower()
            common_skills = []
            for skill in ['python', 'javascript', 'sql', 'react', 'java']:
                if skill in all_skills:
                    common_skills.append(skill)
            print(f"   Common Skills: {', '.join(common_skills[:5])}")
    
    def build_ml_recommender(self):
        """Train the Random Forest on synthetic student/domain preference labels.

        One training row is produced per (student, domain) pair. Its features
        are the student's six numeric features followed by
        ``[cosine_sim, gpa_match, difficulty_match, market_demand, growth_rate]``.

        The label is 1 when a weighted preference score exceeds 0.6. Market
        demand and growth rate are rescaled to [0, 1] inside that score; used
        raw they alone push the score past the threshold for every pair, which
        leaves the classifier with a single class to learn.

        Requires ``build_tfidf_features()`` and ``create_advanced_features()``
        to have been called. Stores the fitted model in ``self.rf_model`` and
        prints its accuracy on the training rows.
        """
        # All student-vs-domain similarities in one call instead of per pair.
        similarity = cosine_similarity(self.student_tfidf, self.domain_tfidf)

        # Scales used only for the label so every term lies in [0, 1].
        demand_scale = 5  # market demand is reported as "x/5"
        max_growth = self.domains_data['growth_rate'].max()
        growth_scale = max_growth if max_growth > 0 else 1

        training_data = []
        training_labels = []

        # Positional counters are used for the matrix lookups so the method
        # does not depend on the DataFrames carrying a default 0..n-1 index.
        for s_pos, (_, student) in enumerate(self.students_data.iterrows()):
            student_numerical = self.student_features[s_pos]

            for d_pos, (_, domain) in enumerate(self.domains_data.iterrows()):
                cosine_sim = similarity[s_pos, d_pos]
                gpa_match = 1 if student['gpa'] >= domain['required_gpa'] else 0
                difficulty_match = self._calculate_difficulty_match(student, domain)

                # Feature layout must match what _recommend_ml feeds the model.
                features = np.concatenate([
                    student_numerical,
                    [cosine_sim, gpa_match, difficulty_match,
                     domain['market_demand'], domain['growth_rate']]
                ])

                # Synthetic label (preference score in [0, 1]).
                preference_score = (
                    cosine_sim * 0.4
                    + gpa_match * 0.2
                    + difficulty_match * 0.2
                    + (domain['market_demand'] / demand_scale) * 0.1
                    + (domain['growth_rate'] / growth_scale) * 0.1
                )
                label = 1 if preference_score > 0.6 else 0

                training_data.append(features)
                training_labels.append(label)

        # Train Random Forest model
        X_train = np.array(training_data)
        y_train = np.array(training_labels)

        self.rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
        self.rf_model.fit(X_train, y_train)

        print(f"\n🤖 ML MODEL TRAINING COMPLETED")
        print(f"Training Accuracy: {self.rf_model.score(X_train, y_train):.3f}")
    
    def _calculate_difficulty_match(self, student, domain):
        """Calculate how well student's level matches domain difficulty"""
        year_levels = {'freshman': 1, 'sophomore': 2, 'junior': 3, 'senior': 4}
        student_level = year_levels[student['year']] + student['programming_exp'] * 0.5
        
        difficulty_diff = abs(student_level - domain['difficulty_level'])
        return max(0, 1 - difficulty_diff / 5)
    
    def get_recommendations(self, user_profile, method='hybrid', top_k=5):
        """Get recommendations using specified method"""
        if method == 'tfidf':
            return self._recommend_tfidf(user_profile, top_k)
        elif method == 'ml':
            return self._recommend_ml(user_profile, top_k)
        elif method == 'hybrid':
            return self._recommend_hybrid(user_profile, top_k)
        else:
            raise ValueError("Method must be 'tfidf', 'ml', or 'hybrid'")
    
    def _recommend_tfidf(self, user_profile, top_k):
        """Rank domains by TF-IDF cosine similarity to the user's profile text.

        Args:
            user_profile: Dict with ``'skills'``, ``'interests'`` and
                ``'experience'`` strings.
            top_k: Maximum number of domains to return; zero or a negative
                value yields an empty list.

        Returns:
            List of dicts (``domain``, ``score``, ``method``, ``details``),
            highest similarity first.
        """
        user_text = f"{user_profile['skills']} {user_profile['interests']} {user_profile['experience']}"
        user_vector = self.vectorizer.transform([user_text])

        similarities = cosine_similarity(user_vector, self.domain_tfidf).flatten()

        # Reverse first, then truncate: slicing with ``[-top_k:]`` would return
        # every domain when top_k is 0, because ``-0`` is just ``0``.
        limit = max(int(top_k), 0)
        top_indices = similarities.argsort()[::-1][:limit]

        recommendations = []
        for idx in top_indices:
            domain = self.domains_data.iloc[idx]
            recommendations.append({
                'domain': domain['domain'],
                'score': similarities[idx],
                'method': 'TF-IDF',
                'details': domain.to_dict()
            })

        return recommendations
    
    def _recommend_ml(self, user_profile, top_k):
        """Machine Learning based recommendations"""
        # Create user features
        user_numerical = [
            user_profile['gpa'],
            user_profile['programming_exp'],
            user_profile['project_count'],
            len(user_profile['skills'].split(', ')),
            len(user_profile['interests'].split(', ')),
            {'freshman': 1, 'sophomore': 2, 'junior': 3, 'senior': 4}[user_profile['year']]
        ]
        
        user_text = f"{user_profile['skills']} {user_profile['interests']} {user_profile['experience']}"
        user_tfidf = self.vectorizer.transform([user_text]).toarray().flatten()
        
        recommendations = []
        for idx, domain in self.domains_data.iterrows():
            # Prepare features for prediction
            cosine_sim = cosine_similarity([user_tfidf], [self.domain_tfidf[idx].toarray().flatten()])[0][0]
            gpa_match = 1 if user_profile['gpa'] >= domain['required_gpa'] else 0
            difficulty_match = self._calculate_difficulty_match_user(user_profile, domain)
            
            features = np.array(user_numerical + [cosine_sim, gpa_match, difficulty_match, domain['market_demand'], domain['growth_rate']]).reshape(1, -1)
            
            # Get prediction probability
            
            proba = self.rf_model.predict_proba(features)[0]
            if len(proba) == 2:
                prob = proba[1]
            else:
                if self.rf_model.classes_[0] == 1:
                    prob = 1.0
                else:
                    prob = 0.0

            
            recommendations.append({
                'domain': domain['domain'],
                'score': prob,
                'method': 'Machine Learning',
                'details': domain.to_dict()
            })
        
        # Sort by score and return top k
        recommendations.sort(key=lambda x: x['score'], reverse=True)
        return recommendations[:top_k]
    
    def _calculate_difficulty_match_user(self, user_profile, domain):
        """Calculate difficulty match for user profile"""
        year_levels = {'freshman': 1, 'sophomore': 2, 'junior': 3, 'senior': 4}
        user_level = year_levels[user_profile['year']] + user_profile['programming_exp'] * 0.5
        
        difficulty_diff = abs(user_level - domain['difficulty_level'])
        return max(0, 1 - difficulty_diff / 5)
    
    def _recommend_hybrid(self, user_profile, top_k):
        """Hybrid recommendation combining multiple methods"""
        tfidf_recs = self._recommend_tfidf(user_profile, top_k * 2)
        ml_recs = self._recommend_ml(user_profile, top_k * 2)
        
        # Combine scores with weights
        domain_scores = {}
        for rec in tfidf_recs:
            domain_scores[rec['domain']] = {'tfidf': rec['score'], 'ml': 0, 'details': rec['details']}
        
        for rec in ml_recs:
            if rec['domain'] in domain_scores:
                domain_scores[rec['domain']]['ml'] = rec['score']
            else:
                domain_scores[rec['domain']] = {'tfidf': 0, 'ml': rec['score'], 'details': rec['details']}
        
        # Calculate hybrid score
        hybrid_recommendations = []
        for domain, scores in domain_scores.items():
            hybrid_score = 0.6 * scores['tfidf'] + 0.4 * scores['ml']
            hybrid_recommendations.append({
                'domain': domain,
                'score': hybrid_score,
                'tfidf_score': scores['tfidf'],
                'ml_score': scores['ml'],
                'method': 'Hybrid',
                'details': scores['details']
            })
        
        # Sort and return top k
        hybrid_recommendations.sort(key=lambda x: x['score'], reverse=True)
        return hybrid_recommendations[:top_k]
    
    
    
    def generate_detailed_report(self, user_profile, recommendations):
        """Generate comprehensive recommendation report"""
        print("\n" + "="*80)
        print(f"🎯 PERSONALIZED INTERNSHIP RECOMMENDATION REPORT")
        print(f"👤 Student: {user_profile['name']}")
        print("="*80)
        
        # User profile summary
        print(f"\n📋 PROFILE SUMMARY:")
        print(f"   GPA: {user_profile['gpa']}")
        print(f"   Academic Year: {user_profile['year'].title()}")
        print(f"   Programming Experience: {user_profile['programming_exp']} years")
        print(f"   Project Count: {user_profile['project_count']}")
        print(f"   Skills: {user_profile['skills']}")
        print(f"   Interests: {user_profile['interests']}")
        
        print(f"\n🏆 TOP {len(recommendations)} RECOMMENDATIONS:")
        print("-" * 60)
        
        for i, rec in enumerate(recommendations, 1):
            details = rec['details']
            print(f"\n{i}. {rec['domain']} (Score: {rec['score']:.3f})")
            print(f"   💰 Average Salary: ${details['avg_salary']:,}")
            print(f"   📈 Growth Rate: {details['growth_rate']}%")
            print(f"   🎯 Market Demand: {details['market_demand']}/5")
            print(f"   🏋️ Difficulty Level: {details['difficulty_level']}/5")
            print(f"   📚 Required GPA: {details['required_gpa']}")
            
            if rec['method'] == 'Hybrid':
                print(f"   🔍 TF-IDF Score: {rec['tfidf_score']:.3f}")
                print(f"   🤖 ML Score: {rec['ml_score']:.3f}")
        
        # Career insights
        print(f"\n💡 CAREER INSIGHTS:")
        avg_salary = np.mean([rec['details']['avg_salary'] for rec in recommendations])
        avg_growth = np.mean([rec['details']['growth_rate'] for rec in recommendations])
        print(f"   Expected Average Salary: ${avg_salary:,.0f}")
        print(f"   Expected Average Growth: {avg_growth:.1f}%")
        
        # Skill gap analysis
        print(f"\n🎯 SKILL DEVELOPMENT RECOMMENDATIONS:")
        user_skills = set(skill.strip().lower() for skill in user_profile['skills'].split(','))
        
        for rec in recommendations[:3]:  # Top 3 recommendations
            domain_desc = rec['details']['description'].lower()
            recommended_skills = []
            
            key_skills = ['python', 'javascript', 'sql', 'aws', 'docker', 'react', 'tensorflow']
            for skill in key_skills:
                if skill in domain_desc and skill not in user_skills:
                    recommended_skills.append(skill.title())
            
            if recommended_skills:
                print(f"   {rec['domain']}: Consider learning {', '.join(recommended_skills[:3])}")

_ACADEMIC_YEARS = ('freshman', 'sophomore', 'junior', 'senior')


def _prompt_until_valid(prompt, parse):
    """Ask ``prompt`` repeatedly until ``parse`` accepts the answer; return its result.

    ``parse`` rejects an answer by raising ``ValueError``; its message is shown
    to the user before asking again.
    """
    while True:
        try:
            return parse(input(prompt))
        except ValueError as exc:
            print(f"   Invalid input ({exc}). Please try again.")


def _parse_gpa(text):
    """Parse a GPA and require it to lie in [0.0, 4.0]."""
    gpa = float(text)
    if not 0.0 <= gpa <= 4.0:  # also rejects NaN
        raise ValueError("GPA must be between 0.0 and 4.0")
    return gpa


def _parse_non_negative_int(text):
    """Parse an integer and require it to be zero or greater."""
    value = int(text)
    if value < 0:
        raise ValueError("value cannot be negative")
    return value


def _parse_year(text):
    """Normalise an academic year and require it to be one of the known four."""
    year = text.strip().lower()
    if year not in _ACADEMIC_YEARS:
        raise ValueError("expected one of " + "/".join(_ACADEMIC_YEARS))
    return year


def main():
    """Run the interactive demo: build the models, read a profile, print results.

    The models are built first, then the user is prompted for a profile.
    Numeric answers and the academic year are validated and re-prompted on bad
    input, because an unparsable number or unknown year would otherwise abort
    the program after the models have already been built.
    """
    print("🚀 ADVANCED AI INTERNSHIP DOMAIN RECOMMENDER")
    print("=" * 60)

    # Initialize recommender system
    recommender = AdvancedInternshipRecommender()

    # Build features and models
    print("\n🔧 Building TF-IDF features...")
    recommender.build_tfidf_features()

    print("🔧 Creating advanced features...")
    recommender.create_advanced_features()

    print("🔧 Performing clustering analysis...")
    recommender.perform_clustering_analysis()

    print("🔧 Building ML recommender...")
    recommender.build_ml_recommender()

    # Get user input
    print("\n" + "="*60)
    print("📝 PLEASE PROVIDE YOUR INFORMATION:")
    print("="*60)

    user_profile = {
        'name': input("Enter your name: "),
        'skills': input("Enter your technical skills (comma-separated): "),
        'interests': input("Enter your interests and career goals: "),
        'experience': input("Describe your experience and projects: "),
        'gpa': _prompt_until_valid("Enter your GPA (0.0-4.0): ", _parse_gpa),
        'year': _prompt_until_valid(
            "Enter your academic year (freshman/sophomore/junior/senior): ",
            _parse_year,
        ),
        'programming_exp': _prompt_until_valid(
            "Enter your programming experience in years (0-10): ",
            _parse_non_negative_int,
        ),
        'project_count': _prompt_until_valid(
            "Enter number of projects you've completed (0-20): ",
            _parse_non_negative_int,
        ),
    }

    # Get recommendations using different methods
    print(f"\n🔍 Analyzing profile for {user_profile['name']}...")

    tfidf_recs = recommender.get_recommendations(user_profile, method='tfidf', top_k=5)
    ml_recs = recommender.get_recommendations(user_profile, method='ml', top_k=5)
    hybrid_recs = recommender.get_recommendations(user_profile, method='hybrid', top_k=5)

    # Display results
    print("\n📊 COMPARISON OF DIFFERENT RECOMMENDATION METHODS:")
    print("="*80)

    methods = [('TF-IDF', tfidf_recs), ('Machine Learning', ml_recs), ('Hybrid', hybrid_recs)]

    for method_name, recs in methods:
        print(f"\n🔍 {method_name.upper()} METHOD:")
        print("-" * 40)
        for i, rec in enumerate(recs, 1):
            print(f"{i}. {rec['domain']} (Score: {rec['score']:.3f})")

    # Generate detailed report for hybrid method
    recommender.generate_detailed_report(user_profile, hybrid_recs)

    print(f"\n✅ RECOMMENDATION ANALYSIS COMPLETE!")
    print("Thank you for using the Advanced AI Internship Domain Recommender! 🎉")

# Run the interactive recommender only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()

🚀 ADVANCED AI INTERNSHIP DOMAIN RECOMMENDER

🔧 Building TF-IDF features...
🔧 Creating advanced features...
🔧 Performing clustering analysis...

🔍 STUDENT CLUSTER ANALYSIS

📊 Cluster 0 (22 students):
   Average GPA: 3.03
   Average Programming Experience: 1.6 years
   Average Project Count: 3.5
   Common Skills: python, javascript, sql, react, java

📊 Cluster 1 (22 students):
   Average GPA: 3.19
   Average Programming Experience: 2.7 years
   Average Project Count: 8.1
   Common Skills: python, javascript, sql, react, java

📊 Cluster 2 (16 students):
   Average GPA: 3.53
   Average Programming Experience: 0.2 years
   Average Project Count: 5.8
   Common Skills: python, javascript, sql, react, java

📊 Cluster 3 (18 students):
   Average GPA: 3.85
   Average Programming Experience: 3.6 years
   Average Project Count: 10.1
   Common Skills: python, javascript, sql, react, java

📊 Cluster 4 (22 students):
   Average GPA: 3.02
   Average Programming Experience: 2.8 years
   Average Project