In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from datetime import datetime
import json
from urllib.parse import urljoin, urlparse
import random
from collections import Counter

class UnifiedInternshipScraper:
    def __init__(self):
        """Create a scraper with browser-like headers, one HTTP session and no results.

        The header dict is kept on ``self.headers`` and is also installed as the
        default header set of ``self.session``. Scraped rows are accumulated in
        ``self.internships_data``.
        """
        self.internships_data = []
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

        # One session is reused for every request made by this scraper.
        http_session = requests.Session()
        http_session.headers.update(self.headers)
        self.session = http_session
        
    def extract_stipend_value(self, stipend_text):
        """Extract numeric stipend value from text"""
        if not stipend_text or 'unpaid' in stipend_text.lower():
            return 0
        
        numbers = re.findall(r'[\d,]+', stipend_text.replace(',', ''))
        if numbers:
            nums = [int(n) for n in numbers]
            return sum(nums) / len(nums)
        return 0
    
    def categorize_domain(self, title, skills=""):
        """Categorize internship into domain based on title and skills"""
        title_lower = title.lower()
        skills_lower = skills.lower() if skills else ""
        combined = f"{title_lower} {skills_lower}"
        
        categories = {
            'Tech': ['software', 'developer', 'programming', 'python', 'java', 'react', 'node', 'full stack', 'backend', 'frontend', 'web dev', 'app dev', 'data science', 'machine learning', 'ai', 'ml', 'javascript', 'html', 'css', 'database', 'sql'],
            'Design': ['ui', 'ux', 'graphic', 'design', 'figma', 'photoshop', 'illustrator', 'canva', 'creative', 'adobe', 'designer'],
            'Marketing': ['marketing', 'digital marketing', 'seo', 'social media', 'content', 'advertising', 'brand', 'campaign', 'google ads', 'facebook ads'],
            'Sales': ['sales', 'business development', 'bd', 'lead generation', 'customer acquisition', 'crm', 'salesforce'],
            'Finance': ['finance', 'accounting', 'financial', 'investment', 'banking', 'audit', 'excel', 'tally'],
            'Operations': ['operations', 'logistics', 'supply chain', 'project management', 'admin', 'coordinator'],
            'HR': ['human resource', 'hr', 'recruitment', 'talent acquisition', 'people', 'hiring'],
            'Content': ['content writing', 'copywriting', 'blog', 'writer', 'journalism', 'editor', 'article'],
            'Research': ['research', 'analyst', 'market research', 'data analyst', 'business analyst', 'research associate']
        }
        
        for category, keywords in categories.items():
            if any(keyword in combined for keyword in keywords):
                return category
        
        return 'Other'
    
    def extract_job_url(self, container):
        """Extract the detailed job/internship URL"""
        job_url = None
        
        
        url_selectors = [
            'a[href*="/internship/detail/"]',
            'a[href*="/internship/"]',
            '.view_detail_button a',
            '.view-details a',
            'a[title*="detail"]',
            'h3 a',
            'h4 a',
            '.heading_4_5 a'
        ]
        
        for selector in url_selectors:
            links = container.select(selector)
            for link in links:
                href = link.get('href', '')
                if href and ('internship' in href or 'job' in href):
                    
                    if not href.startswith('http'):
                        job_url = 'https://internshala.com' + href
                    else:
                        job_url = href
                    break
            if job_url:
                break
        
        return job_url
    
    def extract_skills_from_listing(self, container):
        """Extract skills from the job listing container"""
        skills_raw = ""
        
        
        skills_selectors = [
            '.round_tabs_container',
            '.round_tabs',
            '.skills-required',
            '.skills',
            '.technologies'
        ]
        
        for selector in skills_selectors:
            skills_container = container.select_one(selector)
            if skills_container:
                skills_raw = skills_container.get_text(' ', strip=True)
                break
            else:
                
                skill_elements = container.select(selector)
                if skill_elements:
                    skills_raw = ', '.join([skill.get_text(strip=True) for skill in skill_elements])
                    break
        
        
        if not skills_raw:
            container_text = container.get_text()
            skills_patterns = [
                r'Skills?[:\s]*([^\n]+)',
                r'Requirements?[:\s]*([^\n]+)', 
                r'Tools?[:\s]*([^\n]+)',
                r'Technologies?[:\s]*([^\n]+)',
                r'Qualifications?[:\s]*([^\n]+)'
            ]
            
            for pattern in skills_patterns:
                skills_match = re.search(pattern, container_text, re.IGNORECASE)
                if skills_match:
                    skills_raw = skills_match.group(1).strip()
                    break
        
        return self.clean_skills(skills_raw)
    
    def extract_skills_from_detail_page(self, job_url):
        """Extract detailed skills from the job detail page"""
        if not job_url:
            return ""
        
        try:
            print(f"  Fetching detailed skills from: {job_url}")
            response = self.session.get(job_url, timeout=15)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.content, 'html.parser')
            
            
            detail_skills_selectors = [
                '.round_tabs_container',
                '.skills-required',
                '.skill-tags',
                '.requirements',
                '.qualifications',
                'div[class*="skill"]',
                'section[class*="skill"]',
                '.round_tabs',
                'span.round_tabs',
                '.other_details_class .round_tabs_container'
            ]
            
            skills_text = ""
            
            for selector in detail_skills_selectors:
                try:
                    skills_section = soup.select_one(selector)
                    if skills_section:
                        skills_text = skills_section.get_text(' ', strip=True)
                        if skills_text and len(skills_text) > 3:
                            break
                    else:
                        
                        skill_elements = soup.select(selector)
                        if skill_elements:
                            skills_list = []
                            for elem in skill_elements:
                                text = elem.get_text(strip=True)
                                if text and len(text) < 50:  
                                    skills_list.append(text)
                            if skills_list:
                                skills_text = ', '.join(skills_list)
                                break
                except Exception as e:
                    continue
            
            
            if not skills_text:
                page_text = soup.get_text()
                skills_patterns = [
                    r'Skills?[:\s]*([^\n\r]{10,200})',
                    r'Requirements?[:\s]*([^\n\r]{10,200})',
                    r'Qualifications?[:\s]*([^\n\r]{10,200})',
                    r'Technologies?[:\s]*([^\n\r]{10,200})',
                    r'Tools?[:\s]*([^\n\r]{10,200})'
                ]
                
                for pattern in skills_patterns:
                    match = re.search(pattern, page_text, re.IGNORECASE | re.MULTILINE)
                    if match:
                        skills_text = match.group(1).strip()
                        break
            
            return self.clean_skills(skills_text)
            
        except Exception as e:
            print(f"  Error fetching skills from detail page: {e}")
            return ""
    
    def clean_skills(self, skills_text):
        """Clean and standardize skills text"""
        if not skills_text or skills_text == "Not specified":
            return ""
        
        
        skills_text = re.sub(r'[^\w\s,.\-+#()]', '', skills_text)
        
        skill_mappings = {
            'photoshop': 'Adobe Photoshop',
            'illustrator': 'Adobe Illustrator', 
            'ms office': 'Microsoft Office',
            'excel': 'Microsoft Excel',
            'powerpoint': 'Microsoft PowerPoint',
            'word': 'Microsoft Word',
            'js': 'JavaScript',
            'html5': 'HTML',
            'css3': 'CSS',
            'nodejs': 'Node.js',
            'reactjs': 'React.js',
            'vuejs': 'Vue.js',
            'angularjs': 'Angular',
            'mysql': 'MySQL',
            'postgresql': 'PostgreSQL',
            'mongodb': 'MongoDB'
        }
        
        skills = []
        
        for separator in [',', 'and', '&', '|', ';']:
            if separator in skills_text.lower():
                parts = re.split(rf'\s*{re.escape(separator)}\s*', skills_text, flags=re.IGNORECASE)
                break
        else:
            parts = [skills_text]
        
        for part in parts:
            part = part.strip()
            if part and len(part) > 1:
                
                subparts = re.split(r'\s+and\s+', part, flags=re.IGNORECASE)
                for subpart in subparts:
                    subpart = subpart.strip()
                    if subpart and len(subpart) > 1:
                        subpart_lower = subpart.lower()
                        mapped_skill = skill_mappings.get(subpart_lower, subpart)
                        if len(mapped_skill) > 1 and len(mapped_skill) < 50:  
                            skills.append(mapped_skill)
        
        unique_skills = []
        seen = set()
        for skill in skills:
            skill_lower = skill.lower()
            if skill_lower not in seen and len(unique_skills) < 15:
                unique_skills.append(skill)
                seen.add(skill_lower)
        
        return ', '.join(unique_skills)
    
    def scrape_internshala(self, max_pages=5, fetch_detailed_skills=True):
        """Scrape internships from Internshala with comprehensive data"""
        print("Starting comprehensive Internshala scraping...")
        base_url = "https://internshala.com/internships"
        
        for page in range(1, max_pages + 1):
            try:
                print(f"\nScraping page {page}...")
                url = f"{base_url}/page-{page}" if page > 1 else base_url
                
                response = self.session.get(url, timeout=10)
                response.raise_for_status()
                
                soup = BeautifulSoup(response.content, 'html.parser')
                
                internship_containers = []
                selectors_to_try = [
                    'div.internship_meta',
                    'div[class*="internship"]',
                    'div.individual_internship',
                    'div.internship-item',
                    'div[data-internship-id]',
                    'div.container-fluid.individual_internship',
                    'div.view_detail_button'
                ]
                
                for selector in selectors_to_try:
                    try:
                        containers = soup.select(selector)
                        if containers:
                            internship_containers = containers
                            print(f"Found {len(containers)} internships using selector: {selector}")
                            break
                    except:
                        continue
                
                if not internship_containers:
                    all_divs = soup.find_all('div')
                    for div in all_divs:
                        if div.get_text() and any(keyword in div.get_text().lower() for keyword in ['internship', 'stipend', 'apply by']):
                            text = div.get_text().lower()
                            if 'stipend' in text and ('month' in text or 'week' in text):
                                internship_containers.append(div)
                    
                    internship_containers = [div for div in internship_containers if len(div.get_text()) > 100][:20]
                    print(f"Found {len(internship_containers)} internships using broad search")
                
                if not internship_containers:
                    print(f"No internships found on page {page}")
                    break
                
                for i, container in enumerate(internship_containers):
                    try:
                        print(f"Processing internship {i+1}/{len(internship_containers)}")
                        internship_data = self.extract_comprehensive_data(container, fetch_detailed_skills)
                        if internship_data:
                            self.internships_data.append(internship_data)
                            print(f"✓ Successfully extracted: {internship_data['job_title']} at {internship_data['company_name']}")
                            if internship_data['skills_required']:
                                print(f"  Skills: {internship_data['skills_required'][:100]}...")
                    except Exception as e:
                        print(f"Error extracting internship data {i+1}: {e}")
                        continue
                
                print(f"Page {page} complete. Total internships so far: {len(self.internships_data)}")
                time.sleep(random.uniform(2, 4))
                
            except Exception as e:
                print(f"Error scraping page {page}: {e}")
                continue
        
        print(f"\nScraped {len(self.internships_data)} internships from Internshala")
    
    def extract_comprehensive_data(self, container, fetch_detailed_skills=True):
        """Extract comprehensive data from individual internship container"""
        try:
            container_text = container.get_text()
            
            company_name = "Not specified"
            company_selectors = [
                'a.link_display_like_text',
                'a[href*="company"]',
                'h4 a',
                'h5 a',
                '.company-name'
            ]
            
            for selector in company_selectors:
                company_elem = container.select_one(selector)
                if company_elem:
                    company_name = company_elem.get_text(strip=True)
                    break
            
            if company_name == "Not specified":
                company_match = re.search(r'Company[:\s]*([^\n]+)', container_text, re.IGNORECASE)
                if company_match:
                    company_name = company_match.group(1).strip()
            
            job_title = "Not specified"
            title_selectors = [
                'h3.heading_4_5',
                'h4.heading_4_5',
                'h3',
                'h4',
                '.job-title',
                '.internship-title'
            ]
            
            for selector in title_selectors:
                title_elem = container.select_one(selector)
                if title_elem:
                    job_title = title_elem.get_text(strip=True)
                    break
            
            if job_title == "Not specified":
                lines = container_text.split('\n')
                for line in lines:
                    if 'intern' in line.lower() and len(line.strip()) < 100:
                        job_title = line.strip()
                        break
            
            location = "Not specified"
            location_selectors = [
                'a[href*="location"]',
                '.location',
                '.internship-location'
            ]
            
            for selector in location_selectors:
                location_elem = container.select_one(selector)
                if location_elem:
                    location = location_elem.get_text(strip=True)
                    break
            
            if location == "Not specified":
                location_patterns = [
                    r'Location[:\s]*([^\n]+)',
                    r'Work from home|Remote',
                    r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),?\s*(?:India|IN)?'
                ]
                for pattern in location_patterns:
                    location_match = re.search(pattern, container_text, re.IGNORECASE)
                    if location_match:
                        location = location_match.group(1).strip() if location_match.groups() else location_match.group(0)
                        break
            
            
            stipend_raw = "Not specified"
            stipend_selectors = [
                'span.stipend',
                '.stipend',
                '.salary',
                '.compensation'
            ]
            
            for selector in stipend_selectors:
                stipend_elem = container.select_one(selector)
                if stipend_elem:
                    stipend_raw = stipend_elem.get_text(strip=True)
                    break
            
            if stipend_raw == "Not specified":
                stipend_patterns = [
                    r'₹\s*[\d,]+(?:\s*-\s*₹?\s*[\d,]+)?',
                    r'Stipend[:\s]*([^\n]+)',
                    r'(\d+(?:,\d+)*)\s*(?:per month|/month)',
                    r'Unpaid',
                    r'Performance based'
                ]
                for pattern in stipend_patterns:
                    stipend_match = re.search(pattern, container_text, re.IGNORECASE)
                    if stipend_match:
                        stipend_raw = stipend_match.group(0)
                        break
            
            stipend_numeric = self.extract_stipend_value(stipend_raw)
            
            duration_raw = "Not specified"
            duration_patterns = [
                r'(\d+)\s*months?',
                r'(\d+)\s*weeks?',
                r'Duration[:\s]*([^\n]+)'
            ]
            
            for pattern in duration_patterns:
                duration_match = re.search(pattern, container_text, re.IGNORECASE)
                if duration_match:
                    duration_raw = duration_match.group(0)
                    break
            
            skills_from_listing = self.extract_skills_from_listing(container)
            detailed_skills = ""
            
            if fetch_detailed_skills:
                job_url = self.extract_job_url(container)
                if job_url:
                    detailed_skills = self.extract_skills_from_detail_page(job_url)
                    time.sleep(random.uniform(1, 2))  
            
            final_skills = detailed_skills if detailed_skills else skills_from_listing
            
            intern_type = "Unpaid" if stipend_numeric == 0 else "Paid"
            if "performance" in stipend_raw.lower():
                intern_type = "Performance-based"
            
            domain_category = self.categorize_domain(job_title, final_skills)
            
            result = {
                'job_title': job_title,
                'domain_category': domain_category,
                'skills_required': final_skills,
                'location': location,
                'stipend': stipend_numeric,
                'stipend_raw': stipend_raw,
                'duration_months': self.parse_duration(duration_raw),
                'duration_raw': duration_raw,
                'posted_on': datetime.now().strftime('%Y-%m-%d'),
                'intern_type': intern_type,
                'location_type': self.categorize_location(location),
                'skills_count': len(final_skills.split(',')) if final_skills else 0,
                'has_stipend': stipend_numeric > 0,
                'company_name': company_name,  
                'platform': 'Internshala'  
            }
            
            if (company_name != "Not specified" or job_title != "Not specified") and len(container_text) > 50:
                return result
            else:
                return None
            
        except Exception as e:
            print(f"Error parsing internship container: {e}")
            return None
    
    def parse_duration(self, duration_text):
        """Parse duration text to months"""
        if not duration_text:
            return 0
        
        months_match = re.search(r'(\d+)\s*month', duration_text.lower())
        if months_match:
            return int(months_match.group(1))
        
        weeks_match = re.search(r'(\d+)\s*week', duration_text.lower())
        if weeks_match:
            return round(int(weeks_match.group(1)) / 4.33, 1)
        
        return 0
    
    def categorize_location(self, location):
        """Categorize location type"""
        location_lower = location.lower()
        if 'remote' in location_lower or 'work from home' in location_lower:
            return 'Remote'
        elif 'hybrid' in location_lower:
            return 'Hybrid'
        else:
            return 'Onsite'
    
    def clean_and_structure_data(self):
        """Clean and structure the scraped data"""
        if not self.internships_data:
            print("No data to clean")
            return pd.DataFrame()
        
        df = pd.DataFrame(self.internships_data)
        
        required_columns = [
            'job_title', 'domain_category', 'skills_required', 'location', 
            'stipend', 'stipend_raw', 'duration_months', 'duration_raw', 
            'posted_on', 'intern_type', 'location_type', 'skills_count', 'has_stipend'
        ]
        
        for col in required_columns:
            if col not in df.columns:
                df[col] = None
        
        df = df[required_columns]
        
        df = df.drop_duplicates(subset=['job_title', 'location', 'stipend'], keep='first')
        
        df['skills_required'] = df['skills_required'].fillna('').astype(str)
        df['skills_count'] = df['skills_required'].apply(lambda x: len([s.strip() for s in x.split(',') if s.strip()]) if x else 0)
        
        return df
    
    def save_data(self, filename='comprehensive_internship_data.csv'):
        """Save cleaned data to CSV with exact column format"""
        df = self.clean_and_structure_data()
        if not df.empty:
            df.to_csv(filename, index=False)
            print(f"\nData saved to {filename}")
            print(f"Total internships scraped: {len(df)}")
            print(f"Columns: {list(df.columns)}")
            return df
        else:
            print("No data to save")
            return pd.DataFrame()
    
    def get_comprehensive_summary(self):
        """Get comprehensive summary statistics"""
        df = self.clean_and_structure_data()
        if df.empty:
            return "No data available"
        
        all_skills = []
        for skills_str in df['skills_required']:
            if skills_str:
                skills_list = [skill.strip() for skill in skills_str.split(',') if skill.strip()]
                all_skills.extend(skills_list)
        
        skill_counts = Counter(all_skills)
        
        stats = {
            'Total Internships': len(df),
            'Paid Internships': len(df[df['has_stipend'] == True]),
            'Unpaid Internships': len(df[df['has_stipend'] == False]),
            'Average Stipend (Paid)': round(df[df['stipend'] > 0]['stipend'].mean(), 2) if len(df[df['stipend'] > 0]) > 0 else 0,
            'Internships with Skills': len(df[df['skills_count'] > 0]),
            'Average Skills per Internship': round(df['skills_count'].mean(), 2),
            'Top 5 Domains': df['domain_category'].value_counts().head(5).to_dict(),
            'Location Types': df['location_type'].value_counts().to_dict(),
            'Internship Types': df['intern_type'].value_counts().to_dict(),
            'Top 10 Skills': dict(skill_counts.most_common(10)),
            'Unique Skills Found': len(skill_counts)
        }
        
        return stats

def main():
    """Run the scraper end to end: scrape, save to CSV, print a summary.

    Scrapes three Internshala listing pages (with detail-page skills), writes
    the cleaned data to ``unified_internship_dataset.csv`` and prints summary
    statistics plus the first two rows.

    Returns:
        The cleaned DataFrame; empty when nothing was scraped.
    """
    scraper = UnifiedInternshipScraper()

    scraper.scrape_internshala(max_pages=3, fetch_detailed_skills=True)

    df = scraper.save_data('unified_internship_dataset.csv')

    stats = scraper.get_comprehensive_summary()

    print("\n" + "="*50)
    print("COMPREHENSIVE SCRAPING SUMMARY")
    print("="*50)

    # The summary is a plain message string when no data was scraped;
    # calling .items() on it would raise AttributeError.
    if isinstance(stats, dict):
        for key, value in stats.items():
            print(f"{key}: {value}")
    else:
        print(stats)

    if not df.empty:
        print("\n" + "="*50)
        print("SAMPLE DATA")
        print("="*50)
        print(df.head(2).to_string())

    return df

# Run the full scrape only when executed as a script (not on import); the
# resulting DataFrame is bound to the module-level name `df`.
if __name__ == "__main__":
    df = main()

Starting comprehensive Internshala scraping...

Scraping page 1...
Found 50 internships using selector: div.internship_meta
Processing internship 1/50
  Fetching detailed skills from: https://internshala.com/internship/detail/business-development-sales-internship-in-ahmedabad-at-think-que-consulting-private-limited1751050913
✓ Successfully extracted: Business Development (Sales) at Think Que Consulting Private Limited
Processing internship 2/50
  Fetching detailed skills from: https://internshala.com/internship/detail/part-time-graphic-design-internship-in-mohali-at-alphanumeric-ideas-private-limited1752127866
✓ Successfully extracted: Graphic Design at Alphanumeric Ideas Private Limited
Processing internship 3/50
  Fetching detailed skills from: https://internshala.com/internship/detail/sales-internship-in-mumbai-at-matchlog-solutions-pvt-ltd1750246847
✓ Successfully extracted: Sales at MatchLog Solutions Private Limited
Processing internship 4/50
  Fetching detailed skills from: http