In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from datetime import datetime
import json
from urllib.parse import urljoin, urlparse
import random

class InternshipScraper:
    def __init__(self):
        """Create the scraper with a pre-configured HTTP session.

        Sets ``self.headers`` (browser-like request headers), ``self.session``
        (a ``requests.Session`` carrying those headers) and
        ``self.internships_data`` (an initially empty list of scraped records).
        """
        user_agent = (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/91.0.4472.124 Safari/537.36'
        )
        accepted_types = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'

        # Scraped records are appended here by the scrape_* methods.
        self.internships_data = []

        self.headers = {
            'User-Agent': user_agent,
            'Accept': accepted_types,
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

        # Every request made through the session carries the headers above.
        session = requests.Session()
        session.headers.update(self.headers)
        self.session = session
        
    def extract_stipend_value(self, stipend_text):
        """Turn a stipend description into a single number.

        Returns 0 for empty text, for text containing "unpaid" (any case) and
        for text without digits. Otherwise returns the arithmetic mean of
        every integer found after commas are removed, so a range such as
        "5,000-10,000" yields its midpoint.
        """
        if not stipend_text:
            return 0
        if 'unpaid' in stipend_text.lower():
            return 0

        # Drop thousands separators so "5,000" is read as one number.
        without_commas = stipend_text.replace(',', '')
        amounts = list(map(int, re.findall(r'[\d,]+', without_commas)))
        if not amounts:
            return 0
        return sum(amounts) / len(amounts)
    
    def categorize_domain(self, title, skills=""):
        """Categorize an internship into a domain from its title and skills.

        Args:
            title: Job title text; ``None`` or empty is treated as no title.
            skills: Optional skills text that is searched together with the title.

        Returns:
            The name of the first category, in the order listed below, that
            has a matching keyword, or ``'Other'`` when nothing matches.
        """
        combined = f"{(title or '').lower()} {(skills or '').lower()}"

        # Order is priority. 'Content' is checked before 'Marketing' because
        # Marketing's generic 'content' keyword would otherwise claim every
        # "content writing" role and leave that keyword unreachable.
        categories = {
            'Tech': ['software', 'developer', 'programming', 'python', 'java', 'react', 'node', 'full stack', 'backend', 'frontend', 'web dev', 'app dev', 'data science', 'machine learning', 'ai', 'ml'],
            'Design': ['ui', 'ux', 'graphic', 'design', 'figma', 'photoshop', 'illustrator', 'canva', 'creative'],
            'Content': ['content writing', 'copywriting', 'blog', 'writer', 'journalism', 'editor'],
            'Marketing': ['marketing', 'digital marketing', 'seo', 'social media', 'content', 'advertising', 'brand', 'campaign'],
            'Sales': ['sales', 'business development', 'bd', 'lead generation', 'customer acquisition'],
            'Finance': ['finance', 'accounting', 'financial', 'investment', 'banking', 'audit'],
            'Operations': ['operations', 'logistics', 'supply chain', 'project management', 'admin'],
            'HR': ['human resource', 'hr', 'recruitment', 'talent acquisition', 'people'],
            'Research': ['research', 'analyst', 'market research', 'data analyst', 'business analyst']
        }

        def keyword_matches(keyword):
            # Abbreviations ('ai', 'ui', 'hr', 'ml', ...) must stand alone as
            # words: as plain substrings they fire inside "email",
            # "recruitment", "chrome" or "html". Longer keywords keep
            # substring matching so "designer" still matches 'design'.
            if len(keyword) <= 3:
                return re.search(rf'\b{re.escape(keyword)}\b', combined) is not None
            return keyword in combined

        for category, keywords in categories.items():
            if any(keyword_matches(keyword) for keyword in keywords):
                return category

        return 'Other'
    
    def scrape_internshala(self, max_pages=5):
        """Scrape internship listings from Internshala's paginated index.

        For every page the method tries a list of CSS selectors and uses the
        first one that matches anything; if none match it falls back to a
        text scan of all ``div`` elements. Each container is handed to
        ``extract_internshala_data`` and every non-empty result is appended
        to ``self.internships_data``.

        Scraping stops early when a page yields no containers (its HTML is
        written to ``debug_page_<n>.html``). A page that raises is reported
        and skipped. The method sleeps 2-4 seconds after each page that was
        processed without error.

        Args:
            max_pages: Number of listing pages to request, starting at page 1.

        Returns:
            None. Results accumulate in ``self.internships_data``.
        """
        print("Starting Internshala scraping...")
        base_url = "https://internshala.com/internships"

        # Tried in order; the first selector that matches anything wins.
        selectors_to_try = [
            'div.internship_meta',
            'div[class*="internship"]',
            'div.individual_internship',
            'div.internship-item',
            'div[data-internship-id]',
            'div.container-fluid.individual_internship',
            'div.view_detail_button'
        ]

        for page in range(1, max_pages + 1):
            try:
                print(f"Scraping page {page}...")
                url = f"{base_url}/page-{page}" if page > 1 else base_url

                response = self.session.get(url, timeout=10)
                response.raise_for_status()

                soup = BeautifulSoup(response.content, 'html.parser')

                internship_containers = []
                for selector in selectors_to_try:
                    try:
                        containers = soup.select(selector)
                    except Exception:
                        # A selector the parser rejects is skipped, not fatal.
                        # Catching Exception (not a bare except) keeps Ctrl-C working.
                        continue
                    if containers:
                        internship_containers = containers
                        print(f"Found {len(containers)} internships using selector: {selector}")
                        break

                if not internship_containers:
                    # Fallback: keep divs longer than 100 characters whose text
                    # mentions a stipend and a month/week period; cap at 20.
                    for div in soup.find_all('div'):
                        raw_text = div.get_text()
                        text = raw_text.lower()
                        if len(raw_text) > 100 and 'stipend' in text and ('month' in text or 'week' in text):
                            internship_containers.append(div)

                    internship_containers = internship_containers[:20]
                    print(f"Found {len(internship_containers)} internships using broad search")

                if not internship_containers:
                    print(f"No internships found on page {page}")
                    with open(f'debug_page_{page}.html', 'w', encoding='utf-8') as f:
                        f.write(soup.prettify())
                    print(f"Saved HTML to debug_page_{page}.html for inspection")
                    break

                for i, container in enumerate(internship_containers):
                    try:
                        print(f"Processing internship {i+1}/{len(internship_containers)}")
                        internship_data = self.extract_internshala_data(container)
                        if internship_data:
                            self.internships_data.append(internship_data)
                            print(f"Successfully extracted: {internship_data['job_title']} at {internship_data['company_name']}")
                    except Exception as e:
                        print(f"Error extracting internship data {i+1}: {e}")
                        continue

                print(f"Page {page} complete. Total internships so far: {len(self.internships_data)}")

                # Random pause between pages to avoid hammering the site.
                time.sleep(random.uniform(2, 4))

            except Exception as e:
                print(f"Error scraping page {page}: {e}")
                continue

        print(f"Scraped {len(self.internships_data)} internships from Internshala")
    
    def extract_internshala_data(self, container):
        """Extract data from individual internship container"""
        try:
            container_text = container.get_text()
            company_name = "Not specified"
            company_selectors = [
                'a.link_display_like_text',
                'a[href*="company"]',
                'h4 a',
                'h5 a',
                '.company-name'
            ]
            
            for selector in company_selectors:
                company_elem = container.select_one(selector)
                if company_elem:
                    company_name = company_elem.get_text(strip=True)
                    break
            
            if company_name == "Not specified":
                company_match = re.search(r'Company[:\s]*([^\n]+)', container_text, re.IGNORECASE)
                if company_match:
                    company_name = company_match.group(1).strip()
            
            job_title = "Not specified"
            title_selectors = [
                'h3.heading_4_5',
                'h4.heading_4_5',
                'h3',
                'h4',
                '.job-title',
                '.internship-title'
            ]
            
            for selector in title_selectors:
                title_elem = container.select_one(selector)
                if title_elem:
                    job_title = title_elem.get_text(strip=True)
                    break
            
            if job_title == "Not specified":
                lines = container_text.split('\n')
                for line in lines:
                    if 'intern' in line.lower() and len(line.strip()) < 100:
                        job_title = line.strip()
                        break
            
            location = "Not specified"
            location_selectors = [
                'a[href*="location"]',
                '.location',
                '.internship-location'
            ]
            
            for selector in location_selectors:
                location_elem = container.select_one(selector)
                if location_elem:
                    location = location_elem.get_text(strip=True)
                    break
            
            if location == "Not specified":
                location_patterns = [
                    r'Location[:\s]*([^\n]+)',
                    r'Work from home|Remote',
                    r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),?\s*(?:India|IN)?'
                ]
                for pattern in location_patterns:
                    location_match = re.search(pattern, container_text, re.IGNORECASE)
                    if location_match:
                        location = location_match.group(1).strip() if location_match.groups() else location_match.group(0)
                        break
        
            stipend_raw = "Not specified"
            stipend_selectors = [
                'span.stipend',
                '.stipend',
                '.salary',
                '.compensation'
            ]
            
            for selector in stipend_selectors:
                stipend_elem = container.select_one(selector)
                if stipend_elem:
                    stipend_raw = stipend_elem.get_text(strip=True)
                    break
            
            if stipend_raw == "Not specified":
                stipend_patterns = [
                    r'₹\s*[\d,]+(?:\s*-\s*₹?\s*[\d,]+)?',
                    r'Stipend[:\s]*([^\n]+)',
                    r'(\d+(?:,\d+)*)\s*(?:per month|/month)',
                    r'Unpaid',
                    r'Performance based'
                ]
                for pattern in stipend_patterns:
                    stipend_match = re.search(pattern, container_text, re.IGNORECASE)
                    if stipend_match:
                        stipend_raw = stipend_match.group(0)
                        break
            
            stipend_numeric = self.extract_stipend_value(stipend_raw)
            
            duration_raw = "Not specified"
            duration_patterns = [
                r'(\d+)\s*months?',
                r'(\d+)\s*weeks?',
                r'Duration[:\s]*([^\n]+)'
            ]
            
            for pattern in duration_patterns:
                duration_match = re.search(pattern, container_text, re.IGNORECASE)
                if duration_match:
                    duration_raw = duration_match.group(0)
                    break
            
            skills_raw = ""
            skills_patterns = [
                r'Skills?[:\s]*([^\n]+)',
                r'Requirements?[:\s]*([^\n]+)',
                r'Tools?[:\s]*([^\n]+)'
            ]
            
            for pattern in skills_patterns:
                skills_match = re.search(pattern, container_text, re.IGNORECASE)
                if skills_match:
                    skills_raw = skills_match.group(1).strip()
                    break
            
            intern_type = "Unpaid" if stipend_numeric == 0 else "Paid"
            if "performance" in stipend_raw.lower():
                intern_type = "Performance-based"
            
            domain_category = self.categorize_domain(job_title, skills_raw)
            
            result = {
                'platform': 'Internshala',
                'company_name': company_name,
                'job_title': job_title,
                'domain_category': domain_category,
                'skills_required': skills_raw,
                'location': location,
                'stipend': stipend_numeric,
                'stipend_raw': stipend_raw,
                'duration_months': self.parse_duration(duration_raw),
                'duration_raw': duration_raw,
                'posted_on': datetime.now().strftime('%Y-%m-%d'),
                'intern_type': intern_type,
                'location_type': self.categorize_location(location)
            }
            
            if (company_name != "Not specified" or job_title != "Not specified") and len(container_text) > 50:
                return result
            else:
                return None
            
        except Exception as e:
            print(f"Error parsing internship container: {e}")
            return None
    
    def parse_duration(self, duration_text):
        """Convert a free-text duration into a number of months.

        A "<n> month" figure is returned as an int. Otherwise a "<n> week"
        figure is divided by 4.33 and rounded to one decimal place. Empty
        text, or text with neither unit, gives 0.
        """
        if not duration_text:
            return 0

        lowered = duration_text.lower()

        found = re.search(r'(\d+)\s*month', lowered)
        if found is not None:
            return int(found.group(1))

        found = re.search(r'(\d+)\s*week', lowered)
        if found is None:
            return 0

        # Weeks are converted at 4.33 weeks per month.
        weeks = int(found.group(1))
        return round(weeks / 4.33, 1)
    
    def categorize_location(self, location):
        """Classify a location string as 'Remote', 'Hybrid' or 'Onsite'.

        The check is case-insensitive. Remote markers ("remote",
        "work from home") take precedence over "hybrid"; anything else is
        treated as onsite.
        """
        lowered = location.lower()
        if any(marker in lowered for marker in ('remote', 'work from home')):
            return 'Remote'
        return 'Hybrid' if 'hybrid' in lowered else 'Onsite'
    
    def scrape_linkedin_internships(self, max_results=50):
        """Placeholder for LinkedIn scraping: prints two notices, scrapes nothing.

        ``max_results`` is accepted but not used.
        """
        notices = (
            "LinkedIn scraping requires more complex setup due to login requirements",
            "For now, focusing on Internshala. You can extend this for LinkedIn later.",
        )
        for notice in notices:
            print(notice)
    
    def clean_and_structure_data(self):
        """Clean and structure the scraped data"""
        if not self.internships_data:
            print("No data to clean")
            return pd.DataFrame()
        
        df = pd.DataFrame(self.internships_data)
        
        df['skills_required'] = df['skills_required'].apply(self.clean_skills)
        
        df = df.drop_duplicates(subset=['company_name', 'job_title', 'location'], keep='first')
        
        df['skills_count'] = df['skills_required'].apply(lambda x: len(x.split(',')) if x else 0)
        df['has_stipend'] = df['stipend'] > 0
        
        return df
    
    def clean_skills(self, skills_text):
        """Normalise a comma-separated skills string.

        Each entry is stripped, mapped to a canonical product name when its
        lower-cased form is in the table below, and dropped if the result is
        a single character or empty. At most the first five entries are
        kept, joined with ", ". Empty input or "Not specified" gives "".
        """
        if not skills_text:
            return ""
        if skills_text == "Not specified":
            return ""

        canonical_names = {
            'photoshop': 'Adobe Photoshop',
            'illustrator': 'Adobe Illustrator',
            'ms office': 'Microsoft Office',
            'excel': 'Microsoft Excel',
            'powerpoint': 'Microsoft PowerPoint',
            'word': 'Microsoft Word'
        }

        stripped = (part.strip() for part in skills_text.split(','))
        normalised = (canonical_names.get(name.lower(), name) for name in stripped)
        kept = [name for name in normalised if len(name) > 1]

        return ', '.join(kept[:5])
    
    def save_data(self, filename='internship_data.csv'):
        """Save cleaned data to CSV"""
        df = self.clean_and_structure_data()
        if not df.empty:
            df.to_csv(filename, index=False)
            print(f"Data saved to {filename}")
            print(f"Total internships scraped: {len(df)}")
            return df
        else:
            print("No data to save")
            return pd.DataFrame()
    
    def get_summary_stats(self):
        """Get basic summary statistics"""
        df = self.clean_and_structure_data()
        if df.empty:
            return "No data available"
        
        stats = {
            'Total Internships': len(df),
            'Paid Internships': len(df[df['has_stipend'] == True]),
            'Unpaid Internships': len(df[df['has_stipend'] == False]),
            'Average Stipend (Paid)': df[df['stipend'] > 0]['stipend'].mean(),
            'Top Domains': df['domain_category'].value_counts().head(5).to_dict(),
            'Location Types': df['location_type'].value_counts().to_dict()
        }
        
        return stats

def main():
    """Scrape Internshala, save the results to CSV and print a summary.

    Returns:
        The DataFrame returned by ``save_data`` (empty when nothing was scraped).
    """
    scraper = InternshipScraper()
    scraper.scrape_internshala(max_pages=5)
    df = scraper.save_data('internship_market_data.csv')
    stats = scraper.get_summary_stats()
    print("\n=== SCRAPING SUMMARY ===")

    # get_summary_stats returns a plain message string, not a dict, when
    # nothing was scraped; calling .items() on it would raise AttributeError.
    if isinstance(stats, dict):
        for key, value in stats.items():
            print(f"{key}: {value}")
    else:
        print(stats)

    return df


if __name__ == "__main__":
    df = main()

Starting Internshala scraping...
Scraping page 1...
Found 50 internships using selector: div.internship_meta
Processing internship 1/50
Successfully extracted: Graphic Design at zFrames Media
Processing internship 2/50
Successfully extracted: Brand Ambassador – On-Camera Content Creator (Wellness + Lifestyle) at Herbnexus Pvt Ltd
Processing internship 3/50
Successfully extracted: Human Resources (HR) at Lime Learn Eduserv Private Limited
Processing internship 4/50
Successfully extracted: Data Entry at SANNA CAPITAL PRIVATE LIMITED
Processing internship 5/50
Successfully extracted: Data Entry at Cherag Traders
Processing internship 6/50
Successfully extracted: Email Marketing at Web3Task
Processing internship 7/50
Successfully extracted: Game Tester at Renske Technologies Inc
Processing internship 8/50
Successfully extracted: Video Editing/Making at zFrames Media
Processing internship 9/50
Successfully extracted: Public Speaking & Theatre Coach at Utsaah Learning Private Limited
Process

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from datetime import datetime
import json
from urllib.parse import urljoin, urlparse
import random

class InternshipScraper:
    def __init__(self):
        """Create the scraper with a pre-configured HTTP session.

        Sets ``self.headers`` (browser-like request headers), ``self.session``
        (a ``requests.Session`` carrying those headers) and
        ``self.internships_data`` (an initially empty list of scraped records).
        """
        user_agent = (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/91.0.4472.124 Safari/537.36'
        )
        accepted_types = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'

        # Scraped records are appended here by the scrape_* methods.
        self.internships_data = []

        self.headers = {
            'User-Agent': user_agent,
            'Accept': accepted_types,
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

        # Every request made through the session carries the headers above.
        session = requests.Session()
        session.headers.update(self.headers)
        self.session = session
        
    def extract_stipend_value(self, stipend_text):
        """Turn a stipend description into a single number.

        Returns 0 for empty text, for text containing "unpaid" (any case) and
        for text without digits. Otherwise returns the arithmetic mean of
        every integer found after commas are removed, so a range such as
        "5,000-10,000" yields its midpoint.
        """
        if not stipend_text:
            return 0
        if 'unpaid' in stipend_text.lower():
            return 0

        # Drop thousands separators so "5,000" is read as one number.
        without_commas = stipend_text.replace(',', '')
        amounts = list(map(int, re.findall(r'[\d,]+', without_commas)))
        if not amounts:
            return 0
        return sum(amounts) / len(amounts)
    
    def categorize_domain(self, title, skills=""):
        """Categorize an internship into a domain from its title and skills.

        Args:
            title: Job title text; ``None`` or empty is treated as no title.
            skills: Optional skills text that is searched together with the title.

        Returns:
            The name of the first category, in the order listed below, that
            has a matching keyword, or ``'Other'`` when nothing matches.
        """
        combined = f"{(title or '').lower()} {(skills or '').lower()}"

        # Order is priority. 'Content' is checked before 'Marketing' because
        # Marketing's generic 'content' keyword would otherwise claim every
        # "content writing" role and leave that keyword unreachable.
        categories = {
            'Tech': ['software', 'developer', 'programming', 'python', 'java', 'react', 'node', 'full stack', 'backend', 'frontend', 'web dev', 'app dev', 'data science', 'machine learning', 'ai', 'ml'],
            'Design': ['ui', 'ux', 'graphic', 'design', 'figma', 'photoshop', 'illustrator', 'canva', 'creative'],
            'Content': ['content writing', 'copywriting', 'blog', 'writer', 'journalism', 'editor'],
            'Marketing': ['marketing', 'digital marketing', 'seo', 'social media', 'content', 'advertising', 'brand', 'campaign'],
            'Sales': ['sales', 'business development', 'bd', 'lead generation', 'customer acquisition'],
            'Finance': ['finance', 'accounting', 'financial', 'investment', 'banking', 'audit'],
            'Operations': ['operations', 'logistics', 'supply chain', 'project management', 'admin'],
            'HR': ['human resource', 'hr', 'recruitment', 'talent acquisition', 'people'],
            'Research': ['research', 'analyst', 'market research', 'data analyst', 'business analyst']
        }

        def keyword_matches(keyword):
            # Abbreviations ('ai', 'ui', 'hr', 'ml', ...) must stand alone as
            # words: as plain substrings they fire inside "email",
            # "recruitment", "chrome" or "html". Longer keywords keep
            # substring matching so "designer" still matches 'design'.
            if len(keyword) <= 3:
                return re.search(rf'\b{re.escape(keyword)}\b', combined) is not None
            return keyword in combined

        for category, keywords in categories.items():
            if any(keyword_matches(keyword) for keyword in keywords):
                return category

        return 'Other'
    
    def extract_posted_company_website(self, container, internship_detail_url=None):
        """Extract the actual company website posted on Internshala"""
        website_url = None
        
        website_selectors = [
            'a[href*="company_website"]',
            'a[title*="website"]',
            'a[title*="Website"]',
            '.company-details a',
            '.company-info a',
            '.company-website',
            '.website-link',
            'a[href]:not([href*="internshala.com"]):not([href*="javascript"]):not([href*="mailto"])'
        ]
        
        for selector in website_selectors:
            try:
                links = container.select(selector)
                for link in links:
                    href = link.get('href', '')
                    title = link.get('title', '').lower()
                    text = link.get_text(strip=True).lower()
                    
                    if href and self.is_valid_company_website(href):
                        if any(keyword in f"{title} {text}" for keyword in ['website', 'company', 'visit', 'www', 'site']):
                            website_url = self.clean_website_url(href)
                            break
                        elif self.is_external_domain(href):
                            website_url = self.clean_website_url(href)
                            break
                if website_url:
                    break
            except Exception as e:
                continue
        
        if not website_url and internship_detail_url:
            try:
                website_url = self.extract_from_detail_page(internship_detail_url)
            except Exception as e:
                print(f"Error extracting from detail page: {e}")
        
        if not website_url:
            website_url = self.extract_from_company_profile(container)
        
        return website_url
    
    def is_valid_company_website(self, url):
        """Heuristically decide whether ``url`` could be a company website.

        Rejects empty values, anything mentioning internshala.com,
        ``mailto:``/``tel:``/``javascript:`` links and the listed social
        networks. What remains is accepted only if it contains a scheme,
        ``www.`` or one of a few common top-level-domain fragments.
        """
        if not url or 'internshala.com' in url:
            return False

        if url.startswith(('mailto:', 'tel:', 'javascript:')):
            return False

        for social_domain in ('facebook.com', 'twitter.com', 'linkedin.com', 'instagram.com', 'youtube.com'):
            if social_domain in url:
                return False

        url_markers = ('http://', 'https://', 'www.', '.com', '.in', '.org', '.net')
        return any(marker in url for marker in url_markers)
    
    def is_external_domain(self, url):
        """Check whether ``url`` points at a host other than Internshala.

        Args:
            url: URL to inspect.

        Returns:
            ``True`` when the URL has a host and that host does not contain
            "internshala.com". ``False`` otherwise, including for relative
            URLs (no host) and for values that cannot be parsed.
        """
        try:
            domain = urlparse(url).netloc.lower()
            # bool() so callers always get True/False rather than '' for
            # URLs without a host.
            return bool(domain) and 'internshala.com' not in domain
        except (ValueError, AttributeError, TypeError):
            # urlparse raises ValueError on malformed hosts; the other two
            # cover non-string input.
            return False
    
    def clean_website_url(self, url):
        """Normalise a website URL to ``scheme://host/path``.

        A missing scheme is filled in with ``https`` (also for
        protocol-relative ``//host`` links), the query string and fragment
        are dropped, and trailing slashes are removed.

        Args:
            url: Raw ``href`` value.

        Returns:
            The normalised URL; ``None`` for empty input; the stripped input
            unchanged when it cannot be parsed into a scheme and a host.
        """
        if not url:
            return None

        url = url.strip()
        if not url:
            return None

        if url.startswith('//'):
            # Protocol-relative link: only the scheme is missing. Prefixing
            # 'https://' here would produce 'https:////host'.
            url = 'https:' + url
        elif not url.lower().startswith(('http://', 'https://')) and '.' in url:
            url = 'https://' + url

        try:
            parsed = urlparse(url)
        except ValueError:
            return url

        if not parsed.scheme or not parsed.netloc:
            # Nothing sensible to rebuild (e.g. a bare word); avoid emitting
            # a malformed '://...' string.
            return url

        return f"{parsed.scheme}://{parsed.netloc}{parsed.path}".rstrip('/')
    
    def extract_from_detail_page(self, detail_url):
        """Fetch an internship detail page and look for a company website link.

        Args:
            detail_url: Absolute URL, or a path relative to
                https://internshala.com (with or without a leading '/').

        Returns:
            The cleaned URL of the first link accepted by
            ``is_valid_company_website``, or ``None`` when no link qualifies,
            no URL was given, or the request fails.
        """
        if not detail_url:
            return None

        try:
            # urljoin leaves absolute URLs untouched and resolves relative
            # ones correctly whether or not they start with '/'.
            detail_url = urljoin('https://internshala.com', detail_url)

            response = self.session.get(detail_url, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            # Ordered from most to least specific; the last entry accepts any
            # link that does not mention internshala.com.
            detail_selectors = [
                '.company-details a[href*="http"]',
                '.about-company a[href*="http"]',
                '.company-info a[href*="http"]',
                'a[title*="website"]',
                'a[title*="Website"]',
                '.company-website',
                'a[href]:not([href*="internshala.com"])'
            ]

            for selector in detail_selectors:
                try:
                    for link in soup.select(selector):
                        href = link.get('href', '')
                        if self.is_valid_company_website(href):
                            return self.clean_website_url(href)
                except Exception:
                    # Skip a selector that fails; Exception (not a bare
                    # except) keeps Ctrl-C working.
                    continue

            return None

        except Exception as e:
            print(f"Error fetching detail page {detail_url}: {e}")
            return None
    
    def extract_from_company_profile(self, container):
        """Look up the company website via a company-profile link in ``container``.

        Every link whose ``href`` contains both "company" and
        "internshala.com" is fetched and scanned for a link accepted by
        ``is_valid_company_website``.

        Args:
            container: Parsed HTML element offering ``select()``.

        Returns:
            The cleaned website URL, or ``None`` when nothing is found or an
            error occurs.
        """
        try:
            profile_links = container.select('a[href*="company"]')

            for link in profile_links:
                href = link.get('href', '')
                # NOTE(review): relative hrefs (no "internshala.com" in them)
                # are skipped here even though urljoin below could resolve
                # them. Confirm whether that is intended before widening it,
                # since it would add one request per listing.
                if 'company' in href and 'internshala.com' in href:
                    try:
                        company_url = urljoin('https://internshala.com', href)
                        response = self.session.get(company_url, timeout=10)
                        response.raise_for_status()

                        soup = BeautifulSoup(response.content, 'html.parser')

                        website_selectors = [
                            '.company-website a',
                            '.website a',
                            'a[title*="website"]',
                            'a[href*="http"]:not([href*="internshala.com"])'
                        ]

                        for selector in website_selectors:
                            website_links = soup.select(selector)
                            for w_link in website_links:
                                w_href = w_link.get('href', '')
                                if self.is_valid_company_website(w_href):
                                    return self.clean_website_url(w_href)
                    except Exception:
                        # Best effort: a profile that fails to load or parse
                        # is skipped. Exception (not a bare except) lets
                        # Ctrl-C interrupt a slow fetch.
                        continue

            return None

        except Exception:
            return None
    
    def get_internship_detail_url(self, container):
        """Return the ``href`` of the listing's detail-page link, if any.

        A link qualifies when its ``href`` contains "internship" together
        with "detail" or (case-insensitively) "view". Returns ``None`` when
        no link qualifies or when anything goes wrong while searching.
        """
        detail_selectors = (
            'a[href*="internship"]',
            '.view_detail_button a',
            '.view-details a',
            'a[title*="detail"]',
            'a[href*="/internship/"]',
        )

        try:
            for selector in detail_selectors:
                for link in container.select(selector):
                    href = link.get('href', '')
                    if 'internship' not in href:
                        continue
                    if 'detail' in href or 'view' in href.lower():
                        return href
        except Exception:
            return None

        return None
    
    def verify_website_exists(self, url):
        """Check whether ``url`` answers a HEAD request with HTTP 200.

        Args:
            url: Absolute URL to probe.

        Returns:
            A ``(is_valid, status)`` tuple. ``status`` is "Valid",
            "Not Found", "HTTP <code>", "No URL provided" or a truncated
            "Connection Error: ..." message.
        """
        if not url:
            return False, "No URL provided"

        try:
            # Go through the shared session so the probe carries the same
            # browser-like headers as every other request made by this class.
            response = self.session.head(url, timeout=10, allow_redirects=True)
            if response.status_code == 200:
                return True, "Valid"
            elif response.status_code == 404:
                return False, "Not Found"
            else:
                return False, f"HTTP {response.status_code}"
        except requests.exceptions.RequestException as e:
            return False, f"Connection Error: {str(e)[:50]}"
    
    def scrape_internshala(self, max_pages=5, verify_websites=False):
        """Scrape internship listings from Internshala's paginated index.

        For every page the method tries a list of CSS selectors and uses the
        first one that matches anything; if none match it falls back to a
        text scan of all ``div`` elements. Each container is handed to
        ``extract_internshala_data`` and every non-empty result is appended
        to ``self.internships_data``.

        Scraping stops early when a page yields no containers (its HTML is
        written to ``debug_page_<n>.html``). A page that raises is reported
        and skipped. The method sleeps 2-4 seconds after each page that was
        processed without error.

        Args:
            max_pages: Number of listing pages to request, starting at page 1.
            verify_websites: Passed through to ``extract_internshala_data``.

        Returns:
            None. Results accumulate in ``self.internships_data``.
        """
        print("Starting Internshala scraping...")
        base_url = "https://internshala.com/internships"

        # Tried in order; the first selector that matches anything wins.
        selectors_to_try = [
            'div.internship_meta',
            'div[class*="internship"]',
            'div.individual_internship',
            'div.internship-item',
            'div[data-internship-id]',
            'div.container-fluid.individual_internship',
            'div.view_detail_button'
        ]

        for page in range(1, max_pages + 1):
            try:
                print(f"Scraping page {page}...")
                url = f"{base_url}/page-{page}" if page > 1 else base_url

                response = self.session.get(url, timeout=10)
                response.raise_for_status()

                soup = BeautifulSoup(response.content, 'html.parser')

                internship_containers = []
                for selector in selectors_to_try:
                    try:
                        containers = soup.select(selector)
                    except Exception:
                        # A selector the parser rejects is skipped, not fatal.
                        # Catching Exception (not a bare except) keeps Ctrl-C working.
                        continue
                    if containers:
                        internship_containers = containers
                        print(f"Found {len(containers)} internships using selector: {selector}")
                        break

                if not internship_containers:
                    # Fallback: keep divs longer than 100 characters whose text
                    # mentions a stipend and a month/week period; cap at 20.
                    for div in soup.find_all('div'):
                        raw_text = div.get_text()
                        text = raw_text.lower()
                        if len(raw_text) > 100 and 'stipend' in text and ('month' in text or 'week' in text):
                            internship_containers.append(div)

                    internship_containers = internship_containers[:20]
                    print(f"Found {len(internship_containers)} internships using broad search")

                if not internship_containers:
                    print(f"No internships found on page {page}")
                    with open(f'debug_page_{page}.html', 'w', encoding='utf-8') as f:
                        f.write(soup.prettify())
                    print(f"Saved HTML to debug_page_{page}.html for inspection")
                    break

                for i, container in enumerate(internship_containers):
                    try:
                        print(f"Processing internship {i+1}/{len(internship_containers)}")
                        internship_data = self.extract_internshala_data(container, verify_websites)
                        if internship_data:
                            self.internships_data.append(internship_data)
                            print(f"Successfully extracted: {internship_data['job_title']} at {internship_data['company_name']}")
                            if internship_data['company_website']:
                                print(f"  Website found: {internship_data['company_website']}")
                    except Exception as e:
                        print(f"Error extracting internship data {i+1}: {e}")
                        continue

                print(f"Page {page} complete. Total internships so far: {len(self.internships_data)}")

                # Random pause between pages to avoid hammering the site.
                time.sleep(random.uniform(2, 4))

            except Exception as e:
                print(f"Error scraping page {page}: {e}")
                continue

        print(f"Scraped {len(self.internships_data)} internships from Internshala")
    
    def extract_internshala_data(self, container, verify_websites=False):
        """Extract one internship record from a listing container.

        Each field is looked up with a list of CSS selectors first and falls
        back to regular expressions over the container's plain text.

        Args:
            container: Parsed listing element; it must provide ``get_text``,
                ``select`` and ``select_one`` (e.g. a BeautifulSoup tag).
            verify_websites: When true and a company website was found, the
                site is checked with ``verify_website_exists`` and the status
                it reports is stored in ``website_status``.

        Returns:
            dict | None: The record, or ``None`` when neither a company name
            nor a job title could be found, when the container holds 50
            characters of text or fewer, or when any error occurred while
            parsing (the error is printed).
        """
        try:
            container_text = container.get_text()

            # --- Company name ---------------------------------------------
            company_name = self._first_selector_text(container, [
                'a.link_display_like_text',
                'a[href*="company"]',
                'h4 a',
                'h5 a',
                '.company-name',
            ])
            if company_name is None:
                company_name = "Not specified"
                company_match = re.search(r'Company[:\s]*([^\n]+)', container_text, re.IGNORECASE)
                if company_match:
                    company_name = company_match.group(1).strip()

            # --- Job title ------------------------------------------------
            job_title = self._first_selector_text(container, [
                'h3.heading_4_5',
                'h4.heading_4_5',
                'h3',
                'h4',
                '.job-title',
                '.internship-title',
            ])
            if job_title is None:
                job_title = "Not specified"
                for line in container_text.split('\n'):
                    candidate = line.strip()
                    if 'intern' in candidate.lower() and len(candidate) < 100:
                        job_title = candidate
                        break

            # Reject unusable containers before the website lookup and
            # verification below, so no work is spent on a record that would
            # be discarded anyway.
            nothing_identified = company_name == "Not specified" and job_title == "Not specified"
            if nothing_identified or len(container_text) <= 50:
                return None

            # --- Company website ------------------------------------------
            detail_url = self.get_internship_detail_url(container)
            company_website = self.extract_posted_company_website(container, detail_url)

            if not company_website:
                website_status = "No Website Posted"
            elif verify_websites:
                _, website_status = self.verify_website_exists(company_website)
                print(f"Website verification for {company_name}: {website_status}")
            else:
                website_status = "Found but Not Verified"

            # --- Location -------------------------------------------------
            location = self._first_selector_text(container, [
                'a[href*="location"]',
                '.location',
                '.internship-location',
            ])
            if location is None:
                location = "Not specified"
                # The city-name pattern relies on capitalisation, so it must
                # NOT be matched case-insensitively (with IGNORECASE it
                # matches the first run of words of any kind).
                location_patterns = [
                    (r'Location[:\s]*([^\n]+)', re.IGNORECASE),
                    (r'Work from home|Remote', re.IGNORECASE),
                    (r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),?\s*(?:India|IN)?', 0),
                ]
                for pattern, flags in location_patterns:
                    location_match = re.search(pattern, container_text, flags)
                    if location_match:
                        if location_match.groups():
                            location = location_match.group(1).strip()
                        else:
                            location = location_match.group(0)
                        break

            # --- Stipend --------------------------------------------------
            stipend_raw = self._first_selector_text(container, [
                'span.stipend',
                '.stipend',
                '.salary',
                '.compensation',
            ])
            if stipend_raw is None:
                stipend_raw = "Not specified"
                stipend_patterns = [
                    r'₹\s*[\d,]+(?:\s*-\s*₹?\s*[\d,]+)?',
                    r'Stipend[:\s]*([^\n]+)',
                    r'(\d+(?:,\d+)*)\s*(?:per month|/month)',
                    r'Unpaid',
                    r'Performance based',
                ]
                for pattern in stipend_patterns:
                    stipend_match = re.search(pattern, container_text, re.IGNORECASE)
                    if stipend_match:
                        stipend_raw = stipend_match.group(0)
                        break

            stipend_numeric = self.extract_stipend_value(stipend_raw)

            # --- Duration -------------------------------------------------
            duration_raw = "Not specified"
            duration_patterns = [
                r'(\d+)\s*months?',
                r'(\d+)\s*weeks?',
                r'Duration[:\s]*([^\n]+)',
            ]
            for pattern in duration_patterns:
                duration_match = re.search(pattern, container_text, re.IGNORECASE)
                if duration_match:
                    duration_raw = duration_match.group(0)
                    break

            # --- Skills ---------------------------------------------------
            skills_raw = ""
            skills_container = container.select_one('.round_tabs_container')
            # Prefer the chips inside the dedicated container; without a
            # container, take every chip found in the listing.
            if skills_container is not None:
                skill_elements = skills_container.select('.round_tabs')
            else:
                skill_elements = container.select('.round_tabs')

            if skill_elements:
                # Comma-join so clean_skills can split the chips apart again;
                # a space-joined string would be treated as one single skill.
                skills_raw = ', '.join(skill.get_text(strip=True) for skill in skill_elements)
            elif skills_container is not None:
                skills_raw = skills_container.get_text(' ', strip=True)
            else:
                skills_patterns = [
                    r'Skills?[:\s]*([^\n]+)',
                    r'Requirements?[:\s]*([^\n]+)',
                    r'Tools?[:\s]*([^\n]+)',
                    r'Qualifications?[:\s]*([^\n]+)',
                ]
                for pattern in skills_patterns:
                    skills_match = re.search(pattern, container_text, re.IGNORECASE)
                    if skills_match:
                        skills_raw = skills_match.group(1).strip()
                        break

            if skills_raw:
                skills_raw = self.clean_skills(skills_raw)

            # --- Derived fields -------------------------------------------
            intern_type = "Unpaid" if stipend_numeric == 0 else "Paid"
            if "performance" in stipend_raw.lower():
                intern_type = "Performance-based"

            domain_category = self.categorize_domain(job_title, skills_raw)

            return {
                'platform': 'Internshala',
                'company_name': company_name,
                'job_title': job_title,
                'domain_category': domain_category,
                'skills_required': skills_raw,
                'location': location,
                'stipend': stipend_numeric,
                'stipend_raw': stipend_raw,
                'duration_months': self.parse_duration(duration_raw),
                'duration_raw': duration_raw,
                # The scrape date is recorded; no posting date is parsed here.
                'posted_on': datetime.now().strftime('%Y-%m-%d'),
                'intern_type': intern_type,
                'location_type': self.categorize_location(location),
                'company_website': company_website,
                'website_status': website_status,
                'detail_url': detail_url,
            }

        except Exception as e:
            # Best-effort scraping: one malformed listing must not abort the page.
            print(f"Error parsing internship container: {e}")
            return None

    def _first_selector_text(self, container, selectors):
        """Return the stripped text of the first selector match that is non-empty, else None."""
        for selector in selectors:
            element = container.select_one(selector)
            if element is None:
                continue
            text = element.get_text(strip=True)
            if text:
                return text
        return None
    
    def parse_duration(self, duration_text):
        """Parse a free-text duration into a number of months.

        Args:
            duration_text: Text such as ``"3 Months"``, ``"1.5 months"`` or
                ``"6 weeks"``. Empty or non-string values are accepted.

        Returns:
            int | float: Months when a "<number> month" phrase is present (an
            ``int`` for whole numbers); otherwise weeks divided by 4.33 and
            rounded to one decimal; ``0`` when nothing can be parsed.
        """
        if not duration_text or not isinstance(duration_text, str):
            return 0

        text = duration_text.lower()

        # Accept an optional decimal part so "1.5 months" is 1.5, not 5.
        months_match = re.search(r'(\d+(?:\.\d+)?)\s*month', text)
        if months_match:
            months = float(months_match.group(1))
            return int(months) if months.is_integer() else months

        weeks_match = re.search(r'(\d+(?:\.\d+)?)\s*week', text)
        if weeks_match:
            # 4.33 is the weeks-per-month factor used for the conversion.
            return round(float(weeks_match.group(1)) / 4.33, 1)

        return 0
    
    def categorize_location(self, location):
        """Classify a location string as 'Remote', 'Hybrid' or 'Onsite'.

        Matching is case-insensitive; anything that mentions neither remote
        work nor a hybrid arrangement is reported as 'Onsite'.
        """
        normalized = location.lower()
        if any(marker in normalized for marker in ('remote', 'work from home')):
            return 'Remote'
        return 'Hybrid' if 'hybrid' in normalized else 'Onsite'
    
    def update_existing_data_with_posted_websites(self, csv_file_path, verify_websites=True):
        """Mark an existing CSV export as needing a re-scrape for posted websites.

        The file is loaded, its ``company_website`` and ``website_status``
        columns are overwritten with placeholder text, and the result is
        written next to the input as ``<name>_with_posted_websites<ext>``.
        No scraping or verification happens here.

        Args:
            csv_file_path: Path of the CSV file to load.
            verify_websites: Currently unused; kept for interface compatibility.

        Returns:
            pandas.DataFrame | None: The updated frame, or ``None`` if the
            file could not be read or written (the error is printed).
        """
        import os  # local import: `os` is not among this file's top-level imports

        try:
            df = pd.read_csv(csv_file_path)
            print(f"Loaded {len(df)} existing records")

            df['company_website'] = "Re-scrape needed for posted websites"
            df['website_status'] = "Need to re-scrape"

            # Build the output name from the split path instead of
            # str.replace('.csv', ...): replace() leaves a path without a
            # lower-case ".csv" unchanged (overwriting the input file) and
            # rewrites every ".csv" occurrence anywhere in the path.
            root, ext = os.path.splitext(csv_file_path)
            output_file = f"{root}_with_posted_websites{ext or '.csv'}"
            df.to_csv(output_file, index=False)
            print(f"Updated data saved to {output_file}")
            print("Note: To get actual posted websites, please run a fresh scrape with the updated scraper")

            return df

        except Exception as e:
            print(f"Error processing existing data: {e}")
            return None
    
    def scrape_linkedin_internships(self, max_results=50):
        """Placeholder for LinkedIn scraping.

        Nothing is scraped: the method only prints a notice and returns
        ``None``. ``max_results`` is accepted but not used.
        """
        notices = (
            "LinkedIn scraping requires more complex setup due to login requirements",
            "For now, focusing on Internshala. You can extend this for LinkedIn later.",
        )
        for notice in notices:
            print(notice)
    
    def clean_and_structure_data(self):
        """Clean and structure the scraped data"""
        if not self.internships_data:
            print("No data to clean")
            return pd.DataFrame()
        
        df = pd.DataFrame(self.internships_data)
        
        
        df['skills_required'] = df['skills_required'].apply(self.clean_skills)
        
        df = df.drop_duplicates(subset=['company_name', 'job_title', 'location'], keep='first')
        
        df['skills_count'] = df['skills_required'].apply(lambda x: len(x.split(',')) if x else 0)
        df['has_stipend'] = df['stipend'] > 0
        df['has_posted_website'] = df['company_website'].notna() & (df['company_website'] != "")
        df['website_valid'] = df['website_status'] == 'Valid'
        df['website_found_but_not_verified'] = df['website_status'] == 'Found but Not Verified'
        
        return df
    
    def clean_skills(self, skills_text):
        """Clean and standardize a free-text skills list.

        The text is split on commas and on the word "and", a few common
        spellings are mapped to canonical names, and at most the first ten
        entries are kept, de-duplicated and sorted.

        Args:
            skills_text: Raw skills text; empty, ``"Not specified"`` and
                non-string values yield an empty result.

        Returns:
            str: Skills joined with ``", "`` in sorted order, or ``""``.
        """
        if not skills_text or not isinstance(skills_text, str) or skills_text == "Not specified":
            return ""

        # Keep '+' and '#' so that "C++" and "C#" survive sanitising; without
        # them both collapse to "C" and are then dropped by the length filter.
        skills_text = re.sub(r'[^\w\s,.+#-]', '', skills_text)

        skill_mappings = {
            'photoshop': 'Adobe Photoshop',
            'illustrator': 'Adobe Illustrator',
            'ms office': 'Microsoft Office',
            'excel': 'Microsoft Excel',
            'powerpoint': 'Microsoft PowerPoint',
            'word': 'Microsoft Word',
            'js': 'JavaScript',
            'html5': 'HTML',
            'css3': 'CSS'
        }

        skills = []
        for part in skills_text.split(','):
            # "\band\b" only matches the standalone word, never "Android" etc.
            for subpart in re.split(r'\band\b', part, flags=re.IGNORECASE):
                subpart = subpart.strip()
                if not subpart:
                    continue
                mapped_skill = skill_mappings.get(subpart.lower(), subpart)
                if len(mapped_skill) > 1:  # single characters are treated as noise
                    skills.append(mapped_skill)

        # Limit to the first ten entries before de-duplicating and sorting.
        return ', '.join(sorted(set(skills[:10])))
    
    def save_data(self, filename='internship_data_with_posted_websites.csv'):
        """Save cleaned data to CSV"""
        df = self.clean_and_structure_data()
        if not df.empty:
            df.to_csv(filename, index=False)
            print(f"Data saved to {filename}")
            print(f"Total internships scraped: {len(df)}")
            return df
        else:
            print("No data to save")
            return pd.DataFrame()
    
    def get_summary_stats(self):
        """Get basic summary statistics"""
        df = self.clean_and_structure_data()
        if df.empty:
            return "No data available"
        
        stats = {
            'Total Internships': len(df),
            'Paid Internships': len(df[df['has_stipend'] == True]),
            'Unpaid Internships': len(df[df['has_stipend'] == False]),
            'Average Stipend (Paid)': df[df['stipend'] > 0]['stipend'].mean(),
            'Top Domains': df['domain_category'].value_counts().head(5).to_dict(),
            'Location Types': df['location_type'].value_counts().to_dict(),
            'Companies with Posted Websites': len(df[df['has_posted_website'] == True]),
            'Verified Valid Websites': len(df[df['website_valid'] == True]),
            'Websites Found but Not Verified': len(df[df['website_found_but_not_verified'] == True]),
            'No Website Posted': len(df[df['website_status'] == 'No Website Posted'])
        }
        
        return stats

def main():
    """Scrape Internshala, save the results to CSV and print a summary.

    Returns:
        pandas.DataFrame: The saved data as returned by ``save_data`` (empty
        when nothing was scraped).
    """
    scraper = InternshipScraper()

    print("Scraping internships and extracting POSTED company websites...")
    scraper.scrape_internshala(max_pages=3, verify_websites=True)
    df = scraper.save_data('internship_market_data_with_posted_websites.csv')

    stats = scraper.get_summary_stats()
    print("\n=== SCRAPING SUMMARY ===")
    if isinstance(stats, dict):
        for key, value in stats.items():
            print(f"{key}: {value}")
    else:
        # get_summary_stats returns a plain message string when there is no
        # data; calling .items() on it would raise AttributeError.
        print(stats)

    if not df.empty:
        websites_found = df[df['has_posted_website'] == True]
        if not websites_found.empty:
            print("\n=== EXAMPLE POSTED WEBSITES FOUND ===")
            for _, row in websites_found.head(5).iterrows():
                print(f"{row['company_name']}: {row['company_website']} ({row['website_status']})")
        else:
            print("\nNo posted websites found in this scrape. This could mean:")
            print("1. Companies haven't posted their websites on their Internshala profiles")
            print("2. The website extraction selectors need adjustment for current Internshala structure")
            print("3. Websites might be in a different section that requires deeper page navigation")

    return df

# Run the scraper only when this code is executed directly (not imported);
# the DataFrame returned by main() is bound to the module-level name `df`.
if __name__ == "__main__":
    df = main()

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from datetime import datetime
import random
from urllib.parse import urljoin, urlparse

class JobSkillsExtractor:
    """Scrape Internshala listings for job titles, detail URLs and skills.

    Scraped rows accumulate in ``self.jobs_data`` as dictionaries; see
    ``scrape_jobs_with_skills`` for their keys.
    """

    # Lower-cased raw skill text -> standardized display name.
    _SKILL_MAPPINGS = {
        'photoshop': 'Adobe Photoshop',
        'illustrator': 'Adobe Illustrator',
        'ms office': 'Microsoft Office',
        'excel': 'Microsoft Excel',
        'powerpoint': 'Microsoft PowerPoint',
        'word': 'Microsoft Word',
        'js': 'JavaScript',
        'html5': 'HTML',
        'css3': 'CSS',
        'nodejs': 'Node.js',
        'reactjs': 'React.js',
        'vuejs': 'Vue.js',
        'angularjs': 'Angular',
        'mysql': 'MySQL',
        'postgresql': 'PostgreSQL',
        'mongodb': 'MongoDB'
    }

    def __init__(self):
        """Create an HTTP session with browser-like headers and an empty result list."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)
        self.jobs_data = []

    def extract_job_title(self, container):
        """Extract the job title from a listing container.

        Selectors are tried in order; the first match whose text is non-empty
        and shorter than 200 characters wins. If none qualifies, the first
        text line mentioning "intern" (6-99 characters) is used.

        Returns:
            str: The title, or ``"Not specified"``.
        """
        title_selectors = [
            'h3.heading_4_5',
            'h4.heading_4_5',
            'h3',
            'h4',
            '.job-title',
            '.internship-title',
            'a.view_detail_button',
            'a[href*="internship"]'
        ]

        for selector in title_selectors:
            title_elem = container.select_one(selector)
            if title_elem is None:
                continue
            candidate = title_elem.get_text(strip=True)
            # Only accept a usable title; a rejected candidate must not leak
            # into the result and suppress the text fallback below.
            if candidate and len(candidate) < 200:
                return candidate

        # Fallback: search the container's plain text line by line.
        for line in container.get_text().split('\n'):
            line = line.strip()
            if 'intern' in line.lower() and 5 < len(line) < 100:
                return line

        return "Not specified"

    def extract_job_url(self, container):
        """Extract the absolute URL of the job/internship detail page.

        Returns:
            str | None: The first link whose ``href`` mentions "internship" or
            "job", resolved against ``https://internshala.com``; ``None`` when
            no such link exists.
        """
        url_selectors = [
            'a[href*="/internship/detail/"]',
            'a[href*="/internship/"]',
            '.view_detail_button a',
            '.view-details a',
            'a[title*="detail"]',
            'h3 a',
            'h4 a',
            '.heading_4_5 a'
        ]

        for selector in url_selectors:
            for link in container.select(selector):
                href = link.get('href', '')
                if href and ('internship' in href or 'job' in href):
                    # urljoin leaves absolute URLs untouched and resolves
                    # relative ones correctly, with or without a leading "/".
                    return urljoin('https://internshala.com', href)

        return None

    @staticmethod
    def _element_skills_text(element):
        """Return an element's skills text, comma-joining any `.round_tabs` chips inside it."""
        chips = element.select('.round_tabs')
        if chips:
            chip_texts = [chip.get_text(strip=True) for chip in chips]
            return ', '.join(text for text in chip_texts if text)
        return element.get_text(' ', strip=True)

    def extract_skills_from_listing(self, container):
        """Extract skills from the listing container itself.

        The first element matching one of the skills selectors is used; if no
        selector matches, labelled lines ("Skills: ...", "Requirements: ...")
        in the container text are searched instead.

        Returns:
            str: Cleaned, comma-separated skills (possibly empty).
        """
        skills_raw = ""

        skills_selectors = [
            '.round_tabs_container',
            '.round_tabs',
            '.skills-required',
            '.skills',
            '.technologies'
        ]

        for selector in skills_selectors:
            skills_container = container.select_one(selector)
            if skills_container is not None:
                skills_raw = self._element_skills_text(skills_container)
                break
            # NOTE(review): only the first match per selector is used; the
            # original "try multiple elements" branch could never run because
            # select() is empty whenever select_one() finds nothing.

        # Fallback: search in container text
        if not skills_raw:
            container_text = container.get_text()
            skills_patterns = [
                r'Skills?[:\s]*([^\n]+)',
                r'Requirements?[:\s]*([^\n]+)',
                r'Tools?[:\s]*([^\n]+)',
                r'Technologies?[:\s]*([^\n]+)',
                r'Qualifications?[:\s]*([^\n]+)'
            ]

            for pattern in skills_patterns:
                skills_match = re.search(pattern, container_text, re.IGNORECASE)
                if skills_match:
                    skills_raw = skills_match.group(1).strip()
                    break

        return self.clean_skills(skills_raw)

    def extract_skills_from_detail_page(self, job_url):
        """Fetch a job detail page and extract its skills.

        Args:
            job_url: Absolute URL of the detail page; falsy values are skipped.

        Returns:
            str: Cleaned, comma-separated skills, or ``""`` when the URL is
            missing, the request fails or nothing is found (errors are printed).
        """
        if not job_url:
            return ""

        try:
            print(f"Fetching skills from: {job_url}")
            response = self.session.get(job_url, timeout=15)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Multiple selectors to find skills on detail page
            detail_skills_selectors = [
                '.round_tabs_container',
                '.skills-required',
                '.skill-tags',
                '.requirements',
                '.qualifications',
                'div[class*="skill"]',
                'section[class*="skill"]',
                '.round_tabs',
                'span.round_tabs',
                '.other_details_class .round_tabs_container'
            ]

            skills_text = ""

            for selector in detail_skills_selectors:
                skills_section = soup.select_one(selector)
                if skills_section is None:
                    continue
                candidate = self._element_skills_text(skills_section)
                if candidate:
                    skills_text = candidate
                    # Very short text is kept as a last resort while later
                    # selectors are still given a chance to do better.
                    if len(candidate) > 3:
                        break

            # Fallback: search in page text for skills section
            if not skills_text:
                page_text = soup.get_text()
                skills_patterns = [
                    r'Skills?[:\s]*([^\n\r]{10,200})',
                    r'Requirements?[:\s]*([^\n\r]{10,200})',
                    r'Qualifications?[:\s]*([^\n\r]{10,200})',
                    r'Technologies?[:\s]*([^\n\r]{10,200})',
                    r'Tools?[:\s]*([^\n\r]{10,200})'
                ]

                for pattern in skills_patterns:
                    match = re.search(pattern, page_text, re.IGNORECASE | re.MULTILINE)
                    if match:
                        skills_text = match.group(1).strip()
                        break

            return self.clean_skills(skills_text)

        except Exception as e:
            # Best effort: a failed detail fetch must not abort the whole scrape.
            print(f"Error fetching skills from detail page {job_url}: {e}")
            return ""

    def clean_skills(self, skills_text):
        """Clean and standardize a free-text skills list.

        The text is split on ``,``, ``&``, ``|``, ``;`` and the standalone
        word "and"; each piece is sanitised, mapped to a canonical name where
        one is known and de-duplicated case-insensitively in first-seen order.

        Args:
            skills_text: Raw skills text; empty, ``"Not specified"`` and
                non-string values yield an empty result.

        Returns:
            str: At most 15 skills joined with ``", "``, or ``""``.
        """
        if not skills_text or not isinstance(skills_text, str) or skills_text == "Not specified":
            return ""

        # Split BEFORE sanitising: the sanitiser removes '&', '|' and ';', so
        # they could never act as separators afterwards. "\band\b" matches
        # only the standalone word, never the inside of "Android" or "Pandas".
        parts = re.split(r'[,&|;]|\band\b', skills_text, flags=re.IGNORECASE)

        unique_skills = []
        seen = set()
        for part in parts:
            # Remove unwanted characters but keep common programming symbols
            skill = re.sub(r'[^\w\s.\-+#()]', '', part).strip()
            if len(skill) <= 1:
                continue

            skill = self._SKILL_MAPPINGS.get(skill.lower(), skill)
            if not 1 < len(skill) < 50:  # Reasonable skill length
                continue

            key = skill.lower()
            if key in seen:
                continue
            seen.add(key)
            unique_skills.append(skill)
            if len(unique_skills) == 15:  # limit to a reasonable number
                break

        return ', '.join(unique_skills)

    def scrape_jobs_with_skills(self, max_pages=3, fetch_from_detail=True):
        """Scrape listing pages and collect title, URL and skills per job.

        Each accepted job is appended to ``self.jobs_data`` as a dict with the
        keys ``job_title``, ``job_url``, ``skills_required``,
        ``skills_from_listing``, ``skills_from_detail`` and ``scraped_on``.
        Jobs without a recognisable title are skipped.

        Args:
            max_pages: Number of listing pages to visit, starting at page 1.
            fetch_from_detail: When true, each job's detail page is fetched as
                well and its skills are preferred over the listing's.
        """
        print("Starting job scraping with skills extraction...")
        base_url = "https://internshala.com/internships"

        selectors_to_try = [
            'div.internship_meta',
            'div[class*="internship"]',
            'div.individual_internship',
            'div.internship-item',
            'div[data-internship-id]',
            'div.container-fluid.individual_internship'
        ]

        for page in range(1, max_pages + 1):
            try:
                print(f"\nScraping page {page}...")
                url = f"{base_url}/page-{page}" if page > 1 else base_url

                response = self.session.get(url, timeout=10)
                response.raise_for_status()

                soup = BeautifulSoup(response.content, 'html.parser')

                internship_containers = []
                for selector in selectors_to_try:
                    try:
                        containers = soup.select(selector)
                    except Exception:
                        # Skip a selector the parser rejects instead of
                        # hiding every error (including KeyboardInterrupt).
                        continue
                    if containers:
                        internship_containers = containers
                        print(f"Found {len(containers)} jobs using selector: {selector}")
                        break

                if not internship_containers:
                    print(f"No job containers found on page {page}")
                    continue

                for i, container in enumerate(internship_containers):
                    try:
                        print(f"Processing job {i+1}/{len(internship_containers)}")

                        job_title = self.extract_job_title(container)
                        job_url = self.extract_job_url(container)
                        skills_from_listing = self.extract_skills_from_listing(container)

                        detailed_skills = ""
                        if fetch_from_detail and job_url:
                            detailed_skills = self.extract_skills_from_detail_page(job_url)
                            time.sleep(random.uniform(1, 2))  # Be respectful

                        # Combine skills (prefer detailed skills)
                        final_skills = detailed_skills if detailed_skills else skills_from_listing

                        job_data = {
                            'job_title': job_title,
                            'job_url': job_url,
                            'skills_required': final_skills,
                            'skills_from_listing': skills_from_listing,
                            'skills_from_detail': detailed_skills,
                            'scraped_on': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                        }

                        if job_title != "Not specified":
                            self.jobs_data.append(job_data)
                            print(f"✓ Extracted: {job_title}")
                            if final_skills:
                                print(f"  Skills: {final_skills[:100]}...")
                            else:
                                print("  No skills found")

                    except Exception as e:
                        print(f"Error processing job {i+1}: {e}")
                        continue

                print(f"Page {page} complete. Total jobs: {len(self.jobs_data)}")
                time.sleep(random.uniform(2, 4))  # Page delay

            except Exception as e:
                print(f"Error scraping page {page}: {e}")
                continue

        print("\n=== SCRAPING COMPLETE ===")
        print(f"Total jobs scraped: {len(self.jobs_data)}")

    def save_jobs_data(self, filename='jobs_with_skills.csv'):
        """Save the scraped jobs to CSV with a few derived columns.

        Adds ``has_skills``, ``skills_count`` and ``has_url`` before writing.

        Returns:
            pandas.DataFrame: The saved data, or an empty frame when nothing
            has been scraped (no file is written then).
        """
        if not self.jobs_data:
            print("No job data to save")
            return pd.DataFrame()

        df = pd.DataFrame(self.jobs_data)

        # Add some analytics
        df['has_skills'] = df['skills_required'].apply(lambda x: len(x.strip()) > 0)
        df['skills_count'] = df['skills_required'].apply(lambda x: len(x.split(',')) if x else 0)
        df['has_url'] = df['job_url'].notna()

        df.to_csv(filename, index=False)
        print(f"\nData saved to {filename}")
        print(f"Jobs with skills: {len(df[df['has_skills']])}")
        print(f"Jobs with URLs: {len(df[df['has_url']])}")

        return df

    def get_skills_summary(self):
        """Summarise the scraped jobs and the skills found.

        Returns:
            dict | str: Job counts, the number of unique skills and the ten
            most frequent skills; the string ``"No data available"`` when
            nothing has been scraped.
        """
        if not self.jobs_data:
            return "No data available"

        from collections import Counter

        df = pd.DataFrame(self.jobs_data)

        # Count how often each skill occurs across all jobs.
        skill_counts = Counter()
        for skills_str in df['skills_required']:
            if skills_str:
                skill_counts.update(skill.strip() for skill in skills_str.split(','))

        return {
            'Total Jobs': len(df),
            'Jobs with Skills': len(df[df['skills_required'].str.len() > 0]),
            'Jobs with URLs': len(df[df['job_url'].notna()]),
            'Unique Skills Found': len(skill_counts),
            'Top 10 Skills': dict(skill_counts.most_common(10))
        }

def main():
    """Run the job skills extractor end to end.

    Scrapes two listing pages (following each job's detail page for skills),
    saves the result to ``internship_jobs_with_skills.csv``, prints a summary
    report and up to three sample rows.

    Returns:
        The DataFrame returned by ``save_jobs_data`` (empty when nothing was
        scraped).
    """
    extractor = JobSkillsExtractor()

    # Scrape jobs with skills (set fetch_from_detail=False for faster scraping)
    extractor.scrape_jobs_with_skills(max_pages=2, fetch_from_detail=True)

    # Save data
    df = extractor.save_jobs_data('internship_jobs_with_skills.csv')

    # Print summary
    summary = extractor.get_skills_summary()
    print("\n=== SUMMARY REPORT ===")
    if isinstance(summary, dict):
        for key, value in summary.items():
            print(f"{key}: {value}")
    else:
        # get_skills_summary returns a plain message string instead of a dict
        # when no jobs were scraped; print it rather than calling .items().
        print(summary)

    # Show sample data
    if not df.empty:
        print("\n=== SAMPLE DATA ===")
        for idx, row in df.head(3).iterrows():
            print(f"\nJob {idx+1}:")
            print(f"Title: {row['job_title']}")
            print(f"URL: {row['job_url']}")
            print(f"Skills: {row['skills_required']}")

    return df

# Script entry point: run the extractor and keep the resulting DataFrame in
# the module-level name ``df``.
if __name__ == "__main__":
    df = main()

Starting job scraping with skills extraction...

Scraping page 1...
Found 50 jobs using selector: div.internship_meta
Processing job 1/50
Fetching skills from: https://internshala.com/internship/detail/part-time-public-speaking-theatre-coach-internship-in-multiple-locations-at-utsaah-learning-private-limited1750914763
✓ Extracted: Public Speaking & Theatre Coach
  Skills: Public Speaking...
Processing job 2/50
Fetching skills from: https://internshala.com/internship/detail/ca-articleship-internship-in-delhi-at-vimal-tandon-co1749710338
✓ Extracted: CA Articleship
  No skills found
Processing job 3/50
Fetching skills from: https://internshala.com/internship/detail/campus-ambassador-programme-at-billion-hearts1750314817
✓ Extracted: Campus Ambassador
  No skills found
Processing job 4/50
Fetching skills from: https://internshala.com/internship/detail/digital-marketing-internship-in-navi-mumbai-at-yours-faithfully-advisors-llp1750842293
✓ Extracted: Digital Marketing
  No skills found
Pro

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from datetime import datetime
import json
from urllib.parse import urljoin, urlparse
import random

class SkillsExtractionScraper:
    """Scrape Internshala listing pages and extract a cleaned skills string per internship.

    Records are accumulated in ``self.internships_data`` (a list of dicts) by
    :meth:`scrape_internshala_with_skills_focus` and can be written to CSV
    with :meth:`save_skills_data`.
    """

    # Site root used to resolve relative detail-page links.
    _BASE_URL = 'https://internshala.com'

    # CSS selectors tried on a listing container when looking for skills.
    _CONTAINER_SKILL_SELECTORS = (
        '.round_tabs_container',  # Main skills container
        '.round_tabs',            # Individual skill tags
        '.skill_required',        # Skills section
        '.requirements',          # Requirements section
        '.tags',                  # Tags section
        '.skills',                # Skills class
        '.qualifications',
        '[class*="skill"]',
        '[class*="requirement"]',
        '.badge',
        '.tag',
    )

    # CSS selectors tried on a detail page when looking for skills.
    _DETAIL_SKILL_SELECTORS = (
        '.round_tabs_container',
        '.round_tabs',
        '.skill-tags',
        '.requirements-section',
        '.job-description',
        '.internship-details',
        '[class*="skill"]',
        '[class*="requirement"]',
        '.tags-container',
    )

    # CSS selectors tried, in order, to find the link to the detail page.
    _DETAIL_LINK_SELECTORS = (
        'a[href*="/internship/detail/"]',
        'a[href*="/internships/detail/"]',
        'a[href*="internship"][href*="detail"]',
        '.view_detail_button a',
        '.view-details a',
        'a[title*="detail"]',
        'a[href*="detail"]',
        'h3 a',  # Title links often lead to detail page
        'h4 a',
        '.heading_4_5 a',
    )

    # CSS selectors tried, in order, to find the per-internship containers.
    _LISTING_CONTAINER_SELECTORS = (
        'div.internship_meta',
        'div[class*="internship"]',
        'div.individual_internship',
        'div.container-fluid.individual_internship',
        'div[data-internship-id]',
    )

    _TITLE_SELECTORS = (
        'h3.heading_4_5 a',
        'h4.heading_4_5 a',
        'h3 a',
        'h4 a',
        '.job-title',
        '.internship-title',
        'h3.heading_4_5',
        'h4.heading_4_5',
    )

    _COMPANY_SELECTORS = (
        'a.link_display_like_text',
        'a[href*="company"]',
        '.company-name',
        'h5 a',
        'h4 a',
    )

    # "Label: value" style lines whose value part is treated as skills text.
    _SKILL_LINE_PATTERNS = tuple(
        re.compile(pattern, re.IGNORECASE | re.MULTILINE)
        for pattern in (
            r'Skills?\s*(?:required|needed)?[:\-\s]*([^\n]+)',
            r'Requirements?[:\-\s]*([^\n]+)',
            r'Tools?[:\-\s]*([^\n]+)',
            r'Technologies?[:\-\s]*([^\n]+)',
            r'Qualifications?[:\-\s]*([^\n]+)',
            r'Must\s+(?:have|know)[:\-\s]*([^\n]+)',
            r'Experience\s+(?:in|with)[:\-\s]*([^\n]+)',
            r'Knowledge\s+(?:of|in)[:\-\s]*([^\n]+)',
            r'Proficiency\s+(?:in|with)[:\-\s]*([^\n]+)',
        )
    )

    # Known skills/technologies. A tuple (not a set) so matches come back in
    # this fixed order on every run.
    _SKILL_KEYWORDS = (
        # Programming Languages
        'python', 'java', 'javascript', 'js', 'c++', 'c#', 'php', 'ruby', 'go', 'swift', 'kotlin',
        'typescript', 'scala', 'r', 'matlab', 'sql', 'html', 'css', 'html5', 'css3',

        # Frameworks & Libraries
        'react', 'angular', 'vue', 'node.js', 'nodejs', 'express', 'django', 'flask', 'spring',
        'laravel', 'codeigniter', 'bootstrap', 'jquery', 'redux', 'next.js', 'nuxt.js',

        # Databases
        'mysql', 'postgresql', 'mongodb', 'sqlite', 'oracle', 'redis', 'firebase',

        # Tools & Platforms
        'git', 'github', 'gitlab', 'docker', 'kubernetes', 'aws', 'azure', 'gcp',
        'jenkins', 'jira', 'confluence', 'slack', 'trello', 'asana',

        # Design Tools
        'photoshop', 'illustrator', 'figma', 'sketch', 'canva', 'indesign', 'after effects',
        'premiere pro', 'xd', 'invision', 'zeplin',

        # Marketing & Analytics
        'google analytics', 'seo', 'sem', 'google ads', 'facebook ads', 'social media',
        'content marketing', 'email marketing', 'mailchimp', 'hubspot',

        # Office & Productivity
        'microsoft office', 'excel', 'powerpoint', 'word', 'google sheets', 'google docs',
        'ms office', 'outlook', 'teams',

        # Data Science & AI
        'machine learning', 'deep learning', 'tensorflow', 'pytorch', 'pandas', 'numpy',
        'scikit-learn', 'jupyter', 'tableau', 'power bi', 'data analysis', 'statistics',

        # Mobile Development
        'android', 'ios', 'react native', 'flutter', 'xamarin', 'cordova',

        # Other Technical Skills
        'api', 'rest api', 'graphql', 'microservices', 'agile', 'scrum', 'devops',
        'testing', 'unit testing', 'automation testing', 'selenium',
    )

    # Whole-token matchers for the keywords above. Lookarounds are used rather
    # than \b because \b needs a word character on one side and therefore can
    # never match right after "c++" or "c#" when a space or end of text follows.
    _SKILL_KEYWORD_PATTERNS = tuple(
        (keyword, re.compile(r'(?<!\w)' + re.escape(keyword) + r'(?!\w)'))
        for keyword in _SKILL_KEYWORDS
    )

    # Filler words removed from skills text before it is split.
    _NOISE_PATTERNS = tuple(
        re.compile(r'\b' + re.escape(word) + r'\b', re.IGNORECASE)
        for word in (
            'required', 'preferred', 'must have', 'nice to have', 'experience',
            'knowledge', 'proficiency', 'skills', 'tools', 'technologies',
            'qualifications', 'requirements', 'abilities', 'competencies',
            'familiarity', 'understanding', 'background', 'expertise',
        )
    )

    # Delimiters applied one after another when splitting skills text.
    _SKILL_DELIMITERS = (',', '•', '|', ';', '&', ' and ', ' or ', '\n', '/', '\\')

    # Lower-cased skill name -> canonical display name.
    _STANDARD_NAMES = {
        'js': 'JavaScript',
        'javascript': 'JavaScript',
        'html5': 'HTML',
        'html': 'HTML',
        'css3': 'CSS',
        'css': 'CSS',
        'photoshop': 'Adobe Photoshop',
        'illustrator': 'Adobe Illustrator',
        'ms office': 'Microsoft Office',
        'microsoft office': 'Microsoft Office',
        'excel': 'Microsoft Excel',
        'powerpoint': 'Microsoft PowerPoint',
        'word': 'Microsoft Word',
        'nodejs': 'Node.js',
        'node.js': 'Node.js',
        'reactjs': 'React',
        'react.js': 'React',
        'vue.js': 'Vue.js',
        'vuejs': 'Vue.js',
        'python': 'Python',
        'java': 'Java',
        'c++': 'C++',
        'c#': 'C#',
        'php': 'PHP',
        'mysql': 'MySQL',
        'postgresql': 'PostgreSQL',
        'mongodb': 'MongoDB',
        'git': 'Git',
        'github': 'GitHub',
        'aws': 'AWS',
        'google analytics': 'Google Analytics',
        'seo': 'SEO',
        'social media marketing': 'Social Media Marketing',
        'content writing': 'Content Writing',
        'digital marketing': 'Digital Marketing',
        'machine learning': 'Machine Learning',
        'data analysis': 'Data Analysis',
        'artificial intelligence': 'Artificial Intelligence',
        'ai': 'Artificial Intelligence',
        'ml': 'Machine Learning',
    }

    def __init__(self):
        """Create an HTTP session with browser-like headers and an empty result list."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)
        self.internships_data = []

    def _collect_selector_text(self, node, selectors):
        """Return the text of every element under ``node`` matched by ``selectors``.

        Texts of one character or less are ignored. Child text nodes are
        joined with ``', '`` so that sibling skill tags inside one matched
        container stay separable instead of being glued into a single word.
        """
        texts = []
        for selector in selectors:
            try:
                elements = node.select(selector)
            except Exception:
                # Best effort: a selector the parser rejects is simply skipped.
                continue
            for elem in elements:
                text = elem.get_text(separator=', ', strip=True)
                if text and len(text) > 1:
                    texts.append(text)
        return texts

    def extract_skills_comprehensive(self, container, detail_url=None):
        """Extract a cleaned, comma-separated skills string for one internship.

        The following sources are tried:

        1. Text of elements in ``container`` matching known skills selectors.
        2. "Skills: ...", "Requirements: ..." style lines in the container text.
        3. The detail page at ``detail_url`` (fetched only if 1-2 found nothing).
        4. Known skill keywords in the container text (only if still empty).

        Args:
            container: Parsed HTML node offering ``select()`` and ``get_text()``.
            detail_url: Optional absolute or site-relative detail page URL.

        Returns:
            The string produced by :meth:`clean_and_standardize_skills`, or
            ``""`` when nothing was found.
        """
        # Method 1: skills-related elements inside the container
        skills_found = self._collect_selector_text(container, self._CONTAINER_SKILL_SELECTORS)

        # Method 2: labelled lines in the container text
        container_text = container.get_text()
        for pattern in self._SKILL_LINE_PATTERNS:
            for match in pattern.findall(container_text):
                if match.strip():
                    skills_found.append(match.strip())

        # Method 3: detail page, only when the container yielded nothing
        if detail_url and not skills_found:
            skills_found.extend(self.extract_skills_from_detail_page(detail_url))

        # Method 4: keyword scan of the container text as a last resort
        if not skills_found:
            skills_found = self.extract_skills_from_keywords(container_text)

        if skills_found:
            # Join with a comma, which the cleaner treats as a delimiter;
            # joining with a space would fuse separate skills into one entry.
            return self.clean_and_standardize_skills(', '.join(skills_found))

        return ""

    def extract_skills_from_detail_page(self, detail_url):
        """Fetch an internship detail page and return the raw skill texts found on it.

        Args:
            detail_url: Absolute URL, or a path relative to the site root.

        Returns:
            A list of strings (selector texts followed by matched keywords).
            An empty list is returned when ``detail_url`` is empty or when the
            request or parsing fails; the failure is printed, not raised.
        """
        if not detail_url:
            return []

        try:
            if not detail_url.startswith('http'):
                # urljoin copes with paths both with and without a leading slash.
                detail_url = urljoin(self._BASE_URL, detail_url)

            response = self.session.get(detail_url, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            skills_found = self._collect_selector_text(soup, self._DETAIL_SKILL_SELECTORS)

            # Also search in the full page text
            skills_found.extend(self.extract_skills_from_keywords(soup.get_text()))

            return skills_found

        except Exception as e:
            print(f"Error fetching skills from detail page {detail_url}: {e}")
            return []

    def extract_skills_from_keywords(self, text):
        """Return the known skill keywords that occur in ``text`` as whole tokens.

        Matching is case-insensitive. Keywords are returned in lower case, in
        the fixed order of the internal keyword list.

        Args:
            text: Free text to scan; empty or ``None`` yields an empty list.
        """
        if not text:
            return []

        text_lower = text.lower()
        return [
            keyword
            for keyword, pattern in self._SKILL_KEYWORD_PATTERNS
            if pattern.search(text_lower)
        ]

    def clean_and_standardize_skills(self, skills_text):
        """Turn raw skills text into a de-duplicated, comma-separated string.

        Steps: collapse whitespace, drop filler words ("required", "skills",
        ...), split on the common delimiters, trim punctuation, discard
        entries shorter than 2 or longer than 50 characters or without any
        letters, standardize names, remove case-insensitive duplicates
        (first occurrence wins) and keep at most 15 entries.

        Args:
            skills_text: Raw text containing one or more skills.

        Returns:
            Skills joined by ``', '``, or ``""`` for blank input.
        """
        if not skills_text or skills_text.strip() == "":
            return ""

        # Remove extra whitespace and newlines
        text = re.sub(r'\s+', ' ', skills_text.strip())

        # Remove common non-skill words
        for noise in self._NOISE_PATTERNS:
            text = noise.sub('', text)

        # Split by each delimiter in turn, dropping empty pieces
        chunks = [text]
        for delimiter in self._SKILL_DELIMITERS:
            chunks = [
                part.strip()
                for chunk in chunks
                for part in chunk.split(delimiter)
                if part.strip()
            ]

        final_skills = []
        seen = set()
        for chunk in chunks:
            # Trim surrounding punctuation, but keep a trailing '+' or '#' for
            # now so names such as "C++" and "C#" are not reduced to "C".
            candidate = re.sub(r'^\W+|[^\w+#]+$', '', chunk)
            if candidate.lower() not in self._STANDARD_NAMES:
                # Not a known name: the trailing symbols are just noise.
                candidate = re.sub(r'\W+$', '', candidate)

            # Skip if too short or too long
            if len(candidate) < 2 or len(candidate) > 50:
                continue

            # Skip if it's mostly numbers or special characters
            if re.match(r'^[\d\s\W]*$', candidate):
                continue

            standardized = self.standardize_skill_name(candidate)
            if not standardized:
                continue

            # Remove duplicates while preserving order
            key = standardized.lower()
            if key in seen:
                continue
            seen.add(key)
            final_skills.append(standardized)

        # Limit to top 15 skills to avoid clutter
        return ', '.join(final_skills[:15])

    def standardize_skill_name(self, skill):
        """Map a skill to its canonical spelling.

        Args:
            skill: Skill name in any letter case.

        Returns:
            The canonical name for known skills, the title-cased input for
            unknown skills longer than one character, otherwise ``None``.
        """
        skill_lower = skill.lower().strip()

        canonical = self._STANDARD_NAMES.get(skill_lower)
        if canonical is not None:
            return canonical
        if len(skill_lower) > 1:
            return skill.title()
        return None

    def get_internship_detail_url(self, container):
        """Return the href of the first detail-page link in ``container``.

        Only hrefs containing ``'detail'`` or ``'internship'`` are accepted.

        Returns:
            The href exactly as found in the HTML (possibly relative), or
            ``None`` when no link qualifies or the lookup fails.
        """
        try:
            for selector in self._DETAIL_LINK_SELECTORS:
                for link in container.select(selector):
                    href = link.get('href', '')
                    if href and ('detail' in href or 'internship' in href):
                        return href
        except Exception:
            # Best effort: a container that cannot be queried has no URL.
            return None

        return None

    def _find_internship_containers(self, soup):
        """Return the elements of the first container selector that matches anything."""
        for selector in self._LISTING_CONTAINER_SELECTORS:
            try:
                containers = soup.select(selector)
            except Exception:
                continue
            if containers:
                print(f"Found {len(containers)} internships using selector: {selector}")
                return containers
        return []

    def _build_record(self, container):
        """Build the result dict for one listing, or ``None`` if it is unusable.

        A listing is unusable when no title or no skills can be extracted. The
        title is checked first so that no detail page is fetched for a listing
        that would be discarded anyway.
        """
        job_title = self.extract_job_title(container)
        if job_title == "Not specified":
            return None

        company_name = self.extract_company_name(container)
        detail_url = self.get_internship_detail_url(container)
        skills_required = self.extract_skills_comprehensive(container, detail_url)
        if not skills_required:
            return None

        return {
            'platform': 'Internshala',
            'company_name': company_name,
            'job_title': job_title,
            'skills_required': skills_required,
            'detail_url': detail_url,
            'scraped_on': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }

    def scrape_internshala_with_skills_focus(self, max_pages=5):
        """Scrape Internshala listing pages and collect internships that have skills.

        Every internship with both a title and a non-empty skills string is
        appended to ``self.internships_data``. Failures on a single page or a
        single listing are printed and skipped. After each processed page the
        method sleeps for a random 2-4 seconds.

        Args:
            max_pages: Number of listing pages to visit, starting at page 1.
        """
        print("Starting Internshala scraping with focus on skills extraction...")
        base_url = "https://internshala.com/internships"

        for page in range(1, max_pages + 1):
            try:
                print(f"Scraping page {page}...")
                url = f"{base_url}/page-{page}" if page > 1 else base_url

                response = self.session.get(url, timeout=15)
                response.raise_for_status()

                soup = BeautifulSoup(response.content, 'html.parser')

                internship_containers = self._find_internship_containers(soup)
                if not internship_containers:
                    print(f"No internships found on page {page}")
                    continue

                total = len(internship_containers)
                for i, container in enumerate(internship_containers):
                    try:
                        print(f"Processing internship {i+1}/{total}")

                        record = self._build_record(container)
                        if record is None:
                            continue

                        self.internships_data.append(record)
                        print(f"✓ {record['job_title']} at {record['company_name']}")
                        print(f"  Skills: {record['skills_required'][:100]}...")

                    except Exception as e:
                        print(f"Error processing internship {i+1}: {e}")
                        continue

                print(f"Page {page} complete. Total internships: {len(self.internships_data)}")
                time.sleep(random.uniform(2, 4))

            except Exception as e:
                print(f"Error scraping page {page}: {e}")
                continue

        print(f"Scraping complete! Found {len(self.internships_data)} internships with skills data")

    def _first_text(self, container, selectors):
        """Return the stripped text of the first selector hit, or "Not specified"."""
        for selector in selectors:
            elem = container.select_one(selector)
            if elem:
                return elem.get_text(strip=True)
        return "Not specified"

    def extract_job_title(self, container):
        """Return the job title found in ``container``, or "Not specified"."""
        return self._first_text(container, self._TITLE_SELECTORS)

    def extract_company_name(self, container):
        """Return the company name found in ``container``, or "Not specified"."""
        return self._first_text(container, self._COMPANY_SELECTORS)

    def save_skills_data(self, filename='internship_skills_data.csv'):
        """Write the collected internships to CSV and return them as a DataFrame.

        Adds ``skills_count`` and ``has_skills`` columns, removes duplicate
        (company_name, job_title) pairs keeping the first, and sorts by
        ``skills_count`` in descending order.

        Args:
            filename: Path of the CSV file to write.

        Returns:
            The DataFrame that was written, or an empty DataFrame (and no file)
            when nothing has been collected.
        """
        if not self.internships_data:
            print("No data to save")
            return pd.DataFrame()

        df = pd.DataFrame(self.internships_data)

        # Add skills analysis
        df['skills_count'] = df['skills_required'].apply(lambda x: len(x.split(',')) if x else 0)
        df['has_skills'] = df['skills_required'].apply(lambda x: len(x.strip()) > 0 if x else False)

        # Remove duplicates
        df = df.drop_duplicates(subset=['company_name', 'job_title'], keep='first')

        # Sort by skills count (most skills first)
        df = df.sort_values('skills_count', ascending=False)

        df.to_csv(filename, index=False)
        print(f"Skills data saved to {filename}")
        print(f"Total internships with skills: {len(df[df['has_skills']])}")

        return df

# Separate focused skills scraper
def scrape_skills_only(job_urls_list, output_file='skills_from_urls.csv'):
    """
    Scrape skills from a list of internship detail-page URLs.

    Each URL is downloaded, its title, company and skills are extracted, and
    one row per URL is collected. A URL that fails is reported on stdout and
    still gets a row, with "Error" placeholders and the exception text in an
    ``error`` column. The rows are written to ``output_file``.

    Args:
        job_urls_list: List of internship URLs (absolute, or paths relative to
            https://internshala.com) to scrape skills from
        output_file: Path of the CSV file the results are written to

    Returns:
        DataFrame with job URLs and extracted skills
    """
    scraper = SkillsExtractionScraper()
    skills_data = []
    total = len(job_urls_list)

    for i, url in enumerate(job_urls_list):
        try:
            print(f"Processing URL {i+1}/{total}: {url[:50]}...")

            if not url.startswith('http'):
                # urljoin copes with paths both with and without a leading slash.
                url = urljoin('https://internshala.com', url)

            response = scraper.session.get(url, timeout=15)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract title
            title_elem = soup.select_one('h1, .job-title, .internship-title') or soup.select_one('title')
            job_title = title_elem.get_text(strip=True) if title_elem else "Not specified"

            # Extract company
            company_elem = soup.select_one('.company-name, [class*="company"]')
            company_name = company_elem.get_text(strip=True) if company_elem else "Not specified"

            # Extract skills using comprehensive method.
            # NOTE: because the page URL is passed as detail_url, the scraper
            # downloads this same page a second time when its first two
            # extraction methods find nothing in ``soup``.
            skills_required = scraper.extract_skills_comprehensive(soup, url)
            skills_count = len(skills_required.split(',')) if skills_required else 0

            skills_data.append({
                'job_url': url,
                'job_title': job_title,
                'company_name': company_name,
                'skills_required': skills_required,
                'skills_count': skills_count,
                'scraped_on': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            })

            print(f"✓ Found {skills_count} skills")
            time.sleep(random.uniform(1, 3))

        except Exception as e:
            print(f"Error processing {url}: {e}")
            skills_data.append({
                'job_url': url,
                'job_title': "Error",
                'company_name': "Error",
                'skills_required': "",
                'skills_count': 0,
                # Timestamp failed rows too so every row has the same core columns.
                'scraped_on': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                'error': str(e)
            })

    df = pd.DataFrame(skills_data)
    df.to_csv(output_file, index=False)
    print(f"Skills data saved to {output_file}")

    return df

# Usage examples
def main():
    """Demonstrate the two ways of collecting skills data.

    Option 1 runs for real: three listing pages are scraped and the result is
    saved to ``enhanced_internship_skills.csv``. Option 2 is only illustrated;
    the call that would scrape individual URLs is left commented out.

    Returns:
        The DataFrame returned by ``save_skills_data`` for option 1.
    """
    # Option 1: listing-page scrape with the skills-focused extractor
    print("=== OPTION 1: Enhanced Skills Scraping ===")
    skills_scraper = SkillsExtractionScraper()
    skills_scraper.scrape_internshala_with_skills_focus(max_pages=3)
    listing_df = skills_scraper.save_skills_data('enhanced_internship_skills.csv')

    # Option 2: skills for hand-picked detail pages
    print("\n=== OPTION 2: Skills from Specific URLs ===")
    # Placeholder paths only - substitute real detail-page URLs before use.
    sample_urls = [
        "/internship/detail/web-development-internship-in-bangalore-at-tech-company1234",
        "/internship/detail/digital-marketing-internship-in-mumbai-at-marketing-firm5678"
    ]

    # Enable the next line to run option 2:
    # skills_df = scrape_skills_only(sample_urls)

    return listing_df

# Script entry point: run the demo and keep the resulting DataFrame in the
# module-level name ``df``.
if __name__ == "__main__":
    df = main()

INFO:__main__:Starting internship scraping...
INFO:__main__:Scraping internships from internshala.com
INFO:__main__:Searching: site:internshala.com "web development internship" internship 2024 2025 -blog -article -news
INFO:__main__:Found 0 results for 'web development internship'
INFO:__main__:Searching: site:internshala.com "software development internship" internship 2024 2025 -blog -article -news
INFO:__main__:Found 0 results for 'software development internship'
INFO:__main__:Searching: site:internshala.com "data science internship" internship 2024 2025 -blog -article -news
INFO:__main__:Found 0 results for 'data science internship'
INFO:__main__:Searching: site:internshala.com "digital marketing internship" internship 2024 2025 -blog -article -news
INFO:__main__:Found 0 results for 'digital marketing internship'
INFO:__main__:Searching: site:internshala.com "graphic design internship" internship 2024 2025 -blog -article -news
INFO:__main__:Found 0 results for 'graphic design inte


=== DETAILED STATISTICS ===
"No data available. Run scrape_internships() first."

Recent internships (last 30 days): 0
Scraping failed. Please check your SerpAPI key and internet connection.
