# In [62]:
import whois
import ssl
import socket
import requests
import datetime
import re
import time
import json
import logging
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from transformers import pipeline

# Configure logging for the script: timestamp, level, then the message.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Load the DistilBERT SST-2 sentiment pipeline up front. Loading is
# best-effort: if it fails, the error is logged and the analyzer stays None.
sentiment_analyzer = None
try:
    sentiment_analyzer = pipeline(
        'sentiment-analysis',
        model='distilbert-base-uncased-finetuned-sst-2-english',
    )
except Exception as load_error:
    logging.error(f"Failed to load BERT model: {load_error}")

# Function to validate LinkedIn URL
def validate_linkedin_url(url):
    """Return True if *url* is a LinkedIn company-page URL.

    The accepted form is ``https://www.linkedin.com/company/<slug>`` with an
    optional single trailing slash, where ``<slug>`` is made of word
    characters and hyphens.

    Args:
        url: Candidate URL. Anything that is not a string is rejected.

    Returns:
        bool: True when the entire string matches the expected form.
    """
    if not isinstance(url, str):
        return False
    # fullmatch() rather than match() with '$': '$' also matches just before
    # a trailing newline, which would accept ".../company/acme\n".
    pattern = r'https://www\.linkedin\.com/company/[\w-]+/?'
    return re.fullmatch(pattern, url) is not None

# Function to parse follower count
def parse_followers_count(text):
    """Convert a follower label such as "1.2K followers" into an integer.

    The first number in the text is used. A ``k``/``thousand`` or
    ``m``/``million`` suffix scales it, but only when the suffix is a whole
    word, so "12 members" is 12 rather than 12 million. Thousands separators
    (commas) are ignored.

    Args:
        text: Raw follower text (or a number). Falsy values yield 0.

    Returns:
        int: The parsed follower count, or 0 when nothing can be parsed.
    """
    try:
        if not text:
            logging.warning("Follower text is empty")
            return 0
        logging.info(f"Raw follower text: '{text}'")
        cleaned = str(text).strip().lower().replace(',', '')
        # search() (not match()) so leading text such as "Acme | 5k followers"
        # still yields the number together with its unit.
        match = re.search(
            r'(\d+(?:\.\d+)?)(?:\s*(k|m|thousand|million)\b)?',
            cleaned,
        )
        if not match:
            logging.warning(f"Could not parse follower count from: '{cleaned}'")
            return 0
        number = float(match.group(1))
        unit = match.group(2)
        if unit is None:
            return int(number)
        multiplier = 1_000 if unit in ('k', 'thousand') else 1_000_000
        # round() instead of int() so float error cannot truncate the
        # scaled value one below the intended count.
        return round(number * multiplier)
    except Exception as e:
        logging.error(f"Error parsing followers count '{text}': {e}")
        return 0

# Function to parse reactions or comments
def parse_reaction_count(text):
    """Convert a reaction or comment label such as "1.2K" into an integer.

    The first number in the text is used. A ``k``/``thousand`` or
    ``m``/``million`` suffix scales it, but only when the suffix is a whole
    word, so "5 more reactions" is 5 rather than 5 million. Thousands
    separators (commas) are ignored.

    Args:
        text: Raw label (or a number). Falsy values yield 0.

    Returns:
        int: The parsed count, or 0 when nothing can be parsed.
    """
    try:
        if not text:
            return 0
        cleaned = str(text).strip().lower().replace(',', '')
        # search() (not match()) so labels like "Jane and 12k others" keep
        # both the number and its unit.
        match = re.search(
            r'(\d+(?:\.\d+)?)(?:\s*(k|m|thousand|million)\b)?',
            cleaned,
        )
        if not match:
            logging.warning(f"Could not parse reaction count from: '{cleaned}'")
            return 0
        number = float(match.group(1))
        unit = match.group(2)
        if unit is None:
            return int(number)
        multiplier = 1_000 if unit in ('k', 'thousand') else 1_000_000
        # round() instead of int() so float error cannot truncate the
        # scaled value one below the intended count.
        return round(number * multiplier)
    except Exception as e:
        logging.error(f"Error parsing reaction count '{text}': {e}")
        return 0

# Function to check if job is within past 1 month
def is_job_recent(posted):
    """Return True if a relative "posted" label is within roughly one month.

    Labels mentioning seconds, minutes, hours, "just now", "today" or
    "yesterday" are always recent. Day labels are recent up to 30 days and
    week labels up to 4 weeks. A label without a number ("a week ago") is
    treated as a count of one. Anything else (months, years, unknown text)
    is not recent.

    Args:
        posted: Label text such as "3 days ago". Falsy values yield False.

    Returns:
        bool: Whether the posting is considered recent.
    """
    if not posted:
        return False
    label = str(posted).lower()

    sub_day_markers = ('just now', 'second', 'minute', 'hour', 'today', 'yesterday')
    if any(marker in label for marker in sub_day_markers):
        return True

    number = re.search(r'\d+', label)
    count = int(number.group()) if number else 1

    if 'day' in label:
        return count <= 30
    if 'week' in label:
        return count <= 4
    return False

# Function to check WHOIS information
def check_whois(domain):
    """Look up WHOIS data for *domain* and return it as a plain dict.

    Multi-valued fields (the library may return a list for the domain name
    and for dates) are collapsed to their first entry, and date values are
    converted to ISO-8601 strings. Values that are already strings are
    passed through unchanged instead of aborting the whole lookup.

    Args:
        domain: Domain name to query, e.g. "example.com".

    Returns:
        dict: Keys ``domain_name``, ``registrar``, ``creation_date``,
        ``expiration_date``, ``last_updated`` and ``name_servers``; or
        ``{'error': <message>}`` if the lookup fails.
    """
    def _first(value):
        # Collapse a list/tuple to its first entry; an empty one becomes None.
        if isinstance(value, (list, tuple)):
            return value[0] if value else None
        return value

    def _iso(value):
        value = _first(value)
        # datetime.datetime subclasses datetime.date, so this covers both.
        if isinstance(value, datetime.date):
            return value.isoformat()
        return value

    try:
        w = whois.whois(domain)
        # NOTE(review): the original read only `last_updated`; python-whois
        # appears to expose this field as `updated_date`, so fall back to it
        # when `last_updated` is empty — confirm against the installed library.
        updated = getattr(w, 'last_updated', None) or getattr(w, 'updated_date', None)
        return {
            'domain_name': _first(w.domain_name),
            'registrar': w.registrar,
            'creation_date': _iso(w.creation_date),
            'expiration_date': _iso(w.expiration_date),
            'last_updated': _iso(updated),
            'name_servers': w.name_servers,
        }
    except Exception as e:
        logging.error(f"WHOIS check failed for {domain}: {e}")
        return {'error': str(e)}

# Function to check SSL certificate
def check_ssl(domain, timeout=10):
    """Fetch and summarise the TLS certificate served by *domain* on port 443.

    Args:
        domain: Host name to connect to; also used for SNI and verification.
        timeout: Seconds to wait for the TCP connection and handshake. The
            original call had no timeout and could block indefinitely on an
            unresponsive host.

    Returns:
        dict: ``issuer``, ``subject``, ``not_before``, ``not_after`` and
        ``serial_number`` of the peer certificate; or
        ``{'error': <message>}`` if the connection, handshake or certificate
        validation fails.
    """
    try:
        context = ssl.create_default_context()
        with socket.create_connection((domain, 443), timeout=timeout) as sock:
            with context.wrap_socket(sock, server_hostname=domain) as ssock:
                cert = ssock.getpeercert()
        # 'issuer' and 'subject' are sequences of RDNs, each a tuple of
        # (name, value) pairs; the first pair of each RDN is kept.
        return {
            'issuer': dict(rdn[0] for rdn in cert['issuer']),
            'subject': dict(rdn[0] for rdn in cert['subject']),
            'not_before': cert['notBefore'],
            'not_after': cert['notAfter'],
            'serial_number': cert['serialNumber'],
        }
    except Exception as e:
        logging.error(f"SSL check failed for {domain}: {e}")
        return {'error': str(e)}

# Function to check HTTPS status
def check_https(url):
    """Issue a HEAD request to *url* and report whether it ends up on HTTPS.

    Redirects are followed, so ``is_https`` describes the final URL.

    Returns:
        dict: ``is_https``, ``status_code`` and ``response_time`` (seconds).
        On any failure the dict also carries ``error`` and the other fields
        are False/None.
    """
    try:
        head = requests.head(url, timeout=5, allow_redirects=True)
        final_url = head.url
        elapsed_seconds = head.elapsed.total_seconds()
        summary = {
            'is_https': final_url.startswith('https://'),
            'status_code': head.status_code,
            'response_time': elapsed_seconds,
        }
        return summary
    except Exception as exc:
        logging.error(f"HTTPS check failed for {url}: {exc}")
        failure = {'error': str(exc)}
        failure['is_https'] = False
        failure['status_code'] = None
        failure['response_time'] = None
        return failure

# Function to calculate domain age
def calculate_domain_age(whois_data):
    """Compute how old a domain is from its WHOIS creation date.

    Args:
        whois_data: Mapping with a ``creation_date`` entry. The value may be
            an ISO-8601 string (optionally ending in ``Z`` or carrying a UTC
            offset), any string containing ``YYYY-MM-DD``, a ``datetime``, a
            ``date``, or a non-empty list/tuple of those (the first entry is
            used).

    Returns:
        dict: ``{'age_days': int, 'age_years': float}`` on success, otherwise
        ``{'error': <message>}``.
    """
    try:
        creation_date = whois_data.get('creation_date')
        if not creation_date:
            return {'error': 'No creation date found'}

        # The emptiness check above guarantees at least one entry here.
        if isinstance(creation_date, (list, tuple)):
            creation_date = creation_date[0]

        if isinstance(creation_date, str):
            raw = creation_date.strip()
            # fromisoformat() before Python 3.11 rejects a trailing 'Z';
            # spell it as an explicit UTC offset instead of dropping it.
            iso_text = raw[:-1] + '+00:00' if raw.endswith('Z') else raw
            try:
                creation_date = datetime.datetime.fromisoformat(iso_text)
            except ValueError:
                match = re.search(r'\d{4}-\d{2}-\d{2}', raw)
                if not match:
                    return {'error': 'Invalid creation date format'}
                creation_date = datetime.datetime.strptime(match.group(0), '%Y-%m-%d')
        elif isinstance(creation_date, datetime.date) and not isinstance(creation_date, datetime.datetime):
            # A plain date cannot be subtracted from a datetime.
            creation_date = datetime.datetime(
                creation_date.year, creation_date.month, creation_date.day
            )

        # Naive and aware datetimes cannot be subtracted from each other, so
        # pick a "now" of the same kind as the creation date.
        if creation_date.tzinfo is not None:
            current_date = datetime.datetime.now(datetime.timezone.utc)
        else:
            current_date = datetime.datetime.now()

        age = current_date - creation_date
        return {
            'age_days': age.days,
            'age_years': round(age.days / 365.25, 2)
        }
    except Exception as e:
        logging.error(f"Domain age calculation failed: {e}")
        return {'error': str(e)}

# Function to check website authenticity
def check_website_authenticity(url, company_name):
    """Score a company website on a few simple authenticity heuristics.

    The page at *url* is fetched and 5 points are awarded for each check
    that passes (maximum 20): professional wording, contact information,
    absence of placeholder text, and no broken links among the first ten
    absolute links on the page.

    Args:
        url: Website URL to fetch.
        company_name: Company name to look for in the page text. May be
            empty or None, in which case it is not used as an indicator.

    Returns:
        dict: Keys ``is_professional``, ``has_contact_info``,
        ``has_red_flags``, ``broken_links_count``, ``response_time`` and
        ``authenticity_score``. A non-200 response returns the defaults with
        only ``response_time`` filled in. If fetching or parsing raises, the
        defaults are returned with ``has_red_flags`` set to True.
    """
    try:
        result = {
            'is_professional': False,
            'has_contact_info': False,
            'has_red_flags': False,
            'broken_links_count': 0,
            'response_time': None,
            'authenticity_score': 0
        }

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10, allow_redirects=True)
        result['response_time'] = response.elapsed.total_seconds()

        if response.status_code != 200:
            logging.warning(f"Website returned status code {response.status_code}")
            return result

        soup = BeautifulSoup(response.text, 'html.parser')
        text_content = soup.get_text().lower()

        professional_indicators = [
            'about us', 'services', 'products', 'team', 'careers', 'contact',
            'privacy policy', 'terms of service'
        ]
        # An empty name must not be used as an indicator: '' is a substring
        # of every string, which would mark every page as professional.
        normalized_name = (company_name or '').strip().lower()
        if normalized_name:
            professional_indicators.insert(0, normalized_name)
        result['is_professional'] = any(
            indicator in text_content for indicator in professional_indicators
        )
        if result['is_professional']:
            result['authenticity_score'] += 5

        contact_phrases = ('email:', 'phone:', 'tel:', 'contact us', 'get in touch')
        # The TLD class is [A-Za-z]; the original "[A-Z|a-z]" also accepted a
        # literal '|' character.
        email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
        phone_pattern = re.compile(r'\b\d{3}-\d{3}-\d{4}\b')
        result['has_contact_info'] = (
            any(phrase in text_content for phrase in contact_phrases)
            or email_pattern.search(text_content) is not None
            or phone_pattern.search(text_content) is not None
        )
        if result['has_contact_info']:
            result['authenticity_score'] += 5

        red_flags = ['lorem ipsum', 'under construction', 'coming soon']
        result['has_red_flags'] = any(flag in text_content for flag in red_flags)
        if not result['has_red_flags']:
            result['authenticity_score'] += 5

        # Only absolute http(s) links can be requested directly. Filter
        # first, then cap at ten, so pages whose first links are all
        # relative still get their external links checked.
        hrefs = [a.get('href') for a in soup.find_all('a') if a.get('href')]
        absolute_links = [href for href in hrefs if href.startswith('http')]
        broken_links = 0
        for link in absolute_links[:10]:
            try:
                link_response = requests.head(link, timeout=5, allow_redirects=True)
                if link_response.status_code >= 400:
                    broken_links += 1
            except Exception:
                # Any failure to reach the link counts as broken; unlike a
                # bare except this lets KeyboardInterrupt/SystemExit through.
                broken_links += 1
        result['broken_links_count'] = broken_links
        if broken_links == 0:
            result['authenticity_score'] += 5

        logging.info(f"Website authenticity result: {result}")
        return result
    except Exception as e:
        logging.error(f"Website authenticity check failed for {url}: {e}")
        return {
            'is_professional': False,
            'has_contact_info': False,
            'has_red_flags': True,
            'broken_links_count': 0,
            'response_time': None,
            'authenticity_score': 0
        }

# Function to calculate engagement rate per post and average
def calculate_engagement_rate(posts, followers):
    """Compute per-post and average engagement relative to follower count.

    Engagement for a post is ``(reactions + comments) / followers * 100``.

    Args:
        posts: Sequence of dicts with ``reactions``, ``comments`` and
            ``commentary`` entries.
        followers: Follower text, parsed with ``parse_followers_count``.

    Returns:
        dict: ``average_engagement_rate``, ``average_reactions``,
        ``average_comments`` and ``per_post_engagement``. All zero/empty when
        there are no posts, no followers, or an error occurs.
    """
    def _no_engagement():
        # Fresh dict each time so callers never share the inner list.
        return {
            'average_engagement_rate': 0.0,
            'average_reactions': 0.0,
            'average_comments': 0.0,
            'per_post_engagement': []
        }

    try:
        if not (posts and followers):
            logging.warning("No posts or followers provided")
            return _no_engagement()

        followers_count = parse_followers_count(followers)
        logging.info(f"Parsed followers count: {followers_count}")
        if followers_count == 0:
            logging.warning("Followers count is zero, setting engagement to 0")
            return _no_engagement()

        post_count = len(posts)
        per_post_engagement = []
        reaction_total = 0
        comment_total = 0

        for position, post in enumerate(posts, start=1):
            n_reactions = parse_reaction_count(post['reactions'])
            n_comments = parse_reaction_count(post['comments'])
            body = post['commentary']

            logging.info(f"Post {position}: Reactions={n_reactions}, Comments={n_comments}, Commentary='{body[:50]}...'")

            rate = (n_reactions + n_comments) / followers_count * 100
            # Long commentary is shortened to a 50-character preview.
            if len(body) > 50:
                preview = body[:50] + '...'
            else:
                preview = body
            per_post_engagement.append({
                'commentary': preview,
                'engagement_rate': round(rate, 4),
                'reactions': n_reactions,
                'comments': n_comments
            })

            reaction_total += n_reactions
            comment_total += n_comments

        if post_count > 0:
            average_reactions = reaction_total / post_count
            average_comments = comment_total / post_count
            rate_sum = sum(entry['engagement_rate'] for entry in per_post_engagement)
            average_engagement_rate = rate_sum / post_count
        else:
            average_reactions = 0
            average_comments = 0
            average_engagement_rate = 0

        logging.info(f"Total posts: {post_count}, Total reactions: {reaction_total}, Total comments: {comment_total}")
        logging.info(f"Average engagement rate: {average_engagement_rate:.4f}%")

        return {
            'average_engagement_rate': round(average_engagement_rate, 4),
            'average_reactions': round(average_reactions, 2),
            'average_comments': round(average_comments, 2),
            'per_post_engagement': per_post_engagement
        }
    except Exception as exc:
        logging.error(f"Engagement rate calculation failed: {exc}")
        return _no_engagement()

# Function to analyze posts using BERT
def analyze_posts(posts):
    """Run sentiment analysis over each post's commentary.

    URLs and hashtags are stripped and the text is cut to 512 characters
    before it is passed to the module-level ``sentiment_analyzer``. A post is
    flagged professional when it has more than 20 words and the sentiment
    score exceeds 0.7.

    Returns:
        list: One dict per post with ``text``, ``sentiment``,
        ``sentiment_score`` and ``is_professional``. Empty when the analyzer
        is unavailable or when analysis of any post fails.
    """
    analysed = []
    if not sentiment_analyzer:
        logging.warning("BERT model not available, skipping NLP analysis")
        return analysed

    try:
        for post in posts:
            raw_text = post['commentary']
            # Drop links and hashtags, then cap the length.
            text = re.sub(r'http\S+|#\S+', '', raw_text)[:512]

            prediction = sentiment_analyzer(text)[0]
            label = prediction['label'].lower()
            score = prediction['score']

            word_count = len(text.split())
            is_professional = word_count > 20 and score > 0.7

            if label == 'positive':
                sentiment = 'positive'
            else:
                sentiment = 'negative'

            analysed.append({
                'text': text,
                'sentiment': sentiment,
                'sentiment_score': round(score, 2),
                'is_professional': is_professional
            })
        return analysed
    except Exception as exc:
        logging.error(f"BERT analysis failed: {exc}")
        return []

# Function to scrape LinkedIn jobs
def scrape_linkedin_company_jobs(company_name):
    """Scrape recent public LinkedIn job listings for a company.

    Opens the LinkedIn job search for "<company_name> jobs" in headless
    Chrome, scrolls to load results, and collects up to 20 listings that
    ``is_job_recent`` accepts.

    Args:
        company_name: Company name used as the search keyword. It is
            URL-encoded before being placed in the query string.

    Returns:
        dict: ``{'job_count': str, 'job_listings': list}``. Each listing has
        ``title``, ``company``, ``location``, ``posted`` and ``url``. Both
        fields are empty when a login wall is shown, no results are found,
        or scraping fails.
    """
    # Local import: the file imports only urlparse from urllib.parse.
    from urllib.parse import quote

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

    try:
        jobs_data = {
            'job_count': '',
            'job_listings': []
        }

        # Encode the keyword so spaces, '&', '#', etc. in a company name
        # cannot break or truncate the query string.
        keywords = quote(f"{company_name} jobs", safe='')
        jobs_url = f"https://www.linkedin.com/jobs/search/?keywords={keywords}"
        logging.info(f"Navigating to {jobs_url}")
        driver.get(jobs_url)
        time.sleep(5)

        try:
            cookie_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Accept')]"))
            )
            cookie_button.click()
            logging.info("Accepted cookie consent prompt")
        except Exception:
            # The prompt is optional; a timeout here is the normal case.
            logging.info("No cookie consent prompt found")

        login_prompt = driver.find_elements(By.CLASS_NAME, "authwall-join-form")
        if login_prompt:
            logging.warning("Login prompt detected. Job data may be restricted.")
            return jobs_data

        logging.info("Scrolling to load job listings")
        for _ in range(5):
            try:
                # Wait until the results list exists before each scroll.
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "ul.jobs-search__results-list"))
                )
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(3)
            except Exception:
                logging.warning("Job list not found during scroll")
                break

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        jobs_list = soup.select('ul.jobs-search__results-list')
        if not jobs_list:
            logging.error("No job listings found.")
            return jobs_data

        job_count_elem = soup.select_one('span.results-context-header__job-count')
        if job_count_elem:
            jobs_data['job_count'] = job_count_elem.text.strip()
            logging.info(f"Found job count: {jobs_data['job_count']}")

        for job in soup.select('ul.jobs-search__results-list > li'):
            # Keep at most 20 recent listings.
            if len(jobs_data['job_listings']) >= 20:
                break

            job_card = job.select_one('div[class*="base-card"]')
            if not job_card:
                continue

            title_elem = job_card.select_one('h3[class*="base-search-card__title"]')
            subtitle_elem = job_card.select_one('h4[class*="base-search-card__subtitle"]')
            location_elem = job_card.select_one('span.job-search-card__location')
            time_elem = job_card.select_one('time[class*="job-search-card__listdate"]')
            link_elem = job_card.select_one('a[class*="base-card__full-link"]')

            posted = time_elem.text.strip() if time_elem else ''
            if not is_job_recent(posted):
                continue

            job_data = {
                'title': title_elem.text.strip() if title_elem else '',
                'company': subtitle_elem.text.strip() if subtitle_elem else '',
                'location': location_elem.text.strip() if location_elem else '',
                'posted': posted,
                # .get() so an anchor without href cannot abort the scrape.
                'url': link_elem.get('href', '') if link_elem else '',
            }
            jobs_data['job_listings'].append(job_data)
            logging.info(f"Extracted job: {job_data['title']}")

        return jobs_data

    except Exception as e:
        logging.error(f"Error during job scraping: {e}")
        return {'job_count': '', 'job_listings': []}

    finally:
        driver.quit()
        
# Function to scrape LinkedIn company followers
def scrape_linkedin_company_followers(url, timeout=10):
    """Fetch a LinkedIn company page and extract its follower label.

    The count is looked up in the ``og:description`` meta tag or the top-card
    summary block; if neither yields a match, the dedicated follower-count
    span is used.

    Args:
        url: LinkedIn company page URL.
        timeout: Seconds to wait for the HTTP response. The original request
            had no timeout and could block indefinitely.

    Returns:
        str: Text such as "12,345 followers". "0 followers" when the page
        cannot be retrieved or no count is found.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            logging.warning(f"Unable to retrieve LinkedIn company page. Status code: {response.status_code}")
            return "0 followers"

        soup = BeautifulSoup(response.content, 'html.parser')
        followers_tag = soup.find('meta', {"property": "og:description"}) or \
                       soup.find('div', class_=re.compile('org-top-card-summary-info-list'))

        followers_match = None
        if followers_tag:
            if followers_tag.name == 'meta':
                text = followers_tag.get('content', '') or ''
            else:
                text = followers_tag.get_text(strip=True)
            # Optional K/M suffix and '+' so abbreviated counts such as
            # "1.2K followers" are captured as well.
            followers_match = re.search(
                r'\b(\d[\d,.]*[km]?\+?)\s+followers\b', text, re.IGNORECASE
            )

        followers_count = "0 followers"
        if followers_match:
            followers_count = followers_match.group(1) + " followers"
        else:
            # Fall back to the dedicated span whether the primary tag was
            # missing or simply did not contain a count.
            followers_alt = soup.find('span', class_='org-top-card-summary__follower-count')
            if followers_alt:
                followers_count = followers_alt.text.strip()

        logging.info(f"Extracted followers: {followers_count}")
        return followers_count
    except Exception as e:
        logging.error(f"Error scraping followers: {str(e)}")
        return "0 followers"

# Function to scrape LinkedIn company data from URL
def _scroll_until_stable(driver, max_rounds, pause=3):
    """Scroll to the page bottom until the page height stops growing.

    Gives up after *max_rounds* scrolls so an endlessly loading page cannot
    hang the scraper.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(max_rounds):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


def _click_see_all(driver, keyword):
    """Click the first "See all" button whose aria-label mentions *keyword*.

    Returns True if a button was clicked, False otherwise.
    """
    buttons = driver.find_elements(By.XPATH, "//button[contains(text(), 'See all')]")
    for button in buttons:
        # get_attribute() returns None when the button has no aria-label.
        label = button.get_attribute('aria-label') or ''
        if keyword in label.lower():
            driver.execute_script("arguments[0].click();", button)
            return True
    return False


def _about_field_text(soup, test_id, tag):
    """Return the stripped text of *tag* inside the about-us div *test_id*, or ''."""
    container = soup.find('div', attrs={'data-test-id': test_id})
    target = container.find(tag) if container else None
    return target.text.strip() if target else ''


def _post_comment_count(post):
    """Read the comment count from within a single post element.

    Reading it per post keeps counts aligned with their posts even when some
    posts have no comments element at all.
    """
    elem = post.find(attrs={'data-test-id': 'social-actions__comments'})
    if elem is not None:
        raw = elem.get('data-num-comments')
        source = raw if raw is not None else elem.get_text()
    else:
        elem = post.find('li', class_='social-details-social-counts__item--comments')
        if elem is None:
            return 0
        source = elem.get_text()
    digits = re.search(r'\d+', str(source).replace(',', ''))
    return int(digits.group()) if digits else 0


def scrape_linkedin_company_from_url(linkedin_url):
    """Scrape a public LinkedIn company page and run domain checks on its website.

    Collects the overview fields, office locations, up to 50 recent posts,
    recent job listings, post engagement, and (when a website is listed)
    WHOIS, SSL, HTTPS, domain-age and website-authenticity data.

    Args:
        linkedin_url: Company page URL containing ``/company/<slug>``.

    Returns:
        dict | None: A dict with ``overview``, ``jobs``, ``posts`` and
        ``domain_info`` entries, or None if scraping fails outright.
        Location and post extraction are best-effort: their failures are
        logged and leave the corresponding lists empty or partial.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

    try:
        # The URL slug with hyphens turned into spaces is the job-search keyword.
        company_name = linkedin_url.split('/company/')[-1].split('/')[0].replace('-', ' ')

        company_data = {
            'overview': {
                'name': '',
                'sector': '',
                'location': '',
                'followers': '',
                'website': '',
                'industry': '',
                'company_size': '',
                'headquarters': '',
                'founded': '',
                'locations': []
            },
            'jobs': {
                'job_count': '',
                'job_listings': []
            },
            'posts': [],
            'domain_info': {
                'whois': {},
                'ssl': {},
                'https': {},
                'domain_age': {},
                'website_authenticity': {},
                'engagement': {}
            }
        }
        overview = company_data['overview']
        domain_info = company_data['domain_info']

        logging.info(f"Navigating to {linkedin_url}")
        driver.get(linkedin_url)
        time.sleep(5)

        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)
        # Bounded: the original looped until the height stopped changing,
        # which never terminates on a page that keeps loading content.
        _scroll_until_stable(driver, max_rounds=30)

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        name_elem = soup.find('h1', class_='top-card-layout__title')
        if name_elem:
            overview['name'] = name_elem.text.strip()

        sector_elem = soup.find('h2', class_='top-card-layout__headline')
        if sector_elem:
            overview['sector'] = sector_elem.text.strip()

        overview['followers'] = scrape_linkedin_company_followers(linkedin_url)

        overview['website'] = _about_field_text(soup, 'about-us__website', 'a')
        overview['industry'] = _about_field_text(soup, 'about-us__industry', 'dd')
        overview['company_size'] = _about_field_text(soup, 'about-us__size', 'dd')
        overview['headquarters'] = _about_field_text(soup, 'about-us__headquarters', 'dd')
        overview['founded'] = _about_field_text(soup, 'about-us__foundedOn', 'dd')

        try:
            if _click_see_all(driver, 'locations'):
                time.sleep(3)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            locations_list = soup.find('ul', class_='show-more-less__list')
            if locations_list:
                for li in locations_list.find_all('li'):
                    address_div = li.find('div', id=lambda x: x and x.startswith('address-'))
                    if address_div:
                        address_parts = [p.text.strip() for p in address_div.find_all('p')]
                        overview['locations'].append({
                            'address': ', '.join(address_parts),
                            'is_primary': bool(li.find('span', class_='tag-sm tag-enabled'))
                        })
        except Exception as e:
            logging.error(f"Error extracting locations: {e}")

        try:
            if _click_see_all(driver, 'posts'):
                time.sleep(5)
                _scroll_until_stable(driver, max_rounds=10)
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            posts_list = soup.find_all('article', class_='main-feed-activity-card')
            for i, post in enumerate(posts_list[:50]):
                commentary = post.find('p', class_='attributed-text-segment-list__content')
                reactions = post.find('span', attrs={'data-test-id': 'social-actions__reaction-count'})
                if not reactions:
                    reactions = post.find('span', class_='social-details-social-counts__reactions-count')
                timestamp = post.find('time', class_='main-feed-activity-card__timestamp')

                comment_count = _post_comment_count(post)

                post_data = {
                    'commentary': commentary.text.strip() if commentary else '',
                    'reactions': reactions.text.strip() if reactions else '0',
                    'comments': str(comment_count) + (' Comments' if comment_count != 1 else ' Comment'),
                    'timestamp': timestamp.text.strip() if timestamp else ''
                }
                company_data['posts'].append(post_data)
                logging.info(f"Extracted post {i+1}: {post_data['commentary'][:50]}... (Timestamp: {post_data['timestamp']}, Comments: {post_data['comments']})")
        except Exception as e:
            logging.error(f"Error extracting posts: {e}")

        company_data['jobs'] = scrape_linkedin_company_jobs(company_name)

        # Engagement depends only on posts and followers, so compute it
        # whether or not the page lists a website.
        domain_info['engagement'] = calculate_engagement_rate(
            company_data['posts'],
            overview['followers']
        )

        website = overview['website']
        if website:
            try:
                parsed_url = urlparse(website)
                # A website given without a scheme parses into .path, not .netloc.
                domain = parsed_url.netloc or parsed_url.path.split('/')[0]
                if domain.startswith('www.'):
                    domain = domain[4:]

                domain_info['whois'] = check_whois(domain)
                domain_info['ssl'] = check_ssl(domain)
                domain_info['https'] = check_https(website)
                domain_info['domain_age'] = calculate_domain_age(domain_info['whois'])
                domain_info['website_authenticity'] = check_website_authenticity(website, overview['name'])
            except Exception as e:
                logging.error(f"Domain info processing failed for {website}: {e}")
                # Mark every domain check as failed; engagement is kept.
                for key in ('whois', 'ssl', 'https', 'domain_age', 'website_authenticity'):
                    domain_info[key] = {'error': str(e)}
        else:
            logging.warning("No website URL found for domain checks")
            domain_info['website_authenticity'] = {'error': 'No website URL provided'}

        return company_data

    except Exception as e:
        logging.error(f"Error during scraping: {e}")
        return None

    finally:
        driver.quit()

# Function to classify company type
def classify_company_type(structured_data):
    """Classify a company as ``'startup'`` or ``'established'``.

    Reads ``company_size``, ``followers`` and ``founded`` from
    ``structured_data['authenticity_indicators']['overview']`` and adds up:

    * 2 points when the company was founded less than 5 years ago,
    * 1 point when it has fewer than 10,000 followers,
    * 1 point when it is small (first number in the size text below 500, or
      the text names one of the small LinkedIn size bands).

    Args:
        structured_data: Dict holding the overview fields described above.
            ``followers`` may be the scraped text (parsed with
            ``parse_followers_count``) or an already-numeric count.

    Returns:
        ``'startup'`` when the score is at least 3, otherwise ``'established'``.
    """
    overview = structured_data['authenticity_indicators']['overview']
    size = overview['company_size']
    followers = overview['followers']
    founded = overview['founded']

    # An unparseable or missing size counts as 0 employees (i.e. "small").
    size_text = size if isinstance(size, str) else ''
    first_number = re.search(r'\d+', size_text.replace(',', ''))
    size_num = int(first_number.group()) if first_number else 0

    try:
        if isinstance(followers, (int, float)) and not isinstance(followers, bool):
            followers_num = int(followers)
        else:
            followers_num = parse_followers_count(followers)
    except Exception:
        # Best effort: any failure to obtain a count is treated as no followers.
        followers_num = 0

    try:
        years_since_founded = datetime.datetime.now().year - int(founded)
    except (TypeError, ValueError, OverflowError):
        # Unknown founding year: never treat the company as new.
        years_since_founded = float('inf')

    # The look-arounds keep a band from matching inside a larger range; a plain
    # substring test finds "1-10" inside "5,001-10,000".
    small_band = re.compile(r'(?<![\d,])(?:1-10|11-50|51-200|201-500)(?![\d,])')
    is_small = size_num < 500 or bool(small_band.search(size_text))
    is_new = years_since_founded < 5
    has_few_followers = followers_num < 10000

    score = 0
    if is_new:
        score += 2
    if has_few_followers:
        score += 1
    if is_small:
        score += 1

    return 'startup' if score >= 3 else 'established'

# Function to structure authenticity data
def structure_authenticity_data(company_data):
    """Reshape scraped company data into the layout used for authenticity scoring.

    Args:
        company_data: Dict with ``overview``, ``jobs``, ``posts`` and
            ``domain_info`` entries as produced by the company scraper.

    Returns:
        A new dict with the company name, empty ``linkedin_url`` and
        ``company_type`` placeholders, the grouped ``authenticity_indicators``
        (domain info, overview, jobs, posts) and the initial
        ``authenticity_score`` / ``is_likely_authentic`` values.

    Notes:
        ``domain_info`` entries are read tolerantly. When no website was found
        the scraper may record only ``website_authenticity``, so a missing
        entry becomes ``{'error': 'Not available'}`` and missing engagement
        becomes an all-zero summary instead of raising ``KeyError``.
    """
    overview = company_data['overview']
    jobs = company_data['jobs']
    posts = company_data['posts']
    domain_info = company_data.get('domain_info') or {}

    def domain_field(key):
        # Fall back to the error-dict convention used elsewhere for failed checks.
        value = domain_info.get(key)
        return value if value is not None else {'error': 'Not available'}

    engagement = domain_info.get('engagement') or {
        'average_engagement_rate': 0.0,
        'average_reactions': 0.0,
        'average_comments': 0.0,
        'per_post_engagement': [],
    }

    return {
        "company_name": overview['name'],
        "linkedin_url": "",
        "company_type": "",
        "authenticity_indicators": {
            "domain_info": {
                "whois": domain_field('whois'),
                "ssl": domain_field('ssl'),
                "https": domain_field('https'),
                "domain_age": domain_field('domain_age'),
                "website_authenticity": domain_field('website_authenticity'),
            },
            "overview": {
                "website": overview['website'],
                "industry": overview['industry'],
                "company_size": overview['company_size'],
                "headquarters": overview['headquarters'],
                "founded": overview['founded'],
                "locations": overview['locations'],
                "followers": overview['followers'],
            },
            "jobs": {
                "job_count": jobs['job_count'],
                "job_listings": jobs['job_listings'],
            },
            "posts": {
                "engagement": engagement,
                "post_count": len(posts),
                "post_analysis": analyze_posts(posts),
            },
        },
        "authenticity_score": 0.0,
        "is_likely_authentic": False,
    }

# Function to validate authenticity
def validate_authenticity(structured_data, linkedin_url):
    """Score the collected indicators and flag whether the company looks authentic.

    Mutates and returns ``structured_data``: sets ``linkedin_url``,
    ``company_type`` (from ``classify_company_type``), ``authenticity_score``,
    ``is_likely_authentic`` and ``authenticity_indicators['validation']``.

    Points are awarded per passed check (domain/WHOIS, SSL, website content,
    overview, jobs, posts). Thresholds are looser for startups. Every check is
    recorded in the validation dict as an explicit boolean, and points that
    depend on website analysis are only given when that analysis actually
    produced the relevant field.

    Args:
        structured_data: Dict shaped like the output of
            ``structure_authenticity_data``.
        linkedin_url: The company page URL to record on the result.

    Returns:
        The same ``structured_data`` dict, updated in place.
    """
    structured_data['linkedin_url'] = linkedin_url
    company_type = classify_company_type(structured_data)
    structured_data['company_type'] = company_type
    is_startup = company_type == 'startup'

    indicators = structured_data['authenticity_indicators']
    domain_info = indicators['domain_info']
    overview = indicators['overview']
    raw_website = overview.get('website')
    website = raw_website if isinstance(raw_website, str) else ''
    # The certificate expiry string carries a GMT suffix, so compare in UTC.
    now_utc = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    score = 0

    # --- Domain registration (WHOIS) ---
    whois_info = domain_info['whois'] or {}
    domain_names = whois_info.get('domain_name')
    if isinstance(domain_names, str):
        domain_names = [domain_names]
    elif not isinstance(domain_names, (list, tuple, set)):
        domain_names = []
    website_lower = website.lower()
    is_domain_match = any(
        isinstance(name, str) and name and name.lower() in website_lower
        for name in domain_names
    )
    if is_domain_match:
        score += 15

    reputable_registrars = ('MarkMonitor, Inc.', 'GoDaddy.com, LLC', 'Namecheap, Inc.', 'Cloudflare, Inc.')
    is_reputable_registrar = whois_info.get('registrar') in reputable_registrars
    if is_reputable_registrar:
        score += 10

    min_age_years = 0.25 if is_startup else 0.5
    created = _parse_whois_creation_date(whois_info.get('creation_date'))
    is_domain_old = created is not None and (now_utc - created).days / 365.25 > min_age_years
    if is_domain_old:
        score += 10

    # --- TLS certificate ---
    ssl_info = domain_info['ssl'] or {}
    issuer = ssl_info.get('issuer')
    issuer_name = issuer.get('commonName') if isinstance(issuer, dict) else None
    if not isinstance(issuer_name, str):
        issuer_name = ''
    trusted_issuers = ('DigiCert', 'Let\'s Encrypt', 'Sectigo', 'GlobalSign', 'Cloudflare')
    is_trusted_issuer = any(name in issuer_name for name in trusted_issuers)
    if is_trusted_issuer:
        score += 5

    try:
        not_after = datetime.datetime.strptime(ssl_info['not_after'], '%b %d %H:%M:%S %Y GMT')
        is_valid_ssl = not_after > now_utc
    except (KeyError, TypeError, ValueError):
        is_valid_ssl = False
    if is_valid_ssl:
        score += 5

    # --- Website content analysis ---
    website_auth = domain_info['website_authenticity'] or {}
    website_validation = {
        'is_professional': bool(website_auth.get('is_professional')),
        'has_contact_info': bool(website_auth.get('has_contact_info')),
        # Absence of data is not evidence of a clean site: require the field.
        'no_red_flags': 'has_red_flags' in website_auth and not website_auth['has_red_flags'],
        'no_broken_links': website_auth.get('broken_links_count') == 0,
    }
    score += 5 * sum(1 for passed in website_validation.values() if passed)

    # --- Overview ---
    has_valid_website = website.startswith('http')
    if has_valid_website:
        score += 10

    has_location = bool(overview.get('headquarters') or overview.get('locations'))
    # For startups a website alone is accepted in place of a listed location.
    has_physical_presence = has_location or (is_startup and bool(website))
    if has_physical_presence:
        score += 10

    try:
        followers = parse_followers_count(overview['followers'])
        # NOTE(review): sqrt(followers)/100 only reaches the 20-point cap at
        # 4,000,000 followers - confirm this scale is intended.
        follower_score = min(20, int(10 * (followers ** 0.5) / 1000))
        min_followers = 500 if is_startup else 5000
        has_significant_followers = followers > min_followers
    except (KeyError, TypeError, ValueError):
        has_significant_followers = False
        follower_score = 0
    if has_significant_followers:
        score += follower_score

    # --- Jobs ---
    jobs = indicators['jobs']
    raw_job_count = jobs['job_count']
    digits = re.sub(r'\D', '', str(raw_job_count)) if raw_job_count else ''
    job_count = int(digits) if digits else 0
    min_jobs = 1 if is_startup else 3
    has_multiple_jobs = job_count >= min_jobs
    if has_multiple_jobs:
        score += 10 + min(10, job_count)

    has_active_jobs = False
    for job in jobs['job_listings'] or []:
        posted = job.get('posted') if isinstance(job, dict) else None
        if isinstance(posted, str) and any(marker in posted for marker in ('ago', 'day', 'week')):
            has_active_jobs = True
            break
    if has_active_jobs:
        score += 10

    # --- Posts ---
    posts = indicators['posts']
    min_posts = 3 if is_startup else 5
    has_recent_posts = posts['post_count'] >= min_posts
    if has_recent_posts:
        score += 5

    post_analysis = posts['post_analysis'] or []
    professional_threshold = 0.4 if is_startup else 0.6
    professional_posts = sum(
        1 for post in post_analysis
        if post.get('is_professional') and post.get('sentiment') == 'positive'
    )
    has_professional_content = professional_posts / max(1, len(post_analysis)) > professional_threshold
    if has_professional_content:
        score += 5

    validations = {
        'domain_info': {
            'is_domain_match': is_domain_match,
            'is_reputable_registrar': is_reputable_registrar,
            'is_domain_old': is_domain_old,
            'is_trusted_issuer': is_trusted_issuer,
            'is_valid_ssl': is_valid_ssl,
            'website_validation': website_validation,
        },
        'overview': {
            'has_valid_website': has_valid_website,
            'has_physical_presence': has_physical_presence,
            'has_significant_followers': has_significant_followers,
        },
        'jobs': {
            'has_multiple_jobs': has_multiple_jobs,
            'has_active_jobs': has_active_jobs,
        },
        'posts': {
            'has_recent_posts': has_recent_posts,
            'has_professional_content': has_professional_content,
        },
    }

    structured_data['authenticity_score'] = score
    authenticity_threshold = 50 if is_startup else 60
    structured_data['is_likely_authentic'] = score >= authenticity_threshold
    structured_data['authenticity_indicators']['validation'] = validations
    return structured_data


def _parse_whois_creation_date(value):
    """Return a WHOIS creation date as a naive UTC datetime, or None if unusable.

    Accepts an ISO-8601 string (a ``Z`` suffix is tolerated), a ``datetime``,
    or a list/tuple of those, in which case the earliest usable one is returned.
    """
    candidates = value if isinstance(value, (list, tuple)) else [value]
    parsed = []
    for candidate in candidates:
        if isinstance(candidate, str):
            try:
                candidate = datetime.datetime.fromisoformat(candidate.strip().replace('Z', ''))
            except ValueError:
                continue
        if not isinstance(candidate, datetime.datetime):
            continue
        if candidate.tzinfo is not None:
            # Normalise aware values so they can be subtracted from naive UTC "now".
            candidate = candidate.astimezone(datetime.timezone.utc).replace(tzinfo=None)
        parsed.append(candidate)
    return min(parsed) if parsed else None

# Function to scrape leaders and affiliated pages
def scrape_linkedin_sections(linkedin_url, max_scrolls=30):
    """Scrape the leaders and affiliated pages listed on a company's /life page.

    Args:
        linkedin_url: LinkedIn company URL; rejected unless
            ``validate_linkedin_url`` accepts it.
        max_scrolls: Upper bound on scroll-to-bottom attempts while waiting for
            lazily loaded content, so a page that keeps growing cannot hang
            the scraper.

    Returns:
        Dict with ``company_name`` (derived from the URL slug),
        ``linkedin_url``, ``leaders`` and ``affiliated_pages``; or ``None``
        when the URL is invalid or anything fails while driving the browser.
    """
    if not validate_linkedin_url(linkedin_url):
        logging.error("Invalid LinkedIn company URL")
        return None

    company_name = linkedin_url.split('/company/')[-1].split('/')[0].replace('-', ' ').title()

    scraped_data = {
        'company_name': company_name,
        'linkedin_url': linkedin_url,
        'leaders': [],
        'affiliated_pages': []
    }

    chrome_options = Options()
    for argument in (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-blink-features=AutomationControlled",
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    ):
        chrome_options.add_argument(argument)

    driver = None
    try:
        # Created inside the try so a driver start-up failure also yields None.
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

        life_url = f"{linkedin_url.rstrip('/')}/life"
        logging.info(f"Navigating to {life_url}")
        driver.get(life_url)
        time.sleep(5)

        # Scroll until the page height stops growing, but never indefinitely.
        last_height = driver.execute_script("return document.body.scrollHeight")
        for _ in range(max_scrolls):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(3)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
        else:
            logging.warning(f"Stopped scrolling after {max_scrolls} attempts; page may be partially loaded")

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        leaders_section = soup.find('section', attrs={'data-test-id': 'leaders-at'})
        if leaders_section:
            for leader in leaders_section.find_all('li'):
                leader_data = {
                    'name': _element_text(leader.find('h3', class_='base-main-card__title')),
                    'title': _element_text(leader.find('h4', class_='base-main-card__subtitle')),
                    'profile_url': _element_href(leader.find('a', class_='base-card--link'))
                }
                # Entries without both a name and a link are skipped.
                if leader_data['name'] and leader_data['profile_url']:
                    scraped_data['leaders'].append(leader_data)
                    logging.info(f"Extracted leader: {leader_data['name']}, {leader_data['title']}")

        affiliated_section = soup.find('section', attrs={'data-test-id': 'affiliated-pages'})
        if affiliated_section:
            for affiliate in affiliated_section.find_all('li'):
                affiliate_data = {
                    'name': _element_text(affiliate.find('h3', class_='base-aside-card__title')),
                    'subtitle': _element_text(affiliate.find('p', class_='base-aside-card__subtitle')),
                    'location': _element_text(affiliate.find('p', class_='base-aside-card__second-subtitle')),
                    'url': _element_href(affiliate.find('a', class_='base-aside-card--link'))
                }
                if affiliate_data['name'] and affiliate_data['url']:
                    scraped_data['affiliated_pages'].append(affiliate_data)
                    logging.info(f"Extracted affiliated page: {affiliate_data['name']}")

        return scraped_data

    except Exception as e:
        logging.error(f"Error during scraping: {e}")
        return None

    finally:
        if driver is not None:
            driver.quit()


def _element_text(element):
    """Return the stripped text of a parsed element, or '' when it is missing."""
    return element.text.strip() if element else ''


def _element_href(element):
    """Return an anchor's href, or '' when the element or attribute is missing."""
    return (element.get('href') or '') if element else ''

# Function to process user-provided LinkedIn URL for both authenticity check and sections scraping
def check_and_scrape_linkedin_company():
    """Prompt for a LinkedIn company URL, analyse it, save and print the results.

    Steps: validate the URL, scrape the company page, structure and score the
    data, scrape leaders/affiliated pages (falling back to empty lists when
    that fails), write everything to ``<company_name>_company_data.json`` in
    the working directory and print a summary.

    Returns:
        Dict with ``authenticity`` and ``sections`` entries, or ``None`` when
        the URL is invalid, the company scrape fails, or any step raises.
    """
    try:
        linkedin_url = input("Enter the LinkedIn company URL (e.g., https://www.linkedin.com/company/yahoo/): ").strip()

        if not validate_linkedin_url(linkedin_url):
            print("Invalid LinkedIn company URL. Please provide a valid URL like https://www.linkedin.com/company/company-name/")
            return None

        print("Scraping company data for authenticity check...")
        company_data = scrape_linkedin_company_from_url(linkedin_url)
        if not company_data:
            logging.error("Failed to scrape company data for authenticity")
            print("Failed to scrape company data for authenticity. Check logs for details.")
            return None

        structured_data = structure_authenticity_data(company_data)

        print("Validating authenticity...")
        validated_data = validate_authenticity(structured_data, linkedin_url)

        print("Scraping leaders, and affiliated pages...")
        sections_data = scrape_linkedin_sections(linkedin_url)
        if not sections_data:
            logging.error("Failed to scrape leaders, and affiliated pages")
            print("Failed to scrape leaders, and affiliated pages. Check logs for details.")
            sections_data = {
                'company_name': validated_data['company_name'],
                'linkedin_url': linkedin_url,
                'leaders': [],
                'affiliated_pages': []
            }

        # Combine both data sets into a single dictionary
        combined_data = {
            'authenticity': validated_data,
            'sections': sections_data
        }

        # Save combined data to a single JSON file. The scraped name is
        # untrusted text, so anything that is not a word character, dot or
        # hyphen is replaced before it is used as a file name.
        raw_name = str(validated_data['company_name'] or '')
        company_name = re.sub(r'[^\w.-]', '_', raw_name.lower().replace(' ', '_')).strip('.') or 'company'
        output_file = f"{company_name}_company_data.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(combined_data, f, indent=2, ensure_ascii=False)

        indicators = validated_data['authenticity_indicators']
        overview = indicators['overview']
        engagement = indicators['posts']['engagement'] or {}
        # When the website check failed this dict only carries an 'error' key,
        # so every field is read with a fallback instead of being indexed.
        website_auth = indicators['domain_info']['website_authenticity'] or {}
        missing = 'N/A'

        print(f"\n=== Analysis for {validated_data['company_name']} ===")
        print(f"LinkedIn URL: {validated_data['linkedin_url']}")
        print("\n--- Authenticity Check Results ---")
        print(f"Company Type: {validated_data['company_type'].capitalize()}")
        print(f"Founded: {overview['founded']}")
        print(f"Followers: {overview['followers']}")
        print(f"Employee Count: {overview['company_size']}")
        print(f"Job Count: {indicators['jobs']['job_count']}")
        print(f"Post Count: {indicators['posts']['post_count']}")
        print(f"Average Engagement Rate: {engagement.get('average_engagement_rate', missing)}%")
        print(f"Average Reactions per Post: {engagement.get('average_reactions', missing)}")
        print(f"Average Comments per Post: {engagement.get('average_comments', missing)}")
        print(f"Website Authenticity Score: {website_auth.get('authenticity_score', missing)}/20")
        print(f"Website Professional: {website_auth.get('is_professional', missing)}")
        print(f"Website Has Contact Info: {website_auth.get('has_contact_info', missing)}")
        print(f"Website Has Red Flags: {website_auth.get('has_red_flags', missing)}")
        print(f"Website Broken Links: {website_auth.get('broken_links_count', missing)}")
        print(f"Authenticity Score: {validated_data['authenticity_score']}/150")
        print(f"Is Likely Authentic: {validated_data['is_likely_authentic']}")
        print(f"Results saved to {output_file}")

        return combined_data

    except Exception as e:
        # logging.exception keeps the traceback so the failing step can be found.
        logging.exception(f"Analysis failed: {e}")
        print(f"Error: {str(e)}. Check logs for details.")
        return None

# Main entry point
if __name__ == "__main__":
    # Run the interactive analysis once and report the overall outcome.
    result = check_and_scrape_linkedin_company()
    outcome_message = (
        "\nCompany analysis completed successfully."
        if result
        else "\nCompany analysis failed."
    )
    print(outcome_message)

2025-05-14 18:55:12,566 - ERROR - Failed to load BERT model: Failed to import transformers.models.distilbert.modeling_tf_distilbert because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.


Enter the LinkedIn company URL (e.g., https://www.linkedin.com/company/yahoo/):  https://www.linkedin.com/company/cybrisk-cyber




Scraping company data for authenticity check...


2025-05-14 18:55:29,380 - INFO - Get LATEST chromedriver version for google-chrome
2025-05-14 18:55:29,444 - INFO - Get LATEST chromedriver version for google-chrome
2025-05-14 18:55:29,502 - INFO - Driver [C:\Users\AnushaM\.wdm\drivers\chromedriver\win64\136.0.7103.92\chromedriver-win32/chromedriver.exe] found in cache
2025-05-14 18:55:30,656 - INFO - Navigating to https://www.linkedin.com/company/cybrisk-cyber
2025-05-14 18:55:56,222 - INFO - Extracted followers: 1,569 followers
2025-05-14 18:55:57,225 - INFO - Extracted post 1: We’re thrilled to share that Cybrisk is now offici... (Timestamp: , Comments: 1 Comment)
2025-05-14 18:55:57,226 - INFO - Extracted post 2: We’re thrilled to share that Cybrisk is now offici... (Timestamp: , Comments: 1 Comment)
2025-05-14 18:55:57,229 - INFO - Extracted post 3: In today’s digital world, phishing attacks and fra... (Timestamp: , Comments: 1 Comment)
2025-05-14 18:55:57,231 - INFO - Extracted post 4: Cyber Awareness vs. Cybersecurity – Are You

Validating authenticity...
Scraping leaders, and affiliated pages...


2025-05-14 18:56:32,457 - INFO - Get LATEST chromedriver version for google-chrome
2025-05-14 18:56:32,517 - INFO - Get LATEST chromedriver version for google-chrome
2025-05-14 18:56:32,603 - INFO - Driver [C:\Users\AnushaM\.wdm\drivers\chromedriver\win64\136.0.7103.92\chromedriver-win32/chromedriver.exe] found in cache
2025-05-14 18:56:33,859 - INFO - Navigating to https://www.linkedin.com/company/cybrisk-cyber/life
2025-05-14 18:56:57,921 - INFO - Extracted affiliated page: CTRL THREATS
2025-05-14 18:56:57,922 - INFO - Extracted affiliated page: CTRL FAKE



=== Analysis for F9 Cybrisk Tech Private Limited ===
LinkedIn URL: https://www.linkedin.com/company/cybrisk-cyber

--- Authenticity Check Results ---
Company Type: Startup
Founded: 2024
Followers: 1,569 followers
Employee Count: 11-50 employees
Job Count: 
Post Count: 50
Average Engagement Rate: 0.5201%
Average Reactions per Post: 8.08
Average Comments per Post: 0.08
Website Authenticity Score: 0/20
Website Professional: False
Website Has Contact Info: False
Website Has Red Flags: False
Website Broken Links: 0
Authenticity Score: 75/150
Is Likely Authentic: True
Results saved to f9_cybrisk_tech_private_limited_company_data.json

Company analysis completed successfully.
