<a href="https://colab.research.google.com/github/NataKrj/AI-project-2024/blob/SergejsKopils/20241109_Al%2Bproject_TSI_API_web_scrapping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**WORKS WELL** — Google search, 1 page of results

In [None]:
from googleapiclient.discovery import build
import requests
from bs4 import BeautifulSoup
import pandas as pd
import openai
import time

# Set up your API keys
# Each credential is read from the environment when the matching variable is
# set, so real keys do not have to be pasted into (and committed with) the
# notebook. The 'xxx' placeholders remain as fallbacks and can still be
# edited by hand.
import os

google_api_key = os.environ.get('GOOGLE_API_KEY', 'xxx')
google_cse_id = os.environ.get('GOOGLE_CSE_ID', 'xxx')
openai.api_key = os.environ.get('OPENAI_API_KEY', 'xxx')

# Keywords to search for.
# contains_keywords() looks for each of these, case-insensitively, as a
# substring of the text extracted from a page.
keywords = [
    "court",
    "criminal case",
    "accusation",
    "crime",
    "corruption",
    "penalty",
    "investigation",
    "insolvency",
    "debt",
    "violation",
    "arrested",
    "sanctions",
    "litigation",
    "shell company",
    "blackmail",
]

# Function to perform Google search
def google_search(search_term, api_key, cse_id, **kwargs):
    """Run a single Google Custom Search query and return its result items.

    Args:
        search_term: Query string, sent to the API as ``q``.
        api_key: Developer key used to build the ``customsearch`` v1 client.
        cse_id: Search engine identifier, sent to the API as ``cx``.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``cse().list(...)``.

    Returns:
        The value stored under ``'items'`` in the response, or an empty list
        when the response has no such key.
    """
    client = build("customsearch", "v1", developerKey=api_key)
    request = client.cse().list(q=search_term, cx=cse_id, **kwargs)
    response = request.execute()
    return response.get('items', [])

# Function to extract text from a URL
def extract_text_from_url(url, timeout=10):
    """Download a page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would block the whole run.

    Returns:
        The text of every ``<p>`` element joined with single spaces, or an
        empty string when the request fails (including a timeout) or the
        response status code is not 200. Failures are reported with
        ``print`` rather than raised.
    """
    try:
        response = requests.get(
            url,
            headers={'User-Agent': 'Mozilla/5.0'},
            timeout=timeout,
        )
    except requests.RequestException as e:
        # Covers connection errors and timeouts alike.
        print(f"Request failed: {e}")
        return ""

    if response.status_code != 200:
        print(f"Failed to fetch {url} with status code {response.status_code}")
        return ""

    soup = BeautifulSoup(response.text, 'html.parser')
    return ' '.join(p.text for p in soup.find_all('p'))

# Function to check if text contains any keywords
def contains_keywords(text, keywords):
    """Report which keywords occur in ``text``.

    Matching is case-insensitive and substring-based, so a keyword also
    matches inside a longer word.

    Args:
        text: Text to scan. ``None`` is treated like an empty string.
        keywords: Iterable of keyword strings to look for.

    Returns:
        The matched keywords, in their original spelling and order, joined
        with ``", "``; or the string ``"No match"`` when none occur.
    """
    # Lower-case the (possibly large) text once instead of once per keyword.
    haystack = (text or "").lower()
    matched_keywords = [kw for kw in keywords if kw.lower() in haystack]
    return ", ".join(matched_keywords) if matched_keywords else "No match"

# Function to analyze text with GPT-4
def analyze_text_with_gpt(text, company_name, max_chars=15000):
    """Ask GPT-4 whether ``text`` is related to ``company_name``.

    Args:
        text: Page text to send as the user message.
        company_name: Company named in the system prompt.
        max_chars: Upper bound on the number of characters of ``text`` that
            are sent. Text beyond this is dropped so that a very long page
            does not make the request fail for exceeding the model's input
            limit. Pass ``None`` to send the text unmodified.
            NOTE(review): 15000 is a conservative guess for gpt-4's context
            window; tune it for the model actually used.

    Returns:
        The model's reply with surrounding whitespace stripped, or the
        string ``"Analysis failed."`` when the OpenAI call raises
        ``openai.error.OpenAIError`` (the error is printed).
    """
    if max_chars is not None:
        text = text[:max_chars]

    # NOTE(review): `openai.ChatCompletion` and `openai.error` belong to the
    # pre-1.0 interface of the openai package; confirm the installed version.
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": f"Determine if the following text is related to {company_name}."},
                {"role": "user", "content": text}
            ],
            max_tokens=150
        )
        return response['choices'][0]['message']['content'].strip()
    except openai.error.OpenAIError as e:
        print(f"OpenAI API error: {e}")
        return "Analysis failed."

# Main process: search for the company, analyse every hit, save a CSV.
company_name = 'SwedBank'
results = google_search(company_name, google_api_key, google_cse_id, num=10)

# One dict per search hit. The DataFrame is built once after the loop rather
# than grown with pd.concat on every iteration, which re-copies all previous
# rows each time and starts from an empty frame.
rows = []

for result in results:
    link = result['link']

    # Extract text from each URL
    extracted_text = extract_text_from_url(link)

    # Analyze extracted text with GPT-4 to determine relevance to the company
    if extracted_text:
        gpt_analysis = analyze_text_with_gpt(extracted_text, company_name)
        related_keywords = contains_keywords(extracted_text, keywords)
    else:
        gpt_analysis = "No text extracted."
        related_keywords = "No text extracted."

    rows.append({
        'company': company_name,
        'url': link,
        'extracted_text': extracted_text,
        'gpt_analysis': gpt_analysis,
        'related_keywords': related_keywords,  # Matched keywords
    })

    # Delay to avoid rate limits
    time.sleep(1)  # Adjust delay if necessary

# Passing `columns` keeps the header (and column order) even when there were
# no search results at all.
data = pd.DataFrame(
    rows,
    columns=['company', 'url', 'extracted_text', 'gpt_analysis', 'related_keywords'],
)

# Save the results to a CSV file
data.to_csv('company_analysis_results.csv', index=False)
print(data)


**WORKS WELL** — Google search, 10 pages of results

In [None]:
from googleapiclient.discovery import build
import requests
from bs4 import BeautifulSoup
import pandas as pd
import openai
import time

# Set up your API keys
# Each credential is read from the environment when the matching variable is
# set, so real keys do not have to be pasted into (and committed with) the
# notebook. The 'xxx' placeholders remain as fallbacks and can still be
# edited by hand.
import os

google_api_key = os.environ.get('GOOGLE_API_KEY', 'xxx')
google_cse_id = os.environ.get('GOOGLE_CSE_ID', 'xxx')
openai.api_key = os.environ.get('OPENAI_API_KEY', 'xxx')

# Keywords to search for.
# contains_keywords() looks for each of these, case-insensitively, as a
# substring of the text extracted from a page.
keywords = [
    "court",
    "criminal case",
    "accusation",
    "crime",
    "corruption",
    "penalty",
    "investigation",
    "insolvency",
    "debt",
    "violation",
    "arrested",
    "sanctions",
    "litigation",
    "shell company",
    "blackmail",
]

# Function to perform Google search and get multiple pages
def google_search(search_term, api_key, cse_id, num_pages=10):
    """Collect several pages of Google Custom Search results.

    Pages are requested one after another with ``start`` = 1, 11, 21, ...
    Paging stops early as soon as a page comes back without items, so no
    further (billable) requests are spent on a query that has run dry.

    Args:
        search_term: Query string, sent to the API as ``q``.
        api_key: Developer key used to build the ``customsearch`` v1 client.
        cse_id: Search engine identifier, sent to the API as ``cx``.
        num_pages: Maximum number of 10-result pages to request. Values above
            10 are capped, because the Custom Search JSON API does not serve
            results beyond the first 100 and rejects such requests.

    Returns:
        A list with the ``'items'`` entries of all fetched pages, in order.
    """
    max_pages = 10  # 10 pages x 10 results = the API's 100-result ceiling
    service = build("customsearch", "v1", developerKey=api_key)
    results = []

    for page in range(min(num_pages, max_pages)):
        if page:
            # Pause between consecutive requests to avoid rate limits; there
            # is nothing to wait for before the first or after the last one.
            time.sleep(1)

        start_index = page * 10 + 1  # 1, 11, 21, ... for each page
        res = service.cse().list(q=search_term, cx=cse_id, start=start_index).execute()

        items = res.get('items')
        if not items:
            break  # no more results for this query
        results.extend(items)

    return results

# Function to extract text from a URL
def extract_text_from_url(url, timeout=10):
    """Download a page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would block the whole run.

    Returns:
        The text of every ``<p>`` element joined with single spaces, or an
        empty string when the request fails (including a timeout) or the
        response status code is not 200. Failures are reported with
        ``print`` rather than raised.
    """
    try:
        response = requests.get(
            url,
            headers={'User-Agent': 'Mozilla/5.0'},
            timeout=timeout,
        )
    except requests.RequestException as e:
        # Covers connection errors and timeouts alike.
        print(f"Request failed: {e}")
        return ""

    if response.status_code != 200:
        print(f"Failed to fetch {url} with status code {response.status_code}")
        return ""

    soup = BeautifulSoup(response.text, 'html.parser')
    return ' '.join(p.text for p in soup.find_all('p'))

# Function to check if text contains any keywords
def contains_keywords(text, keywords):
    """Report which keywords occur in ``text``.

    Matching is case-insensitive and substring-based, so a keyword also
    matches inside a longer word.

    Args:
        text: Text to scan. ``None`` is treated like an empty string.
        keywords: Iterable of keyword strings to look for.

    Returns:
        The matched keywords, in their original spelling and order, joined
        with ``", "``; or the string ``"No match"`` when none occur.
    """
    # Lower-case the (possibly large) text once instead of once per keyword.
    haystack = (text or "").lower()
    matched_keywords = [kw for kw in keywords if kw.lower() in haystack]
    return ", ".join(matched_keywords) if matched_keywords else "No match"

# Function to analyze text with GPT-4
def analyze_text_with_gpt(text, company_name, max_chars=15000):
    """Ask GPT-4 whether ``text`` is related to ``company_name``.

    Args:
        text: Page text to send as the user message.
        company_name: Company named in the system prompt.
        max_chars: Upper bound on the number of characters of ``text`` that
            are sent. Text beyond this is dropped so that a very long page
            does not make the request fail for exceeding the model's input
            limit. Pass ``None`` to send the text unmodified.
            NOTE(review): 15000 is a conservative guess for gpt-4's context
            window; tune it for the model actually used.

    Returns:
        The model's reply with surrounding whitespace stripped, or the
        string ``"Analysis failed."`` when the OpenAI call raises
        ``openai.error.OpenAIError`` (the error is printed).
    """
    if max_chars is not None:
        text = text[:max_chars]

    # NOTE(review): `openai.ChatCompletion` and `openai.error` belong to the
    # pre-1.0 interface of the openai package; confirm the installed version.
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": f"Determine if the following text is related to {company_name}."},
                {"role": "user", "content": text}
            ],
            max_tokens=150
        )
        return response['choices'][0]['message']['content'].strip()
    except openai.error.OpenAIError as e:
        print(f"OpenAI API error: {e}")
        return "Analysis failed."

# Main process: search for the company, analyse every hit, save a CSV.
company_name = 'SWEDBANK BALTICS, AS'
results = google_search(company_name, google_api_key, google_cse_id, num_pages=10)

# One dict per search hit. The DataFrame is built once after the loop rather
# than grown with pd.concat on every iteration, which re-copies all previous
# rows each time and starts from an empty frame.
rows = []

for result in results:
    link = result['link']

    # Extract text from each URL
    extracted_text = extract_text_from_url(link)

    # Analyze extracted text with GPT-4 to determine relevance to the company
    if extracted_text:
        gpt_analysis = analyze_text_with_gpt(extracted_text, company_name)
        related_keywords = contains_keywords(extracted_text, keywords)
    else:
        gpt_analysis = "No text extracted."
        related_keywords = "No text extracted."

    rows.append({
        'company': company_name,
        'url': link,
        'extracted_text': extracted_text,
        'gpt_analysis': gpt_analysis,
        'related_keywords': related_keywords,  # Matched keywords
    })

    # Delay to avoid rate limits
    time.sleep(1)  # Adjust delay if necessary

# Passing `columns` keeps the header (and column order) even when there were
# no search results at all.
data = pd.DataFrame(
    rows,
    columns=['company', 'url', 'extracted_text', 'gpt_analysis', 'related_keywords'],
)

# Save the results to a CSV file
data.to_csv('company_analysis_results.csv', index=False)
print(data)
