# Web Scraping

In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_website_data(url, output_path="website_data.csv", timeout=10):
    """Scrape the event cards from ``url`` and save them to a CSV file.

    The page is searched for a ``<section class="events">`` element. Inside it,
    every ``<div class="card">`` contributes one record: the text of its first
    ``<h3>`` becomes ``brand`` and the text of its first ``<p>`` becomes
    ``description`` (``None`` when the tag is missing).

    Args:
        url: Address of the page to download.
        output_path: CSV file to write. Defaults to the file name the notebook
            has always used.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        list[dict]: One ``{"brand": ..., "description": ...}`` dict per card;
        empty when the page has no events section or no cards.

    Raises:
        requests.exceptions.RequestException: On connection errors, timeouts,
            or a 4xx/5xx response.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of scraping the error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # ---------- EVENTS SECTION ----------
    events_section = soup.find("section", class_="events")
    events_data = []

    if events_section:
        for card in events_section.find_all("div", class_="card"):
            brand = card.find("h3")
            desc = card.find("p")

            events_data.append({
                "brand": brand.get_text(strip=True) if brand else None,
                "description": desc.get_text(strip=True) if desc else None
            })

    # Save to CSV. Naming the columns keeps the header row even when nothing
    # was scraped, so the file can still be read back as an empty table.
    events_df = pd.DataFrame(events_data, columns=["brand", "description"])
    events_df.to_csv(output_path, index=False, encoding='utf-8')
    print(f" Data saved to '{output_path}' successfully!")

    return events_data


# Run the scraper
# `url` and `final_data` become notebook-level names; scrape_website_data also
# writes website_data.csv as a side effect.
url = 'https://iaryanyadav.github.io/project_1/'
final_data = scrape_website_data(url)

# Optional: print preview
# Prints the full list of {"brand", "description"} dicts returned above.
print(final_data)


 Data saved to 'website_data.csv' successfully!
[{'brand': 'CocaCola', 'description': 'During the Fashion Week, CocaCola sponsored exclusive discounts on Refrigerator. Customers rushed to grab the best deals from CocaCola as part of this e-commerce event.'}, {'brand': 'Samsung', 'description': 'During the Tech Summit, Samsung sponsored exclusive discounts on Smart Watch. Customers rushed to grab the best deals from Samsung as part of this e-commerce event.'}, {'brand': 'Nike', 'description': 'During the Summer Sale, Nike sponsored exclusive discounts on Running Shoes. Customers rushed to grab the best deals from Nike as part of this e-commerce event.'}, {'brand': 'Microsoft', 'description': 'During the Cyber Monday, Microsoft sponsored exclusive discounts on Surface Laptop. Customers rushed to grab the best deals from Microsoft as part of this e-commerce event.'}, {'brand': 'Sony', 'description': 'During the New Year Sale, Sony sponsored exclusive discounts on Headphones. Customers rus

# Model Training

In [6]:
import nltk
# Tokenizer resources (the training cell calls word_tokenize on each description).
nltk.download('punkt')
nltk.download('punkt_tab')
# WordNet resources (the training cell lemmatizes with WordNetLemmatizer).
nltk.download('wordnet')
nltk.download('omw-1.4')   # Optional, improves lemmatization


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [7]:
# ---------- Sponsor Model Training (Text Data) ----------
import pandas as pd
import numpy as np
import string
import joblib
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# ----------------------------------------------
# 1Ô∏è Text Preprocessing Helpers
# ----------------------------------------------
lemmatizer = WordNetLemmatizer()

def lemmatize_word(word, lemmatizer):
    """Return the first lemma of ``word`` that differs from ``word`` itself.

    The part-of-speech tags 'n', 'v', 'a', 'r', 's' are tried in that order and
    the search stops at the first tag whose lemma is not equal to the input.
    When no tag changes the word, the lemmatizer's default-tag result is returned.

    Args:
        word: Token to lemmatize.
        lemmatizer: Object exposing ``lemmatize(word, pos)``.

    Returns:
        The lemmatized token.
    """
    unchanged = object()  # sentinel: no tag produced a different form
    attempts = (lemmatizer.lemmatize(word, tag) for tag in ('n', 'v', 'a', 'r', 's'))
    first_change = next((lemma for lemma in attempts if lemma != word), unchanged)
    if first_change is unchanged:
        return lemmatizer.lemmatize(word)
    return first_change

def normalize_text(tokens):
    """Lowercase tokens, drop punctuation-only tokens, and lemmatize the rest.

    Args:
        tokens: Iterable of string tokens (the notebook passes the output of
            ``word_tokenize``).

    Returns:
        list[str]: Lowercased, lemmatized tokens in their original order, with
        every token made up solely of ``string.punctuation`` characters removed.
    """
    punctuation_chars = set(string.punctuation)
    normalized = []
    for token in tokens:
        lowered = token.lower()
        # `lowered in string.punctuation` is a substring test, so it only drops
        # tokens that happen to appear contiguously in that constant and keeps
        # multi-character ones such as "..." or "--". Checking every character
        # removes all punctuation-only tokens (and, as before, empty strings).
        if all(char in punctuation_chars for char in lowered):
            continue
        normalized.append(lemmatize_word(lowered, lemmatizer))
    return normalized

# ----------------------------------------------
# 2Ô∏è Load Your Dataset
# ----------------------------------------------
# Expected CSV columns: brand_name, description, status
df = pd.read_csv('sponsor_detection.csv')
# Class balance is reported on the raw file, before incomplete rows are dropped.
print(df['status'].value_counts())

# Drop rows that lack either the text or the label
df = df.dropna(subset=['description', 'status'])

# Tokenize each description, normalize the tokens, then re-join them with spaces
# so the TF-IDF vectorizer receives one plain string per row.
df['tokenized_text'] = df['description'].apply(
    lambda text: normalize_text(word_tokenize(text))
)
df['joined_text'] = df['tokenized_text'].map(' '.join)

# ----------------------------------------------
# 3Ô∏è TF-IDF Feature Extraction
# ----------------------------------------------
# Target labels: the 'status' column. The value counts printed above show the
# string labels 'sponsor' and 'non-sponsor'.
y = df['status']

# ----------------------------------------------
# 4. Split Data, then fit TF-IDF on the training part only
# ----------------------------------------------
# Fitting the vectorizer before splitting lets the held-out documents shape the
# vocabulary and IDF weights, which leaks test information into training and
# inflates the scores. Split the text first; stratify to keep the label ratio.
text_train, text_test, y_train, y_test = train_test_split(
    df['joined_text'], y, test_size=0.2, random_state=42, stratify=y
)

tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train = tfidf_vectorizer.fit_transform(text_train)   # learn vocabulary + IDF here only
X_test = tfidf_vectorizer.transform(text_test)         # reuse the fitted vocabulary

# Kept so the full-corpus matrix is still available under its original name.
X_tfidf = tfidf_vectorizer.transform(df['joined_text'])

# ----------------------------------------------
# 5. Train Random Forest Model
# ----------------------------------------------
# The classifier fitted here is a RandomForestClassifier; LogisticRegression is
# imported at the top of this cell but is not used.
sponsor_model = RandomForestClassifier(
    n_estimators=200,         # number of trees in the forest
    random_state=42,          # for reproducibility
    class_weight='balanced'   # handle class imbalance
)

sponsor_model.fit(X_train, y_train)

# ----------------------------------------------
# 6Ô∏è Evaluate Model
# ----------------------------------------------
y_pred = sponsor_model.predict(X_test)

# Accuracy takes no averaging mode; the other three use weighted averaging.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# One print call, one line per metric.
print(
    " Sponsor Model Performance:",
    f"Accuracy : {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall   : {recall:.4f}",
    f"F1 Score : {f1:.4f}",
    sep="\n",
)

# ----------------------------------------------
# 7Ô∏è Save Model & Vectorizer
# ----------------------------------------------
# The classifier and the fitted vectorizer are written as a pair; the later
# prediction and tracking cells load both files by these exact names.
joblib.dump(sponsor_model, "sponsor_model.pkl")
joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.pkl")

print("\n Model and vectorizer saved successfully!")


status
sponsor        5008
non-sponsor    4992
Name: count, dtype: int64
 Sponsor Model Performance:
Accuracy : 1.0000
Precision: 1.0000
Recall   : 1.0000
F1 Score : 1.0000

 Model and vectorizer saved successfully!


# Prediction

In [8]:
# ---------- Sponsor Prediction ----------
import joblib
import pandas as pd
from nltk.tokenize import word_tokenize

# Load saved model & vectorizer.
# joblib.load unpickles the files, so only load artifacts this notebook wrote.
sponsor_model = joblib.load("sponsor_model.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")

# Example DataFrame with text column
df = pd.DataFrame({
    'text': [
        "Powered by Intel ‚Äî bringing innovation to every gamer.",
        "NASA launched a satellite to study climate impact."
    ]
})

# The vectorizer was fitted on text that had been tokenized, normalized with
# normalize_text (defined in the training cell) and re-joined with spaces.
# Apply the same steps here so the features match what the model was trained on.
df['joined_text'] = df['text'].apply(
    lambda text: ' '.join(normalize_text(word_tokenize(text)))
)

# Transform text and predict
X_tfidf = vectorizer.transform(df['joined_text'])
df['is_sponsor'] = sponsor_model.predict(X_tfidf)

# Show only the input text and the predicted label.
print(df[['text', 'is_sponsor']])


                                                text   is_sponsor
0  Powered by Intel ‚Äî bringing innovation to ever...      sponsor
1  NASA launched a satellite to study climate imp...  non-sponsor


# Restart

In [None]:
import pandas as pd
import datetime
import time
import requests
from bs4 import BeautifulSoup
import joblib
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# ==============================================================
# 1Ô∏è Load model and vectorizer
# ==============================================================
# joblib.load unpickles these files; only load artifacts written by the
# training cell of this notebook.
model = joblib.load("sponsor_model.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")

# ==============================================================
# 2. Brand keyword list
# ==============================================================
# Names looked up by extract_brand (in this order) and used by scrape_website
# to decide which card texts are kept.
BRANDS = [
    "Apple", "Samsung", "Nike", "Adidas", "Sony",
    "Microsoft", "Toyota", "CocaCola", "Pepsi", "Amazon",
    "Google", "Tesla", "Intel", "Puma", "Oppo", "Vivo"
]

def extract_brand(text, brands=None):
    """Return the first known brand that appears in ``text`` as a whole word.

    Matching is case-insensitive. Brands are checked in list order, so the
    result is the first brand of the list that occurs, not the first one
    occurring in the text.

    Args:
        text: Text to search. Anything that is not a string yields "Other".
        brands: Optional iterable of brand names to look for. Defaults to the
            module-level ``BRANDS`` list.

    Returns:
        str: The matching brand exactly as spelled in the brand list, or
        "Other" when none is found.
    """
    if not isinstance(text, str):
        return "Other"
    candidates = BRANDS if brands is None else brands
    for brand in candidates:
        # re.escape keeps names containing regex metacharacters literal. The
        # lookarounds behave like \b for names that start and end with a word
        # character and still work for names that do not.
        pattern = rf"(?<!\w){re.escape(brand)}(?!\w)"
        if re.search(pattern, text, re.IGNORECASE):
            return brand
    return "Other"

# ==============================================================
# 3Ô∏è Scrape only relevant text
# ==============================================================
def scrape_website(url):
    """Download ``url`` and return the card texts that look brand-related.

    Every ``<div class="card">`` that contains a ``<p>`` contributes its full
    stripped text. A text is kept when it contains, case-insensitively, any
    name from ``BRANDS`` or the substrings "sponsor" or "brand".

    Args:
        url: Page to scrape.

    Returns:
        list[str]: The kept card texts in page order. An empty list is returned
        both when nothing matched and when the request or parsing failed (the
        error is printed, not raised).
    """
    try:
        response = requests.get(url, timeout=10)
        # Report 4xx/5xx answers as errors instead of parsing the error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        card_texts = [
            card.get_text(strip=True)
            for card in soup.find_all("div", class_="card")
            if card.find("p")
        ]

        # Lowercase the keywords once, and each card text once.
        keywords = [brand.lower() for brand in BRANDS] + ["sponsor", "brand"]
        relevant = []
        for text in card_texts:
            lowered = text.lower()
            if any(keyword in lowered for keyword in keywords):
                relevant.append(text)
        return relevant
    except Exception as e:
        # Best effort: the monitoring loop must survive a failed check.
        print(f" Error scraping {url}: {e}")
        return []
# ==============================================================
# 4Ô∏è Predict sponsorship
# ==============================================================
def predict_sponsorship(texts):
    """Predict a label for each text with the loaded vectorizer and model.

    Args:
        texts: Strings to classify. Lists, tuples, NumPy arrays, pandas Series
            and one-shot iterables are accepted; ``None`` counts as empty.

    Returns:
        ``[]`` for empty input, otherwise whatever ``model.predict`` returns
        for the vectorized texts.
    """
    # NOTE(review): the training cell lemmatizes text before fitting the
    # vectorizer, while the text here is vectorized as-is - confirm intended.
    if texts is None:
        return []
    if not hasattr(texts, "__len__"):
        texts = list(texts)  # materialize generators so they can be sized
    # len() instead of `not texts`: truth-testing an array or Series raises.
    if len(texts) == 0:
        return []
    X_new = vectorizer.transform(texts)
    return model.predict(X_new)

# ==============================================================
# 5Ô∏è‚É£ Load or create activity log
# ==============================================================
# Resume from the log written by a previous run; otherwise start with an empty
# table that has the same columns track_website writes.
try:
    activity_log = pd.read_csv("website_activity_log.csv")
except FileNotFoundError:
    activity_log = pd.DataFrame(columns=[
        "brand", "description", "is_sponsored",
        "url", "active_since", "active_until"
    ])

# ==============================================================
# 6Ô∏è‚É£ Main tracking function
# ==============================================================
def track_website(url):
    """Scrape ``url`` once and reconcile the result with the global activity log.

    Texts found now that are not currently active for this URL are appended as
    new rows (a text that reappears after removal gets a fresh row). Texts that
    were active but are no longer on the page get ``active_until`` set. The log
    is then written to ``website_activity_log.csv``.

    Args:
        url: Page to check.

    Returns:
        None. The function updates the module-level ``activity_log`` and the
        CSV file, and prints progress messages.
    """
    global activity_log
    print(f"\n Checking: {url} at {datetime.datetime.now()}")

    # Step 1: Scrape + Predict
    current_texts = scrape_website(url)
    if not current_texts:
        # NOTE(review): scrape_website returns [] both on failure and when the
        # page has no relevant cards, so entries that are still marked active
        # are left open here rather than closed.
        print(" No relevant brand/sponsor content found.")
        return

    current_df = pd.DataFrame({
        "brand": [extract_brand(t) for t in current_texts],
        "description": current_texts,
        "is_sponsored": predict_sponsorship(current_texts),
        "url": [url] * len(current_texts)
    })

    # Step 2: Remove duplicates
    current_df.drop_duplicates(subset=["url", "description"], inplace=True)
    activity_log.drop_duplicates(subset=["url", "description", "active_since"], inplace=True)
    # A log read back from CSV with no closed entries has an all-NaN float
    # column; make it object so timestamp strings can be stored in it.
    activity_log["active_until"] = activity_log["active_until"].astype(object)

    # Step 3: Determine active and inactive sets
    existing_entries = activity_log[activity_log["url"] == url]
    # Texts that are currently active (not yet marked as removed)
    active_old = set(existing_entries.loc[existing_entries["active_until"].isna(), "description"])
    new_texts = set(current_df["description"])
    # Removed = currently active ones not seen now
    removed = active_old - new_texts

    # One timestamp per check so all changes of this run share it.
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Step 4: Add new entries (always as new rows, even if reappeared).
    # Rows are taken from current_df in page order, reusing the brand and
    # prediction computed above, and appended with a single concat.
    added_rows = current_df[~current_df["description"].isin(active_old)]
    if not added_rows.empty:
        new_entries = added_rows.assign(active_since=timestamp, active_until=None)
        activity_log = pd.concat([activity_log, new_entries], ignore_index=True)
        for brand, text in zip(new_entries["brand"], new_entries["description"]):
            print(f"üÜï Added new text for {brand}: {text[:70]}...")

    # Step 5: Mark removed ones
    for text in removed:
        mask = (
            (activity_log["url"] == url)
            & (activity_log["description"] == text)
            & (activity_log["active_until"].isna())
        )
        if mask.any():
            activity_log.loc[mask, "active_until"] = timestamp
            print(f" Marked removed text: {text[:70]}...")

    # Step 6: Save clean CSV
    activity_log.to_csv("website_activity_log.csv", index=False)
    print(" Log updated and saved cleanly!")

# ==============================================================
# 7Ô∏è Continuous monitoring loop
# ==============================================================
urls_to_track = ["https://iaryanyadav.github.io/project_1/"]

# This loop never ends on its own, so no later cell runs until it is stopped.
# Stop it with Kernel -> Interrupt; the interrupt is caught so the cell ends
# with a message instead of a traceback.
try:
    while True:
        for url in urls_to_track:
            track_website(url)

        print("‚è≥ Sleeping for 10 seconds before next check...\n")
        time.sleep(10)
except KeyboardInterrupt:
    print("Monitoring stopped.")



 Checking: https://iaryanyadav.github.io/project_1/ at 2025-11-05 06:28:20.310628
üÜï Added new text for Nike: ‚òÄÔ∏è Summer SaleRunning ShoesNikeDuring the Summer Sale, Nike sponsored ...
üÜï Added new text for Microsoft: üíª Cyber MondaySurface LaptopMicrosoftDuring the Cyber Monday, Microsof...
üÜï Added new text for CocaCola: üéâ Fashion WeekRefrigeratorCocaColaDuring the Fashion Week, CocaCola sp...
üÜï Added new text for Sony: üéÜ New Year SaleHeadphonesSonyDuring the New Year Sale, Sony sponsored ...
üÜï Added new text for Samsung: üì± Tech SummitSmart WatchSamsungDuring the Tech Summit, Samsung sponsor...
 Log updated and saved cleanly!
‚è≥ Sleeping for 10 seconds before next check...


 Checking: https://iaryanyadav.github.io/project_1/ at 2025-11-05 06:28:30.496029
 Log updated and saved cleanly!
‚è≥ Sleeping for 10 seconds before next check...


 Checking: https://iaryanyadav.github.io/project_1/ at 2025-11-05 06:28:40.534531
 Log updated and saved cleanly!
‚è≥ Sle

# Extract text from image

In [None]:
import os

# Report whether the two files loaded by the prediction cells are present in
# the working directory.
for label, artifact_path in (
    ("Model exists:", "sponsor_model.pkl"),
    ("Vectorizer exists:", "tfidf_vectorizer.pkl"),
):
    print(label, os.path.exists(artifact_path))


In [None]:
# %pip (not !pip) installs into the environment of the running kernel.
%pip install easyocr

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin   # ‚úÖ Needed for full image URLs
import pandas as pd                # ‚úÖ Needed to save CSV

# Step 1: Target webpage URL
url = "https://iaryanyadav.github.io/project_1/"

# Step 2: Get the HTML content of the page.
# The timeout matches the other fetches in this notebook and keeps the cell
# from hanging on an unresponsive server.
response = requests.get(url, timeout=10)
response.raise_for_status()  # stop here on a 4xx/5xx answer

# Step 3: Parse HTML using BeautifulSoup
soup = BeautifulSoup(response.text, "html.parser")

# Step 4: Find all <img> tags
image_tags = soup.find_all("img")

# Step 5: Extract image URLs; urljoin resolves relative src values against the
# page URL and leaves absolute ones unchanged. Tags without src are skipped.
image_links = [
    urljoin(url, img.get("src"))
    for img in image_tags
    if img.get("src")
]

# Step 6: Save all image URLs to a CSV file.
# Nothing is written when no image was found, so an older image_links.csv
# (if any) stays in place.
if image_links:
    df = pd.DataFrame({"image_url": image_links})
    df.to_csv("image_links.csv", index=False)
    print(f"‚úÖ {len(image_links)} image links saved to image_links.csv")
else:
    print("‚ö† No image URLs found on the page.")


In [None]:
import pandas as pd

# Load the image URLs written by the previous cell
df = pd.read_csv('image_links.csv')

# Display the first few rows of the DataFrame
display(df.head())

# df.info() prints its summary itself and returns None, so wrapping it in
# display() only adds a stray "None" to the output.
df.info()

In [None]:
import easyocr
import requests
from PIL import Image
from io import BytesIO
import numpy as np
import pandas as pd

# Load your CSV file (if not already loaded)
df = pd.read_csv("image_links.csv")

# Initialize the EasyOCR reader
reader = easyocr.Reader(['en'])

# Create a new column for extracted text
df['extracted_text'] = None

# Loop through each image URL and extract text
for index, row in df.iterrows():
    url = row['image_url']
    print(f"Processing image: {url}")
    try:
        # Only wikimedia.org URLs get a browser-like User-Agent; headers=None
        # makes requests send its default headers, as the plain call did.
        headers = {'User-Agent': 'Mozilla/5.0'} if "wikimedia.org" in url else None
        # The timeout keeps one unresponsive host from stalling the whole loop;
        # a timeout is reported through the RequestException branch below.
        response = requests.get(url, headers=headers, timeout=10)

        response.raise_for_status()

        try:
            # First attempt: hand the raw bytes to EasyOCR.
            result = reader.readtext(response.content)
        except Exception:
            # Fallback: decode with Pillow and pass the pixel array instead.
            img = Image.open(BytesIO(response.content))
            img_np = np.array(img)
            result = reader.readtext(img_np)

        # Store the extracted text
        if result:
            extracted_texts = [text for (bbox, text, prob) in result]
            df.at[index, 'extracted_text'] = ", ".join(extracted_texts)
            print(f"  Extracted Text: {', '.join(extracted_texts)}")
        else:
            df.at[index, 'extracted_text'] = "No text found"
            print("  No text found in this image.")

    except requests.exceptions.RequestException as e:
        df.at[index, 'extracted_text'] = f"Error fetching image: {e}"
        print(f"  Error fetching image: {e}")
    except Exception as e:
        df.at[index, 'extracted_text'] = f"Error processing image: {e}"
        print(f"  Error processing image: {e}")

    print("-" * 20)

# Save results
df.to_csv("image_links_with_text.csv", index=False)
print("‚úÖ Extracted text saved to image_links_with_text.csv")

# Display the DataFrame
display(df)


In [None]:
import pandas as pd
import joblib
import re

# Load CSV and model/vectorizer
# joblib.load unpickles the files, so only load artifacts this notebook wrote.
df = pd.read_csv("image_links_with_text.csv")
model = joblib.load("sponsor_model.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")

# Probability above which an image is reported as sponsored (slightly relaxed
# compared with the usual 0.5).
SPONSOR_THRESHOLD = 0.4

# Clean text before transforming: lowercase, keep only a-z and whitespace
df["clean_text"] = (
    df["extracted_text"]
    .fillna("")
    .str.lower()
    .apply(lambda x: re.sub(r'[^a-z\s]', '', x))
)

# The OCR cell stores these placeholder strings when it found no text or hit an
# error. They are not image content, so such rows are never reported as
# sponsored, and neither are rows whose cleaned text is empty.
raw_text = df["extracted_text"].fillna("").astype(str)
is_placeholder = (
    raw_text.eq("No text found")
    | raw_text.str.startswith("Error fetching image:")
    | raw_text.str.startswith("Error processing image:")
)
has_text = df["clean_text"].str.strip().ne("") & ~is_placeholder

# Transform and predict
X = vectorizer.transform(df["clean_text"])
# predict_proba columns follow model.classes_; look the 'sponsor' column up by
# label and fall back to the previously hard-coded column 1 if it is absent.
class_labels = list(model.classes_)
sponsor_column = class_labels.index("sponsor") if "sponsor" in class_labels else 1
probs = model.predict_proba(X)[:, sponsor_column]

df["is_sponsor"] = [
    "Yes" if usable and p > SPONSOR_THRESHOLD else "No"
    for usable, p in zip(has_text, probs)
]

# Save final results
df.to_csv("image_links_with_text_and_prediction.csv", index=False)
print("‚úÖ Improved predictions saved to image_links_with_text_and_prediction.csv")


In [None]:
import pandas as pd
import datetime
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import joblib
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# ==============================================================
# 1Ô∏è‚É£ Load model and vectorizer
# ==============================================================
# joblib.load unpickles these files; only load artifacts written by the
# training cell of this notebook.
model = joblib.load("sponsor_model.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")

# ==============================================================
# 2. Brand keyword list
# ==============================================================
# Names looked up, in this order, by extract_brand.
BRANDS = [
    "Apple", "Samsung", "Nike", "Adidas", "Sony",
    "Microsoft", "Toyota", "CocaCola", "Pepsi", "Amazon",
    "Google", "Tesla", "Intel", "Puma", "Oppo", "Vivo"
]

def extract_brand(text, brands=None):
    """Return the first known brand that appears in ``text`` as a whole word.

    Matching is case-insensitive. Brands are checked in list order, so the
    result is the first brand of the list that occurs, not the first one
    occurring in the text.

    Args:
        text: Text to search. Anything that is not a string yields "Other".
        brands: Optional iterable of brand names to look for. Defaults to the
            module-level ``BRANDS`` list.

    Returns:
        str: The matching brand exactly as spelled in the brand list, or
        "Other" when none is found.
    """
    if not isinstance(text, str):
        return "Other"
    candidates = BRANDS if brands is None else brands
    for brand in candidates:
        # re.escape keeps names containing regex metacharacters literal. The
        # lookarounds behave like \b for names that start and end with a word
        # character and still work for names that do not.
        pattern = rf"(?<!\w){re.escape(brand)}(?!\w)"
        if re.search(pattern, text, re.IGNORECASE):
            return brand
    return "Other"

# ==============================================================
# 3Ô∏è‚É£ Scrape image URLs + extract alt text
# ==============================================================
def scrape_images(url):
    """Download ``url`` and list its images with their alt text.

    Args:
        url: Page to scrape.

    Returns:
        pandas.DataFrame: Columns ``image_url`` (the ``src`` resolved against
        ``url``) and ``alt_text`` (the ``alt`` attribute, "" when missing), one
        row per ``<img>`` tag that has a ``src``. The frame is empty, with the
        same two columns, when the page has no images or when the request or
        parsing failed (the error is printed, not raised).
    """
    columns = ["image_url", "alt_text"]
    try:
        response = requests.get(url, timeout=10)
        # Report 4xx/5xx answers as errors instead of parsing the error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        images = [
            {"image_url": urljoin(url, img.get("src")), "alt_text": img.get("alt", "")}
            for img in soup.find_all("img")
            if img.get("src")
        ]
        # Naming the columns keeps the schema stable even when no image matched.
        return pd.DataFrame(images, columns=columns)
    except Exception as e:
        # Best effort: the monitoring loop must survive a failed check.
        print(f"‚ö†Ô∏è Error scraping {url}: {e}")
        return pd.DataFrame(columns=columns)

# ==============================================================
# 4Ô∏è‚É£ Predict sponsorship
# ==============================================================
def predict_sponsorship(texts):
    """Predict a label for each text with the loaded vectorizer and model.

    Args:
        texts: Strings to classify. Lists, tuples, NumPy arrays, pandas Series
            and one-shot iterables are accepted; ``None`` counts as empty.

    Returns:
        ``[]`` for empty input, otherwise whatever ``model.predict`` returns
        for the vectorized texts.
    """
    # NOTE(review): the training cell lemmatizes text before fitting the
    # vectorizer, while the text here is vectorized as-is - confirm intended.
    if texts is None:
        return []
    if not hasattr(texts, "__len__"):
        texts = list(texts)  # materialize generators so they can be sized
    # len() instead of `not texts`: truth-testing an array or Series raises.
    if len(texts) == 0:
        return []
    X_new = vectorizer.transform(texts)
    return model.predict(X_new)

# ==============================================================
# 5Ô∏è‚É£ Load or create activity log
# ==============================================================
# Resume from the log written by a previous run; otherwise start with an empty
# table that has the same columns track_images writes.
try:
    image_log = pd.read_csv("image_activity_log.csv")
except FileNotFoundError:
    image_log = pd.DataFrame(columns=[
        "brand", "image_url", "text", "is_sponsored",
        "url", "active_since", "active_until"
    ])

# ==============================================================
# 6Ô∏è‚É£ Main tracking function
# ==============================================================
def track_images(url):
    """Scrape the images of ``url`` once and reconcile them with the global image log.

    Classification uses each image's lowercased alt text only. Images found
    now that are not currently active for this URL are appended as new rows;
    images that were active but are no longer on the page get ``active_until``
    set. The log is then written to ``image_activity_log.csv``.

    Args:
        url: Page to check.

    Returns:
        None. The function updates the module-level ``image_log`` and the CSV
        file, and prints progress messages.
    """
    global image_log
    print(f"\nüîç Checking images at: {url} ‚Äî {datetime.datetime.now()}")

    # Step 1: Scrape images
    current_images = scrape_images(url)
    if current_images.empty:
        # NOTE(review): scrape_images returns an empty frame both on failure
        # and when the page has no images, so entries that are still marked
        # active are left open here rather than closed.
        print("‚ö†Ô∏è No images found on the page.")
        return

    # Step 2: Clean text and predict
    current_images["clean_text"] = current_images["alt_text"].fillna("").str.lower()
    current_images["brand"] = current_images["clean_text"].apply(extract_brand)
    current_images["is_sponsored"] = predict_sponsorship(current_images["clean_text"].tolist())

    current_images["url"] = url

    # Step 3: Remove duplicates
    current_images.drop_duplicates(subset=["url", "image_url"], inplace=True)
    image_log.drop_duplicates(subset=["url", "image_url", "active_since"], inplace=True)
    # A log read back from CSV with no closed entries has an all-NaN float
    # column; make it object so timestamp strings can be stored in it.
    image_log["active_until"] = image_log["active_until"].astype(object)

    # Step 4: Determine active/inactive sets
    existing_entries = image_log[image_log["url"] == url]
    active_old = set(existing_entries.loc[existing_entries["active_until"].isna(), "image_url"])
    new_set = set(current_images["image_url"])
    removed = active_old - new_set

    # One timestamp per check so all changes of this run share it.
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Step 5: Add new entries in page order with a single concat
    added_rows = current_images[~current_images["image_url"].isin(active_old)]
    if not added_rows.empty:
        new_entries = (
            added_rows
            .rename(columns={"clean_text": "text"})
            [["brand", "image_url", "text", "is_sponsored", "url"]]
            .assign(active_since=timestamp, active_until=None)
        )
        image_log = pd.concat([image_log, new_entries], ignore_index=True)
        for brand, img_url in zip(new_entries["brand"], new_entries["image_url"]):
            print(f"üÜï Added new image for {brand}: {img_url}")

    # Step 6: Mark removed ones
    for img_url in removed:
        mask = (
            (image_log["url"] == url)
            & (image_log["image_url"] == img_url)
            & (image_log["active_until"].isna())
        )
        if mask.any():
            image_log.loc[mask, "active_until"] = timestamp
            print(f"‚ùå Marked removed image: {img_url}")

    # Step 7: Save clean CSV
    image_log.to_csv("image_activity_log.csv", index=False)
    print("‚úÖ Image activity log updated and saved.")

# ==============================================================
# 7Ô∏è‚É£ Continuous monitoring loop
# ==============================================================
urls_to_track = ["https://iaryanyadav.github.io/project_1/"]

# This loop never ends on its own, so no later cell runs until it is stopped.
# Stop it with Kernel -> Interrupt; the interrupt is caught so the cell ends
# with a message instead of a traceback.
try:
    while True:
        for url in urls_to_track:
            track_images(url)

        print("‚è≥ Sleeping for 10 seconds before next check...\n")
        time.sleep(10)
except KeyboardInterrupt:
    print("Monitoring stopped.")
