In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
import pandas as pd
import os


In [2]:
chrome_path = "chromedriver.exe"
service = Service(executable_path=chrome_path)
driver = webdriver.Chrome(service=service)

In [4]:
#Define Generic Scraping Function
def scrape_website(url, site_name, sleep_time=5):
    print(f" Scraping {site_name}...")

    driver.get(url)
    time.sleep(sleep_time)

    elements = driver.find_elements(By.XPATH, "//*")
    data = []

    for el in elements:
        try:
            text = el.text.strip()
            if text and len(text.split()) <= 20:
                data.append(text)
        except:
            continue

    # Save scraped data
    os.makedirs("data/raw", exist_ok=True)
    df = pd.DataFrame({"text": list(set(data))})
    df.to_csv(f"data/raw/{site_name.lower()}_raw.csv", index=False)
    print(f" {site_name} scraping complete! {len(df)} items saved.\n")


In [5]:
#Run Scraping for 3 Sites
scrape_website("https://www.amazon.in", "Amazon")
scrape_website("https://www.ajio.com", "Ajio")
scrape_website("https://www.nykaa.com", "Nykaa")

driver.quit()


 Scraping Amazon...
 Amazon scraping complete! 207 items saved.

 Scraping Ajio...
 Ajio scraping complete! 251 items saved.

 Scraping Nykaa...
 Nykaa scraping complete! 60 items saved.



In [20]:
# Function to automatically label text based on keywords
def auto_label(text):
    promo_keywords = [
        "offer", "sale", "deal", "discount", "off", "upto", "%", 
        "limited time", "save", "lowest price", "exclusive"
    ]
    text_lower = str(text).lower()
    return "Promotion" if any(keyword in text_lower for keyword in promo_keywords) else "Non-Promotion"



In [21]:
def auto_label_site(site_name):
    print(f" Auto-labeling {site_name}...")
    os.makedirs("data/labeled", exist_ok=True)
    
    # Load raw scraped data
    df = pd.read_csv(f"data/raw/{site_name.lower()}_raw.csv")
    df.dropna(inplace=True)
    df = df[df['text'].str.strip() != ""]
    
    # Apply label as text
    df['label'] = df['text'].apply(auto_label)
    
    # Save labeled data
    df.to_csv(f"data/labeled/{site_name.lower()}_labeled.csv", index=False)
    print(f" Labeled data saved to data/labeled/{site_name.lower()}_labeled.csv")


In [22]:
auto_label_site("Amazon")
auto_label_site("Ajio")
auto_label_site("Nykaa")


 Auto-labeling Amazon...
 Labeled data saved to data/labeled/amazon_labeled.csv
 Auto-labeling Ajio...
 Labeled data saved to data/labeled/ajio_labeled.csv
 Auto-labeling Nykaa...
 Labeled data saved to data/labeled/nykaa_labeled.csv


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib
import os


#Load Labeled Data
df_amazon = pd.read_csv("data/labeled/amazon_labeled.csv")
df_ajio = pd.read_csv("data/labeled/ajio_labeled.csv")
df_nykaa = pd.read_csv("data/labeled/nykaa_labeled.csv")

#Combine all data
df = pd.concat([df_amazon, df_ajio, df_nykaa], ignore_index=True)
df.dropna(subset=['text', 'label'], inplace=True)
df['text'] = df['text'].astype(str)
df['label'] = df['label'].astype(int)


In [8]:
#Split into train and test sets
X = df['text']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

#TF-IDF Vectorization
vectorizer = TfidfVectorizer(ngram_range=(1,2), stop_words='english', max_features=1000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [9]:
#Train the Classifier
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

#Evaluate
y_pred = model.predict(X_test_tfidf)

print("\n Classification Report:\n")
print(classification_report(y_test, y_pred))



 Classification Report:

              precision    recall  f1-score   support

           0       0.67      0.97      0.80        34
           1       0.86      0.27      0.41        22

    accuracy                           0.70        56
   macro avg       0.77      0.62      0.60        56
weighted avg       0.75      0.70      0.65        56



In [10]:
#Save model and vectorizer
os.makedirs("models", exist_ok=True)
joblib.dump(model, "models/promotion_classifier.pkl")
joblib.dump(vectorizer, "models/tfidf_vectorizer.pkl")
print("Model and vectorizer saved to /models/")


Model and vectorizer saved to /models/


In [11]:
import joblib

model = joblib.load("models/promotion_classifier.pkl")
vectorizer = joblib.load("models/tfidf_vectorizer.pkl")


In [12]:
#Function to predict promotion status
def predict_promotion(text):
    text_vector = vectorizer.transform([text])
    prediction = model.predict(text_vector)[0]
    return " Promotion" if prediction == 1 else " Non-Promotion"


In [14]:
import pandas as pd
import os

os.makedirs("data/predicted", exist_ok=True)

#function to predict and save
def predict_and_save(site_name):
    print(f"\n Predicting promotions in {site_name} data...")
    file_path = f"data/raw/{site_name.lower()}_raw.csv"
    df = pd.read_csv(file_path)
    df.dropna(inplace=True)
    df['text'] = df['text'].astype(str)
    df['prediction'] = df['text'].apply(predict_promotion)
    output_path = f"data/predicted/{site_name.lower()}_predicted.csv"
    df.to_csv(output_path, index=False)
    print(f" Predictions saved to {output_path}")


In [16]:

predict_and_save("Amazon")
predict_and_save("Ajio")
predict_and_save("Nykaa")



 Predicting promotions in Amazon data...
 Predictions saved to data/predicted/amazon_predicted.csv

 Predicting promotions in Ajio data...
 Predictions saved to data/predicted/ajio_predicted.csv

 Predicting promotions in Nykaa data...
 Predictions saved to data/predicted/nykaa_predicted.csv
