In [2]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import requests
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [None]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import requests
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# ----------------------------
# 1. SCRAPING FUNCTION
# ----------------------------
def scrape_flipkart(page):
    url = f"https://www.flipkart.com/search?q=tv&page={page}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")

    def safe_extract(tag, class_name):
        return [x.get_text(" ", strip=True) for x in soup.find_all(tag, {'class': class_name})]

    names   = safe_extract('div', 'KzDlHZ')
    ratings = safe_extract('div', 'XQDdHH')
    infos   = safe_extract('div', '_6NESgJ')
    prices  = safe_extract('div', 'Nx9bqj _4b5DiR')
    offers  = safe_extract('div', 'UkUFwK')
    Deal_type  = safe_extract('div', 'yiggsN O5Fpg8')
    Deal_type  = safe_extract('span', 'Wphh3N','span')
    

    
    max_len = max(len(names), len(ratings), len(infos), len(prices), len(offers),len(Deal_type))
    pad = lambda lst: lst + [""] * (max_len - len(lst))

    return list(zip(pad(names), pad(ratings), pad(infos), pad(prices), pad(offers),pad(Deal_type)))

# ----------------------------
# 2. SCRAPE ALL PAGES
# ----------------------------
all_data = []
for page in range(1, 53):
    print(f"Scraping page {page}...")
    all_data.extend(scrape_flipkart(page))

# ----------------------------
# 3. CREATE DATAFRAME
# ----------------------------
df = pd.DataFrame(all_data, columns=["TV_Name", "Rating", "Info_Raw", "Price", "Offers","Deal_type"])
df["TV_Name"].replace("", np.nan, inplace=True)
df.dropna(subset=["TV_Name"], inplace=True)

# ----------------------------
# 4. CLEANING & FEATURE EXTRACTION
# ----------------------------
df['Resolution']    = df['Info_Raw'].str.extract(r'^(.*?)\s*\|')
df['Panel_Type']    = df['Info_Raw'].str.extract(r'\|\s*(.*?)\s*Model ID')
df['Model_ID']      = df['Info_Raw'].str.extract(r'Model ID:\s*(.*?)\s*Launch')
df['Launch_Year']   = df['Info_Raw'].str.extract(r'Launch Year:\s*(\d{4})').astype(float)
df['Sound_Output']  = df['Info_Raw'].str.extract(r'Total Sound Output:\s*(\d+)\s*W').astype(float)
df['Warranty']      = df['Info_Raw'].str.extract(r'(Warranty.*)').fillna("No Warranty")

df['Tv_name']       = df['TV_Name'].str.extract(r'^(.*?)\s*\d+\s*cm')
df['Size_cm']       = df['TV_Name'].str.extract(r'(\d+)\s*cm').astype(float)
df['Size_inch']     = df['TV_Name'].str.extract(r'\((\d+)\s*inch').astype(float)

df['Offers'] = df['Offers'].str.replace('%', '').str.replace('off', '').replace("", "0").astype(int)
df['Price']  = df['Price'].str.replace('â‚¹', '').str.replace(',', '').astype(float)

# ----------------------------
# 5. OS Type Extraction
# ----------------------------
def get_os(x):
    x = x.lower()
    for os in ["google tv", "android tv", "webos", "tizen", "fire tv", "linux tv", "coolita tv"]:
        if os in x:
            return os.title()
    return "Other"

df['OS_Type'] = df['TV_Name'].apply(get_os)

products = df[['Tv_name','Resolution','Panel_Type','Launch_Year',
               'Sound_Output','Warranty','Size_cm','Size_inch',
               'OS_Type','Rating','Deal_type','Price']].drop_duplicates()

final_df = df[[
    "TV_Name", 
    "Tv_name",
    "Resolution",
    "Panel_Type",
    "Model_ID",
    "Launch_Year",
    "Sound_Output",
    "Warranty",
    "Size_cm",
    "Size_inch",
    "OS_Type",
    "Rating",
    "Deal_type",
    "Price",
    "Offers"
]]

# Export as CSV
final_df.to_csv("Flipkart_TV_Splitted_Cleaned.csv", index=False)

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Scraping page 41...
Scraping page 42...
Scraping page 43...
Scraping page 44...
Scraping page 45...
Scraping page 46...
Scraping page 47...
Scraping page 48...
Scraping page 49...
Scraping page 50...
Scraping 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["TV_Name"].replace("", np.nan, inplace=True)


In [None]:
def scrape_flipkart(page):
    url = f"https://www.flipkart.com/search?q=tv&page={page}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")

    def safe_extract(tag, class_name):
        return [x.get_text(" ", strip=True) for x in soup.find_all(tag, {'class': class_name})]

    names      = safe_extract('div', 'KzDlHZ')
    ratings    = safe_extract('div', 'XQDdHH')
    infos      = safe_extract('div', '_6NESgJ')
    prices     = safe_extract('div', 'Nx9bqj _4b5DiR')
    offers     = safe_extract('div', 'UkUFwK')

    # ðŸ‘‰ UPDATED: extract tUxRFH only
    Deal_type  = safe_extract('div', 'tUxRFH')
    
    max_len = max(len(names), len(ratings), len(infos), len(prices), len(offers), len(Deal_type))
    pad = lambda lst: lst + [""] * (max_len - len(lst))

    return list(zip(
        pad(names), pad(ratings), pad(infos), pad(prices), pad(offers), pad(Deal_type)
    ))