In [1]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import requests

import warnings
warnings.filterwarnings("ignore")

def scrape_flipkart(page):
    url = f"https://www.flipkart.com/search?q=tv&page={page}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")

    def safe_extract(tag, class_name):
        return [x.get_text(" ", strip=True) for x in soup.find_all(tag, {'class': class_name})]

    names   = safe_extract('div', 'RG5Slk')
    ratings = safe_extract('div', 'MKiFS6')
    infos   = safe_extract('div', 'CMXw7N')
    prices  = safe_extract('div', 'hZ3P6w DeU9vF')
    offers  = safe_extract('div', 'HQe8jr')
    Deal = safe_extract('div', 'hx1EGN')
    rating_review = safe_extract('span', 'PvbNMB')
    

    
    max_len = max(len(names), len(ratings), len(infos), len(prices), len(offers),len(Deal),len(rating_review))
    pad = lambda lst: lst + [""] * (max_len - len(lst))

    return list(zip(pad(names), pad(ratings), pad(infos), pad(prices), pad(offers),pad(Deal),pad(rating_review)))

all_data = []
for page in range(1, 53):
    all_data.extend(scrape_flipkart(page))

print(all_data)

[('XIAOMI A Series 108 cm (43 inch) Full HD LED Smart Google TV 2025 Edition with 200+ Free Channels | HD...', '4.3', 'Full HD | LED Model ID: L43MB-AFIN Launch Year: 2025 Total Sound Output: 20 W 1 year comprehensive warranty on product and 1 year additional on Panel provided by the brand from the date of purchase', '₹19,999', '35% off', 'Lowest price since launch', '1,07,508 Ratings & 6,963 Reviews'), ('XIAOMI F Pro 138 cm (55 inch) QLED Ultra HD (4K) Smart Fire TV 2025 Edition with 34W Box Speakers | Do...', '4', 'Ultra HD (4K) | QLED Model ID: L55MB-FPIN Launch Year: 2025 Total Sound Output: 34 W 1 year comprehensive warranty', '₹32,999', '47% off', 'Upto ₹2,000 Off on Exchange', '12,783 Ratings & 720 Reviews'), ('Samsung 80 cm (32 inch) HD Ready LED Smart Tizen TV 2025 Edition with Voice Assistance Remote Control ...', '4.3', 'HD Ready | LED Model ID: UA32H4500FUXXL Launch Year: 2025 Total Sound Output: 20 W 2 Year Warranty (1 Year Standard Warranty + 1 Year additional warranty on

In [2]:
cols = ["TV_Name", "Rating", "Info_Raw", "Price", "Offers", "Deal_Type","rating_review"]
df = pd.DataFrame(all_data, columns=cols)

In [3]:
df = df.replace(["", " "], np.nan)

In [4]:
df.head()

Unnamed: 0,TV_Name,Rating,Info_Raw,Price,Offers,Deal_Type,rating_review
0,XIAOMI A Series 108 cm (43 inch) Full HD LED S...,4.3,Full HD | LED Model ID: L43MB-AFIN Launch Year...,"₹19,999",35% off,Lowest price since launch,"1,07,508 Ratings & 6,963 Reviews"
1,XIAOMI F Pro 138 cm (55 inch) QLED Ultra HD (4...,4.0,Ultra HD (4K) | QLED Model ID: L55MB-FPIN Laun...,"₹32,999",47% off,"Upto ₹2,000 Off on Exchange","12,783 Ratings & 720 Reviews"
2,Samsung 80 cm (32 inch) HD Ready LED Smart Tiz...,4.3,HD Ready | LED Model ID: UA32H4500FUXXL Launch...,"₹12,990",27% off,Hot Deal,"1,24,643 Ratings & 7,448 Reviews"
3,TCL C6KS 139 cm (55 inch) Ultra HD (4K) Mini L...,4.1,Ultra HD (4K) | Mini LED Model ID: 55C6KS Laun...,"₹44,490",55% off,"Upto ₹6,650 Off on Exchange",167 Ratings & 24 Reviews
4,XIAOMI 125 cm (50 inch) Ultra HD (4K) LED Smar...,4.3,Ultra HD (4K) | LED Model ID: L50MB-AIN Launch...,"₹29,999",33% off,Hot Deal,"81,898 Ratings & 6,088 Reviews"


In [5]:
df.dropna(inplace=True)

In [6]:
df.columns

Index(['TV_Name', 'Rating', 'Info_Raw', 'Price', 'Offers', 'Deal_Type',
       'rating_review'],
      dtype='object')

In [7]:
df=df.drop_duplicates()

In [8]:
df.shape

(679, 7)

In [9]:
df.dropna(subset=['TV_Name'],inplace=True)

In [10]:
df.shape

(679, 7)

In [11]:
df.isnull().sum()

TV_Name          0
Rating           0
Info_Raw         0
Price            0
Offers           0
Deal_Type        0
rating_review    0
dtype: int64

In [12]:
# CLEAN NAME
df["name"] = df["TV_Name"].astype(str).str.split().str[0]
# df["name"] = (df["TV_Name"].astype(str).str.replace(r"[^A-Za-z ]+", " ", regex=True).str.replace(r"\s+", " ", regex=True).str.strip())

# EXTRACT SIZES
df["size_cm"] = df["TV_Name"].astype(str).str.extract(r"(\d{2,3})\s*cm", expand=False)
df["size_cm"] = df["size_cm"].fillna(np.nan)
df["size_inch"] = df["TV_Name"].astype(str).str.extract(r"(\d{2,3}(?:\.\d+)?)\s*(?:inch|inches|in|\")", expand=False)
df["size_inch"] = df["size_inch"].fillna(np.nan)


# Display
df["display"] = df["TV_Name"].astype(str).str.extract(r"\b(Mini\s*LED|Neo\s*QLED|QLED|OLED|ULED|LED)\b", flags=re.IGNORECASE, expand=False)
df["display"] = df["display"].fillna("unknown").astype(str).str.strip()

price_text = df["Price"].astype(str)
df["edition"] = df["TV_Name"].astype(str).str.extract(r"(20\d{2}|\s*\d+|\d+\s*|Edition)", expand=False)
df["edition"] = df["edition"].fillna(price_text.str.extract(r"(20\d{2}|\s*\d+|\d+\s*|Edition)", expand=False))
df["edition"] = df["edition"].fillna("unknown")

#MODEL ID, SOUND, WARRANTY from Info_Raw
df["model_id"] = df["Info_Raw"].astype(str).str.extract(r"Model\s*ID[:\s]*([A-Za-z0-9\-]+)", expand=False)
df["model_id"] = df["model_id"].fillna("unknown").astype(str)

df["sound_output"] = df["Info_Raw"].astype(str).str.extract(r"(\d{2,3}\s*)", expand=False)
df["sound_output"] = df["sound_output"].str.replace(" ", "", regex=False).fillna("unknown").astype(str)

df["warranty_years"] = df["Info_Raw"].astype(str).str.extract(r"(\d+)\s*(?:Year|Years|Yr)", expand=False)

# Preserve original Price text, then clean numeric Price
df["Price_clean"] = price_text.str.replace("₹", "", regex=False).str.replace(",", "", regex=False).str.strip()
df["Price"] = pd.to_numeric(df["Price_clean"], errors="coerce").fillna(0).astype(int)
df.drop(columns=["Price_clean"], inplace=True)

# Offers -> numeric percent
df["Offers"] = (df["Offers"].astype(str).str.replace(r"%\s*off", "", regex=True).str.strip())
df["Offers"] = df["Offers"].replace("", np.nan)
df["Offers"] = pd.to_numeric(df["Offers"], errors="coerce").fillna(0).astype(int)

# Deal_Type: remove special characters
df["Deal_Type"] = df["Deal_Type"].astype(str).str.replace(r"[^A-Za-z0-9 ]", "", regex=True).str.strip()

# rating_count & review_count
df["rating_count"] = (df["rating_review"].astype(str).str.extract(r"([\d,]+)\s*Ratings", expand=False))
df["review_count"] = (df["rating_review"].astype(str).str.extract(r"([\d,]+)\s*Reviews", expand=False))

# remove commas and convert to numeric (coerce bad values)
df["rating_count"] = df["rating_count"].str.replace(",", "", regex=False)
df["review_count"] = df["review_count"].str.replace(",", "", regex=False)

df["rating_count"] = pd.to_numeric(df["rating_count"], errors="coerce")
df["review_count"] = pd.to_numeric(df["review_count"], errors="coerce")

# fill missing with mode if available, else 0, then make int
def fill_with_mode_or_zero(series):
    m = series.mode()
    if len(m) > 0 and not np.isnan(m.iloc[0]):
        return series.fillna(int(m.iloc[0])).astype(int)
    else:
        return series.fillna(0).astype(int)

df["rating_count"] = fill_with_mode_or_zero(df["rating_count"])
df["review_count"] = fill_with_mode_or_zero(df["review_count"])

df["warranty_years"] = pd.to_numeric(df["warranty_years"], errors="coerce")
df["warranty_years"] = df["warranty_years"].fillna(0).astype(int)


df.dropna(subset=["size_inch"], inplace=True)
df.dropna(subset=["size_cm"], inplace=True)
df.replace('unknown', np.nan, inplace=True)
df.dropna(inplace=True)
df["size_inch"] = df["size_inch"].astype(float)
df["size_cm"] = df["size_cm"].astype(int)

In [13]:
df.columns

Index(['TV_Name', 'Rating', 'Info_Raw', 'Price', 'Offers', 'Deal_Type',
       'rating_review', 'name', 'size_cm', 'size_inch', 'display', 'edition',
       'model_id', 'sound_output', 'warranty_years', 'rating_count',
       'review_count'],
      dtype='object')

In [14]:
df.head()

Unnamed: 0,TV_Name,Rating,Info_Raw,Price,Offers,Deal_Type,rating_review,name,size_cm,size_inch,display,edition,model_id,sound_output,warranty_years,rating_count,review_count
0,XIAOMI A Series 108 cm (43 inch) Full HD LED S...,4.3,Full HD | LED Model ID: L43MB-AFIN Launch Year...,19999,35,Lowest price since launch,"1,07,508 Ratings & 6,963 Reviews",XIAOMI,108,43.0,LED,108,L43MB-AFIN,43,0,107508,6963
1,XIAOMI F Pro 138 cm (55 inch) QLED Ultra HD (4...,4.0,Ultra HD (4K) | QLED Model ID: L55MB-FPIN Laun...,32999,47,Upto 2000 Off on Exchange,"12,783 Ratings & 720 Reviews",XIAOMI,138,55.0,QLED,138,L55MB-FPIN,55,0,12783,720
2,Samsung 80 cm (32 inch) HD Ready LED Smart Tiz...,4.3,HD Ready | LED Model ID: UA32H4500FUXXL Launch...,12990,27,Hot Deal,"1,24,643 Ratings & 7,448 Reviews",Samsung,80,32.0,LED,80,UA32H4500FUXXL,32,2,124643,7448
3,TCL C6KS 139 cm (55 inch) Ultra HD (4K) Mini L...,4.1,Ultra HD (4K) | Mini LED Model ID: 55C6KS Laun...,44490,55,Upto 6650 Off on Exchange,167 Ratings & 24 Reviews,TCL,139,55.0,Mini LED,6,55C6KS,55,2,167,24
4,XIAOMI 125 cm (50 inch) Ultra HD (4K) LED Smar...,4.3,Ultra HD (4K) | LED Model ID: L50MB-AIN Launch...,29999,33,Hot Deal,"81,898 Ratings & 6,088 Reviews",XIAOMI,125,50.0,LED,125,L50MB-AIN,50,2,81898,6088


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 642 entries, 0 to 1761
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   TV_Name         642 non-null    object 
 1   Rating          642 non-null    object 
 2   Info_Raw        642 non-null    object 
 3   Price           642 non-null    int64  
 4   Offers          642 non-null    int64  
 5   Deal_Type       642 non-null    object 
 6   rating_review   642 non-null    object 
 7   name            642 non-null    object 
 8   size_cm         642 non-null    int64  
 9   size_inch       642 non-null    float64
 10  display         642 non-null    object 
 11  edition         642 non-null    object 
 12  model_id        642 non-null    object 
 13  sound_output    642 non-null    object 
 14  warranty_years  642 non-null    int64  
 15  rating_count    642 non-null    int64  
 16  review_count    642 non-null    int64  
dtypes: float64(1), int64(6), object(10)
mem

In [16]:
df.isnull().sum()

TV_Name           0
Rating            0
Info_Raw          0
Price             0
Offers            0
Deal_Type         0
rating_review     0
name              0
size_cm           0
size_inch         0
display           0
edition           0
model_id          0
sound_output      0
warranty_years    0
rating_count      0
review_count      0
dtype: int64

In [17]:
df.to_csv("Tv_dataset.csv", index=False)