In [1]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import requests
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [2]:
all_data = []

for page in range(1, 53):
    url = f"https://www.flipkart.com/search?q=tv&page={page}"
    print(f"Scraping page: {page}")

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")

    TV_Name   = soup.find_all('div', {'class': 'KzDlHZ'})
    TV_rating = soup.find_all('div', {'class': 'XQDdHH'})
    TV_info   = soup.find_all('div', {'class': '_6NESgJ'})
    TV_price  = soup.find_all('div', {'class': 'Nx9bqj _4b5DiR'})
    TV_offer  = soup.find_all('div', {'class': 'UkUFwK'})

    def safe_extract(tags):
        return [t.get_text(" ", strip=True) for t in tags]

    names   = safe_extract(TV_Name)
    ratings = safe_extract(TV_rating)
    infos   = safe_extract(TV_info)
    prices  = safe_extract(TV_price)
    offers  = safe_extract(TV_offer)

    max_len = max(len(names), len(ratings), len(infos), len(prices), len(offers))

    def pad(lst, length):
        return lst + [""] * (length - len(lst))

    names   = pad(names, max_len)
    ratings = pad(ratings, max_len)
    infos   = pad(infos, max_len)
    prices  = pad(prices, max_len)
    offers  = pad(offers, max_len)

    for a, b, c, d, e in zip(names, ratings, infos, prices, offers):
        all_data.append([a, b, c, d, e])

Scraping page: 1
Scraping page: 2
Scraping page: 3
Scraping page: 4
Scraping page: 5
Scraping page: 6
Scraping page: 7
Scraping page: 8
Scraping page: 9
Scraping page: 10
Scraping page: 11
Scraping page: 12
Scraping page: 13
Scraping page: 14
Scraping page: 15
Scraping page: 16
Scraping page: 17
Scraping page: 18
Scraping page: 19
Scraping page: 20
Scraping page: 21
Scraping page: 22
Scraping page: 23
Scraping page: 24
Scraping page: 25
Scraping page: 26
Scraping page: 27
Scraping page: 28
Scraping page: 29
Scraping page: 30
Scraping page: 31
Scraping page: 32
Scraping page: 33
Scraping page: 34
Scraping page: 35
Scraping page: 36
Scraping page: 37
Scraping page: 38
Scraping page: 39
Scraping page: 40
Scraping page: 41
Scraping page: 42
Scraping page: 43
Scraping page: 44
Scraping page: 45
Scraping page: 46
Scraping page: 47
Scraping page: 48
Scraping page: 49
Scraping page: 50
Scraping page: 51
Scraping page: 52


In [4]:
df = pd.DataFrame(all_data, columns=["TV_Name", "Rating", "Info_Raw", "Price", "Offers"])

df["TV_Name"].replace("", np.nan, inplace=True)
df.dropna(subset=["TV_Name"], inplace=True)

df['Resolution'] = df['Info_Raw'].str.extract(r'^(.*?)\s*\|')
df['Panel_Type'] = df['Info_Raw'].str.extract(r'\|\s*(.*?)\s*Model ID')
df['Model_ID'] = df['Info_Raw'].str.extract(r'Model ID:\s*([A-Za-z0-9\-\s\/]+?)\s*Launch')
df['Launch_Year'] = df['Info_Raw'].str.extract(r'Launch Year:\s*(\d{4})')
df['Sound_Output'] = df['Info_Raw'].str.extract(r'Total Sound Output:\s*(\d+)\s*W')
df['Warranty'] = df['Info_Raw'].str.extract(r'(Warranty.*)')

df['Sound_Output'] = df['Sound_Output'].astype(float)
df['Launch_Year'] = df['Launch_Year'].astype(int)
df["TV_Name"] = df["TV_Name"].replace("", np.nan)

df['Tv_name'] = df['TV_Name'].str.extract(r'^(.*?)\s*\d+\s*cm')
df['Size_cm'] = df['TV_Name'].str.extract(r'(\d+)\s*cm').astype(float)
df['Size_inch'] = df['TV_Name'].str.extract(r'\((\d+)\s*inch').astype(float)

df['Offers'] = df['Offers'].astype(str)
df['Offers'] = df['Offers'].str.replace('%', '', regex=False)
df['Offers'] = df['Offers'].str.replace('off', '', regex=False).str.strip()
df['Offers'] = df['Offers'].replace("", "0").astype(int)

df['Price'] = (df['Price'].astype(str).str.replace('₹', '', regex=False).str.replace(',', '', regex=False).str.strip())

df['Warranty'] = df['Warranty'].fillna("No")
df['Size_cm'] = df['Size_cm'].fillna(0)
df['Size_inch'] = df['Size_inch'].fillna(0)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["TV_Name"].replace("", np.nan, inplace=True)


In [5]:
df.columns

Index(['TV_Name', 'Rating', 'Info_Raw', 'Price', 'Offers', 'Resolution',
       'Panel_Type', 'Model_ID', 'Launch_Year', 'Sound_Output', 'Warranty',
       'Tv_name', 'Size_cm', 'Size_inch'],
      dtype='object')

In [6]:
def get_os(x):
    x = x.lower()
    if "google tv" in x: return "Google TV"
    if "android tv" in x: return "Android TV"
    if "webos" in x: return "WebOS TV"
    if "tizen" in x: return "Tizen TV"
    if "fire tv" in x: return "Fire TV"
    if "linux tv" in x: return "Linux TV"
    if "coolita tv" in x: return "Coolita TV"
    return "Other"

df['OS_Type'] = df['TV_Name'].apply(get_os)

products = df[['Tv_name','Resolution','Panel_Type','Launch_Year',
               'Sound_Output','Warranty','Size_cm','Size_inch',
               'OS_Type','Rating','Price']]
products.drop_duplicates()
print(products)
future = products.drop(columns="Price")

le = LabelEncoder()
for col in future.columns:
    if future[col].dtype == 'object':
        future[col] = le.fit_transform(future[col].astype(str))

                    Tv_name     Resolution Panel_Type  Launch_Year  \
0                    Foxsky       HD Ready       QLED         2024   
1     KODAK Special Edition       HD Ready        LED         2024   
2                    Foxsky       HD Ready        LED         2023   
3           realme TechLife       HD Ready       QLED         2025   
4     KODAK Special Edition        Full HD        LED         2024   
...                     ...            ...        ...          ...   
1468                Samsung  Ultra HD (4K)       OLED         2025   
1469            TCL V4C 80.       HD Ready       QLED         2025   
1470                 Foxsky       HD Ready        LED         2023   
1471                   VONE        Full HD        LED         2024   
1472                    BPL        Full HD        LED         2024   

      Sound_Output                                           Warranty  \
0             30.0      Warranty on Product from the date of purchase   
1            

In [7]:
X = future
y = products["Price"]

In [8]:
X

Unnamed: 0,Tv_name,Resolution,Panel_Type,Launch_Year,Sound_Output,Warranty,Size_cm,Size_inch,OS_Type,Rating
0,49,1,4,2024,30.0,51,80.0,32.0,3,14
1,90,1,0,2024,30.0,44,80.0,32.0,4,13
2,49,1,0,2023,30.0,51,80.0,32.0,0,13
3,295,1,4,2025,26.0,28,80.0,32.0,3,15
4,90,0,0,2024,30.0,44,108.0,43.0,4,13
...,...,...,...,...,...,...,...,...,...,...
1468,175,2,3,2025,70.0,3,138.0,55.0,6,22
1469,231,1,4,2025,16.0,1,4.0,32.0,3,13
1470,49,1,0,2023,30.0,51,80.0,32.0,5,22
1471,257,0,0,2024,15.0,33,80.0,32.0,0,0


In [9]:
future.isnull().sum()

Tv_name         0
Resolution      0
Panel_Type      0
Launch_Year     0
Sound_Output    0
Warranty        0
Size_cm         0
Size_inch       0
OS_Type         0
Rating          0
dtype: int64

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(787, 10)
(787,)
(197, 10)
(197,)


In [12]:
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("accuracy_score:", accuracy_score(y_test, y_pred))
print("classification_report :",classification_report(y_test, y_pred))
print("confusion_matrix :", confusion_matrix(y_test, y_pred))


accuracy_score: 0.20812182741116753
classification_report :               precision    recall  f1-score   support

                   0.00      0.00      0.00         3
       10499       0.00      0.00      0.00         1
       10699       0.00      0.00      0.00         1
       10990       0.60      0.60      0.60         5
      112990       0.86      1.00      0.92         6
       11499       0.00      0.00      0.00         1
      114999       0.00      0.00      0.00         1
       11500       0.00      0.00      0.00         1
      117999       0.00      0.00      0.00         1
       11849       0.00      0.00      0.00         1
      119899       0.00      0.00      0.00         1
       11999       0.00      0.00      0.00         2
      119999       0.00      0.00      0.00         1
       12490       0.75      0.86      0.80         7
       12606       0.00      0.00      0.00         1
       12998       0.00      0.00      0.00         1
       13049       0.

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
print("hello")

hello
