### Imports

In [1]:
import csv
import re
import os

import pandas as pd
from bs4 import BeautifulSoup
from request_service import *

**Making an HTTP request to a given URL is delegated to a separate class.**

**Collecting advertisement URLs from the site pages.**

In [2]:
# Root directory for every file this notebook reads or writes.
DATASET_PACKAGE = 'datasets'
# Sub-directories for downloaded product images and scraped description texts.
IMAGE_PACKAGE = f'{DATASET_PACKAGE}/images'
DESCRIPTION_PACKAGE = f'{DATASET_PACKAGE}/descriptions'

# Advertisement URLs, one per line (written by save_ads_to_file, read by load_existing_ads).
AD_FILE = f'{DATASET_PACKAGE}/mobiles_ads_urls.txt'
# Raw scraped rows, tab-separated (appended to by parse_ads_to_tsv).
TSV_FILE = f'{DATASET_PACKAGE}/full_mobiles_dataset.tsv'
# Default output path of write_to_arff.
ARFF_FILE = f'{DATASET_PACKAGE}/full_mobiles_dataset.arff'

# Running counters used as file-name prefixes; write_description_to_file and
# download_image each increment their counter before writing a file.
# NOTE(review): 461 presumably mirrors the resume offset `cnt = 461` in
# parse_ads_to_tsv (files 1..461 already on disk) — confirm before a fresh run.
file_index_descr = 461
file_index_image = 461


def replacing_whitespaces_by_(model):
    """Turn a model title into a token that is safe to embed in a file name.

    Leading/trailing whitespace is dropped, every run of whitespace becomes a
    single underscore and every ``/`` (a path separator) becomes ``-``.

    Args:
        model: Human-readable model title.

    Returns:
        The sanitised string.
    """
    # The pattern is r'\s+' (one or more whitespace characters). The previous
    # r'[\s+]' was a character class that also replaced literal '+' signs and
    # produced one underscore per whitespace character.
    return re.sub(r'\s+', '_', model.strip()).replace('/', '-')


def write_description_to_file(description, model):
    """Store an advertisement description in its own text file.

    The global ``file_index_descr`` counter is incremented first and used as
    the file-name prefix, so every call produces a new file.

    Args:
        description: Description text to store (may be empty).
        model: Model title; sanitised with ``replacing_whitespaces_by_`` for
            use in the file name.

    Returns:
        Path of the file that was written.
    """
    global file_index_descr
    file_index_descr += 1
    # A fresh checkout has no output directory yet; create it instead of
    # failing with FileNotFoundError on the first write.
    os.makedirs(DESCRIPTION_PACKAGE, exist_ok=True)
    file_name = f'{DESCRIPTION_PACKAGE}/{file_index_descr}_{replacing_whitespaces_by_(model)}.txt'
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(description)
    print(f"Wrote description to file for {model}\n")
    return file_name


def download_image(url, model):
    """Download a product image into the image directory.

    The global ``file_index_image`` counter is incremented first and used as
    the file-name prefix. When the request yields no usable response an empty
    placeholder file is still written, so one file exists per counter value.

    Args:
        url: Image URL handed to ``make_plain_request``.
        model: Model title; sanitised with ``replacing_whitespaces_by_`` for
            use in the file name.
    """
    global file_index_image
    file_index_image += 1
    # Create the output directory on first use instead of failing on open().
    os.makedirs(IMAGE_PACKAGE, exist_ok=True)
    file_name = f'{IMAGE_PACKAGE}/{file_index_image}_{replacing_whitespaces_by_(model)}.png'
    response = make_plain_request(url)
    # The file is opened in binary mode, so the fallback must be bytes; the
    # previous str fallback ("") raised TypeError on every failed download.
    payload = response.content if response else b""
    with open(file_name, 'wb') as file:
        file.write(payload)
    if payload:
        print(f"Downloaded image for {model}\n")
    else:
        print(f"No image downloaded for {model} (empty placeholder written)\n")


def get_all_advertisements_urls(url):
    """Collect the advertisement links found on one listing page.

    Args:
        url: URL of the listing page to fetch with ``make_request``.

    Returns:
        A list of URLs built as ``BASE_URL`` + each link's ``href``; an empty
        list when the request returned nothing.
    """
    src = make_request(url)
    if not src:
        return []

    soup = BeautifulSoup(src, 'lxml')
    # find_all is the current spelling (findAll is a deprecated alias) and is
    # what the rest of this notebook uses. href=True skips anchors that carry
    # the class but no link, which would otherwise raise KeyError below.
    ads = soup.find_all('a', class_='b-good__title-link', href=True)
    return [BASE_URL + ad['href'] for ad in ads]


def save_ads_to_file(ads, filename):
    """Append every entry of ``ads`` to ``filename``, one per line.

    Args:
        ads: Iterable of advertisement URL strings.
        filename: Path of the UTF-8 text file to append to (created if absent).
    """
    with open(filename, 'a', encoding='utf-8') as out:
        out.writelines(entry + '\n' for entry in ads)


def load_existing_ads(filename):
    """Read previously saved advertisement URLs.

    Args:
        filename: Path of the UTF-8 text file holding one URL per line.

    Returns:
        The whitespace-stripped lines of the file in order, or an empty list
        when the file does not exist.
    """
    if os.path.exists(filename):
        with open(filename, 'r', encoding='utf-8') as source:
            return [raw.strip() for raw in source]
    return []


def collect_advertisements_from_pages(start, end, save_file=AD_FILE):
    """Crawl listing pages ``start``..``end`` and persist newly seen ad URLs.

    URLs already present in ``save_file`` are loaded first, so only links not
    seen before are appended to it.

    Args:
        start: First page number to fetch (inclusive).
        end: Last page number to fetch (inclusive).
        save_file: Text file that accumulates the URLs, one per line.

    Returns:
        A list of every known URL (previously saved plus newly found), in
        arbitrary order.
    """
    known_ads = set(load_existing_ads(save_file))

    for page in range(start, end + 1):
        cur_url = f"{BASE_URL}{CATEGORY_URL}?page={page}&{OPTIONS}"
        print(f"Fetching page {page}\n")
        page_ads = get_all_advertisements_urls(cur_url)
        # dict.fromkeys de-duplicates within the page while keeping page order.
        fresh_ads = [ad for ad in dict.fromkeys(page_ads) if ad not in known_ads]
        if fresh_ads:
            known_ads.update(fresh_ads)
            # Append only this page's new links. Appending the cumulative set
            # of new links after every page wrote earlier pages' URLs to the
            # file again and again.
            save_ads_to_file(fresh_ads, save_file)

    print(*known_ads, sep="\n")
    return list(known_ads)

In [3]:
# ads_list = collect_advertisements_from_pages(1, 21)
# print(f"Downloaded {len(ads_list)} ads\n")
# cur_url = "https://spb.shop.megafon.ru/mobile?si_sbmt=1&si_actions=&si_av=1&si_archVal=0&si_courier=1&si_salon=1&si_specs_2=22211&si_specs_8924=QTYwcw_e_e,QTcw,UzIz"
# 
# page_ads = get_all_advertisements_urls(cur_url)
# if page_ads:
#     save_ads_to_file(page_ads, AD_FILE)

Deleting duplicate rows from the given file (to avoid parsing the same link several times).

In [4]:
def delete_similarities(file_name=AD_FILE):
    """Rewrite ``file_name`` in place without duplicate or blank lines.

    Lines are compared after stripping surrounding whitespace. The first
    occurrence of each line keeps its relative position, so index-based
    resuming over the file (as ``parse_ads_to_tsv`` does) stays valid after
    de-duplication.

    Args:
        file_name: Path of the UTF-8 text file to clean.

    Returns:
        The unique, non-empty lines in their original order.
    """
    with open(file_name, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        # dict.fromkeys removes duplicates but, unlike set(), keeps order.
        ads = list(dict.fromkeys(line for line in stripped_lines if line))
    print(f"unique ads: {len(ads)}\n")
    with open(file_name, 'w', encoding='utf-8') as f:
        f.writelines(f"{ad}\n" for ad in ads)
    return ads


def get_ads_from_file(file_name=AD_FILE):
    """Return the whitespace-stripped lines of ``file_name`` as a list.

    Args:
        file_name: Path of the UTF-8 text file holding one entry per line.

    Returns:
        The stripped lines in file order.
    """
    with open(file_name, 'r', encoding='utf-8') as source:
        return [raw.strip() for raw in source]

In [5]:
# mobiles = delete_similarities('datasets/full_mobiles_dataset_copy.tsv')
# with open('datasets/full_mobiles_dataset_copy.tsv', 'w', encoding='utf-8') as f:
#     for mobile in mobiles:
#         f.write(f"{mobile}\n")
# print(len(mobiles))

### Parsing advertisement

In [1]:
# Dataset columns whose values are numeric (unit in parentheses).
NUMERIC_CATEGORIES = [
    'price(₽)',
    'old_price(₽)',
    'rating',
    'rating_count',
    'weight(g)',
    'diagonal(inches)',
    'refresh_rate(Hz)',
    'memory(Gb)',
    'ram(Gb)',
    'battery(mAh)',
    'front_camera(Mpix)',
]

# Every dataset column: the model name, the numeric columns, then the
# textual ones.
CATEGORIES = [
    'model',
    *NUMERIC_CATEGORIES,
    'company',
    'color',
    'release_date',
    'country',
    'display',
    'processor',
    'main_camera',
]


class MobileAd:
    """One smartphone advertisement scraped from a product page.

    All values are stored exactly as scraped (strings or ``None``); no numeric
    conversion happens here.
    """

    # Specification captions as they appear on the page -> attribute names.
    # NOTE(review): 'company' and 'color' are mapped here, but from_soup takes
    # those two values from the page title and the selected colour swatch.
    PARAMS_MAP = {
        'Производитель': 'company',
        'Начало продаж': 'year',
        'Страна происхождения': 'country',
        'Цвет': 'color',
        'Вес': 'weight',
        'Диагональ дисплея': 'diagonal',
        'Тип дисплея': 'display_type',
        'Частота обновления экрана': 'refresh_rate',
        'Объем встроенной памяти': 'memory',
        'Объем оперативной памяти (RAM)': 'ram',
        'Емкость аккумулятора': 'battery',
        'Процессор': 'processor_frequency',
        'Фронтальная камера': 'front_camera',
        'Основная камера': 'main_camera'
    }

    # Attribute names in the order to_tsv() emits them (same order as the
    # constructor parameters).
    # NOTE(review): this differs from the module-level CATEGORIES list, which
    # places all numeric columns directly after 'model' — confirm which order
    # the header of the TSV file actually uses.
    TSV_FIELDS = (
        'model', 'price', 'old_price', 'rating', 'rating_count', 'company', 'color', 'year', 'country',
        'weight', 'diagonal', 'display_type', 'refresh_rate', 'memory', 'ram', 'battery',
        'processor_frequency', 'front_camera', 'main_camera',
    )

    def __init__(self, model, price, old_price, rating, rating_count, company, color, year, country, weight, diagonal,
                 display_type,
                 refresh_rate, memory, ram, battery,
                 processor_frequency, front_camera, main_camera):
        """Store the scraped values unchanged, one attribute per parameter."""
        self.model = model
        self.price = price
        self.old_price = old_price
        self.rating = rating
        self.rating_count = rating_count
        self.company = company
        self.color = color
        self.year = year
        self.country = country
        self.weight = weight
        self.diagonal = diagonal
        self.display_type = display_type
        self.refresh_rate = refresh_rate
        self.memory = memory
        self.ram = ram
        self.battery = battery
        self.processor_frequency = processor_frequency
        self.front_camera = front_camera
        self.main_camera = main_camera

    def to_tsv(self):
        """Return the attribute values as a list, in ``TSV_FIELDS`` order."""
        return [getattr(self, field) for field in self.TSV_FIELDS]

    def to_str(self):
        """Return a multi-line, human-readable summary of the advertisement."""
        return f"Название: {self.model}, цена: {self.price}, старая цена: {self.old_price}, оценка: {self.rating}\n" + '\n'.join(
            f"{name}:{getattr(self, attr)}" for name, attr in self.PARAMS_MAP.items())

    @staticmethod
    def _text_or_none(tag):
        """Return the stripped text of ``tag``, or ``None`` when it is missing."""
        return tag.get_text(strip=True) if tag else None

    @classmethod
    def _parse_specs(cls, soup):
        """Read the specification table into ``{attribute name: raw text}``.

        Captions and values are paired positionally; captions that are not in
        ``PARAMS_MAP`` are ignored.
        """
        params = {}
        specs_heads = soup.find_all('div', class_='b-good-specs__head')
        specs_contents = soup.find_all('div', class_='b-good-specs__content')

        for head, content in zip(specs_heads, specs_contents):
            param_name_tag = head.find('div', class_='name g_tool_tip_container')

            if param_name_tag:
                # Drop the nested span and tooltip text so that only the
                # caption remains. Not every caption has a span; calling
                # decompose() on a missing one raised AttributeError.
                span_tag = param_name_tag.find('span')
                if span_tag:
                    span_tag.decompose()
                spec_descr_tag = param_name_tag.find('div', class_='spec_descr ttc-message')
                if spec_descr_tag:
                    spec_descr_tag.decompose()
                param_name = param_name_tag.get_text(strip=True)
            else:
                param_name = head.get_text(strip=True)
                if param_name.endswith(':'):
                    param_name = param_name[:-1]

            if param_name in cls.PARAMS_MAP:
                params[cls.PARAMS_MAP[param_name]] = content.get_text(strip=True)

        return params

    @classmethod
    def from_soup(cls, soup):
        """Build a ``MobileAd`` from a parsed product page.

        Side effects: the description text is written through
        ``write_description_to_file`` and the product image is fetched through
        ``download_image`` (called even when no image URL was found, with an
        empty URL).

        Args:
            soup: Parsed product page (BeautifulSoup document).

        Returns:
            A ``MobileAd``; values that could not be found are ``None``.

        Raises:
            AttributeError: If the page has no ``<h1>`` element.
        """
        title = soup.find('h1').get_text()

        # The manufacturer is taken to be the first word after "Смартфон".
        company_match = re.search(r"Смартфон\s+([\w]+)", title)
        company = company_match.group(1) if company_match else None

        color_tag = soup.find('span', class_='b-options__color-item variantColor b-options__color-item_state_selected')
        color = color_tag.get('title').split('Цвет:')[-1].strip() if color_tag else None

        rating = cls._text_or_none(soup.find('span', class_='b-comments__rating-value'))

        rating_count_tag = soup.find('meta', itemprop='ratingCount')
        rating_count = rating_count_tag.get('content') if rating_count_tag else None

        price = cls._text_or_none(soup.find('span', class_='b-price-cards__value b-price__value'))
        old_price = cls._text_or_none(
            soup.find('span', class_='b-price-cards__old-price-value b-price__old-price-value'))

        description_tag = soup.find(class_='b-good-description__full nosmall')
        description = description_tag.get_text() if description_tag else ""
        write_description_to_file(description, title)

        meta_tag = soup.find('meta', {'itemprop': 'image'})
        image_url = meta_tag['content'] if meta_tag and 'content' in meta_tag.attrs else ""
        download_image(image_url, title)

        params = cls._parse_specs(soup)
        # Only these attributes come from the specification table.
        spec_fields = ('year', 'country', 'weight', 'diagonal', 'display_type', 'refresh_rate', 'memory',
                       'ram', 'battery', 'processor_frequency', 'front_camera', 'main_camera')

        return cls(
            model=title,
            price=price,
            old_price=old_price,
            rating=rating,
            rating_count=rating_count,
            company=company,
            color=color,
            **{field: params.get(field) for field in spec_fields}
        )

In [7]:
def parse_advertisement(ad_url):
    """Fetch one advertisement page and parse it into a ``MobileAd``.

    Args:
        ad_url: URL of the advertisement page.

    Returns:
        The parsed ``MobileAd``, or ``None`` when the request returned
        nothing.
    """
    src = make_request(ad_url)
    # Same guard as get_all_advertisements_urls: a failed request yields a
    # falsy value, which must not be handed to the HTML parser.
    if not src:
        print(f"Failed to fetch {ad_url}\n")
        return None
    soup = BeautifulSoup(src, 'lxml')
    print(f"Parsed {ad_url}\n")
    return MobileAd.from_soup(soup)


def parse_first_ad(ads_list, tsv_file_name=TSV_FILE, index=100):
    """Parse a single advertisement and append it to the TSV file.

    Intended as a smoke test before running the full scrape.

    Args:
        ads_list: List of advertisement URLs.
        tsv_file_name: Tab-separated file to append the row to.
        index: Position in ``ads_list`` of the advertisement to parse
            (defaults to 100, the value that used to be hard-coded).

    Raises:
        IndexError: If ``ads_list`` has no element at ``index``.
    """
    ad = parse_advertisement(ads_list[index])
    if ad is None:
        # Nothing was parsed (e.g. the request failed): write no row.
        return
    # newline='' is what the csv module requires for files it writes;
    # without it every row gets an extra '\r' on Windows.
    with open(tsv_file_name, 'a', encoding='utf-8', newline='') as tsv_file:
        csv.writer(tsv_file, delimiter='\t').writerow(ad.to_tsv())


def parse_ads_to_tsv(ads_list, tsv_file_name=TSV_FILE, start=461):
    """Parse advertisements from ``ads_list[start:]`` and append them to the TSV file.

    Args:
        ads_list: List of advertisement URLs.
        tsv_file_name: Tab-separated file to append rows to.
        start: Number of leading advertisements to skip, i.e. those already
            processed by an earlier run (defaults to 461, the value that used
            to be hard-coded).
            NOTE(review): presumably has to stay in sync with the module-level
            ``file_index_descr`` / ``file_index_image`` counters — confirm.
    """
    # newline='' is what the csv module requires for files it writes;
    # without it every row gets an extra '\r' on Windows.
    with open(tsv_file_name, 'a', encoding='utf-8', newline='') as tsv_file:
        writer = csv.writer(tsv_file, delimiter='\t')
        # `position` is the 1-based index of the ad within ads_list.
        for position, ad_url in enumerate(ads_list[start:], start=start + 1):
            ad = parse_advertisement(ad_url)
            if ad is None:
                # A failed fetch must not abort the whole run.
                print(f"{position}|{ad_url} skipped\n")
                continue
            writer.writerow(ad.to_tsv())
            print(f"{position}|{ad_url} wrote to file\n")

In [8]:
# parse_ads_to_tsv(ads_list, TSV_FILE)

**Cleaning the data and writing it to ARFF**

In [9]:
# Destination of the cleaned copy of the scraped TSV dataset.
clean_tsv_file_name = f'{DATASET_PACKAGE}/clean_full_mobiles_dataset.tsv'

# Russian month names mapped to their zero-padded two-digit numbers.
months = {
    month_name: f"{number:02d}"
    for number, month_name in enumerate(
        ['Январь', 'Февраль', 'Март', 'Апрель', 'Май', 'Июнь',
         'Июль', 'Август', 'Сентябрь', 'Октябрь', 'Ноябрь', 'Декабрь'],
        start=1,
    )
}


def clean_numeric_in_tsv_file(input_file=TSV_FILE, output_file=clean_tsv_file_name):
    """Normalise numeric columns and release dates of the scraped dataset.

    Every column listed in ``NUMERIC_CATEGORIES`` is reduced to digits and
    dots, ``release_date`` is rewritten as ``<year>-<MM>``, and the result is
    saved as a new tab-separated file.

    Args:
        input_file: Tab-separated file with a header row to read.
        output_file: Tab-separated file to write the cleaned data to.
    """

    def clean_numeric(value):
        """Strip units/separators from a string; render integral floats without '.0'."""
        if isinstance(value, str):
            # NOTE(review): only digits and '.' survive, so a decimal comma
            # such as '6,7' becomes '67' — confirm the source never uses one.
            return re.sub(r"[^\d.]", '', value)
        if isinstance(value, float):
            try:
                if value == int(value):
                    return str(int(value))
            except (ValueError, OverflowError):
                # int() rejects NaN (ValueError) and infinities (OverflowError).
                return value
        return value

    def unify_date(date):
        """Convert '<month name> <year> <suffix>' to '<year>-<MM>'; '' when missing or malformed."""
        # Missing cells arrive from pandas as float NaN. Treat every
        # non-string or blank value as missing up front, rather than letting
        # it fail inside the parser and print a spurious error.
        if not isinstance(date, str) or not date.strip():
            return ""
        try:
            month, year, _ = date.split()
        except ValueError as e:
            print(f"Error processing date '{date}': {e}")
            return ""
        if month in months:
            return f"{year}-{months[month]}"
        # Unknown month name: report it and leave the value untouched.
        print(month)
        return date

    df = pd.read_csv(input_file, sep='\t')
    for category in NUMERIC_CATEGORIES:
        df[category] = df[category].apply(clean_numeric)
    df['release_date'] = df['release_date'].apply(unify_date)
    df.to_csv(output_file, sep='\t', index=False)


In [10]:
def collect_dept_attributes_from_tsv(tsv_file_name=TSV_FILE):
    """Collect the distinct values of the categorical columns of a TSV file.

    Values are stripped of surrounding whitespace; empty values are ignored.

    Args:
        tsv_file_name: Tab-separated file with a header row.

    Returns:
        A tuple of five sets, in this order:
        ``(companies, colors, countries, processors, displays)``.
    """
    # The order of this tuple defines the order of the returned sets.
    columns = ('company', 'color', 'country', 'processor', 'display')
    distinct = {column: set() for column in columns}

    with open(tsv_file_name, 'r', encoding='utf-8', newline='') as tsv:
        for row in csv.DictReader(tsv, delimiter='\t'):
            for column in columns:
                # Every column is stripped the same way; 'country' used to be
                # the only one stored unstripped.
                value = (row.get(column) or '').strip()
                if value:
                    distinct[column].add(value)

    return tuple(distinct[column] for column in columns)


def write_to_arff(tsv_file_name=clean_tsv_file_name, arff_file_name=ARFF_FILE):
    """Convert the cleaned TSV dataset into an ARFF file.

    The ``@attribute`` header and the ``@data`` rows are generated from one
    shared column list, so each row has exactly one value per declared
    attribute, in the declared order. (Rows used to carry 14 values in a
    different order than the 19 declared attributes.)

    String, nominal and date values are single-quoted; missing or unparsable
    values are written as ``?``.

    Args:
        tsv_file_name: Tab-separated input file with a header row.
        arff_file_name: Path of the ARFF file to (over)write.
    """
    import math  # local import: only this function needs it

    def normalize(raw):
        """Collapse whitespace runs (including newlines) and trim; None -> ''."""
        return ' '.join((raw or '').split())

    def quote(value):
        """Single-quote a value, escaping backslashes and single quotes."""
        escaped = value.replace('\\', '\\\\').replace("'", "\\'")
        return f"'{escaped}'"

    def text_cell(raw):
        """String/nominal cell: quoted value, or '?' when empty."""
        value = normalize(raw)
        return quote(value) if value else '?'

    def numeric_cell(raw):
        """Numeric cell: the value if it parses as a finite number, else '?'."""
        value = normalize(raw)
        try:
            number = float(value)
        except ValueError:
            return '?'
        return value if math.isfinite(number) else '?'

    def date_cell(raw):
        """Date cell: quoted 'yyyy-MM' value, or '?' when it has another shape."""
        value = normalize(raw)
        return quote(value) if re.fullmatch(r'\d{4}-\d{2}', value) else '?'

    def nominal_spec(values):
        """Build '{...}' from the distinct values, normalised like the data cells."""
        cleaned = sorted({normalize(value) for value in values} - {''})
        return '{' + ','.join(quote(value) for value in cleaned) + '}'

    companies, colors, countries, processors, displays = collect_dept_attributes_from_tsv(tsv_file_name)

    # (TSV column / attribute name, ARFF type declaration, cell formatter)
    columns = [
        ('model', 'string', text_cell),
        ('company', nominal_spec(companies), text_cell),
        ('country', nominal_spec(countries), text_cell),
        ('release_date', 'date yyyy-MM', date_cell),
    ]
    columns += [(category, 'numeric', numeric_cell) for category in NUMERIC_CATEGORIES]
    columns += [
        ('main_camera', 'string', text_cell),
        ('color', nominal_spec(colors), text_cell),
        ('display', nominal_spec(displays), text_cell),
        ('processor', nominal_spec(processors), text_cell),
    ]

    with open(arff_file_name, 'w', encoding='utf-8') as arff, \
            open(tsv_file_name, 'r', encoding='utf-8', newline='') as tsv:
        arff.write("@relation gadgets\n\n")
        for name, arff_type, _ in columns:
            arff.write(f"@attribute {name} {arff_type}\n")

        arff.write("\n@data\n")
        for row in csv.DictReader(tsv, delimiter='\t'):
            arff.write(','.join(cell(row.get(name)) for name, _, cell in columns) + '\n')

# write_to_arff()

In [11]:
# write_to_arff('datasets/clean_full_mobiles_dataset.tsv', 'datasets/full_mobiles_dataset.arff')