In [188]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
import bs4 as bs
import nltk
import requests
import re
import certifi

In [189]:
with open('vendors/claro17-2-2022.html') as fp:
    soup = bs.BeautifulSoup(fp, 'lxml')

In [190]:
def find_name(data):
  name_div = data.find('div', class_='product_name')
  name_div = name_div.find_all('a')
  return name_div[0].text

def find_price_div(data):
  price_div = data.find('div', class_='product_price')
  return price_div

def get_price(data):
  price_data = data.find(id=re.compile('^ProductInfoPrice_')) 
  return price_data['value']

In [191]:
df = pd.DataFrame(columns=['name', 'price'])

In [192]:
banner_texts = []
divs = soup.find_all("div", class_='product', id=re.compile('^productContainer_'))
for div in divs:
  name = find_name(div)
  price_div = find_price_div(div)
  price = get_price(price_div)
  df = df.append({'name': name[10:][:-3], 'price': price}, ignore_index=True)
df.head()

Unnamed: 0,name,price
0,Samsung Galaxy S20 FE 128GB 4,2.464.950
1,Samsung Galaxy S20 FE 256GB 4,2.699.950
2,iPhone SE 64GB 4,2.447.960
3,Nokia G10 64GB 4,708.900
4,Xiaomi Redmi Note 10S 4,1.224.900


In [193]:
df.to_csv('claro.csv')

# **Movistar**

In [157]:
with open('movistar.html') as fp:
    soup_movistar = bs.BeautifulSoup(fp, 'html.parser')


FileNotFoundError: [Errno 2] No such file or directory: 'movistar.html'

In [None]:
def find_name_m(data):
  name_div = data.find('h2', class_=re.compile('^text-title'))  
  return name_div.text

def find_price_div_m(data):
  price_div = data.find('span', class_=re.compile('.*price.*'))
  return price_div

def get_price_m(data):
  price = data.find('span', class_=re.compile('.*price.*')).text
  return price

In [None]:
df = pd.DataFrame(columns=['name', 'price'])

In [None]:
divs = soup_movistar.find_all('div', class_='cardcaro')
banner_texts = []
for div in divs:
  name = find_name_m(div)  
  price = get_price_m(div)
  df = df.append({'name': name, 'price': price}, ignore_index=True)
df.head()

Unnamed: 0,name,price
0,TEL GSM ALCATEL 1 16 GB LTE,$199.950
1,TEL GSM NOKIA C1 PLUS LTE,$229.950
2,TEL GSM ALCATEL 1B 2/32 GB LTE,$240.950
3,TEL GSM ALCATEL 1B LTE,$269.950
4,TEL GSM SAMSUNG GALAXY A01 CORE LTE,$279.950
...,...,...
87,TEL GSM SAMSUNG GALAXY A31 + SB LTE,$797.939
88,TEL GSM SAM GALAXY A31 128GB LTE,$797.939
89,TEL GSM HUAWEI Y9A HMS,$1.079.919
90,TEL GSM HUAWEI P40 LITE HMS LTE,$1.099.919


In [None]:
df.to_csv('phone_data_movistar.csv')

# **TIGO**

In [194]:
with open('vendors/tigo17-2-2022.html') as fp:
    soup_movistar = bs.BeautifulSoup(fp, 'html.parser')

In [195]:
def find_name_m(data):
  name_div = data.find('div', class_='item-title')  
  name = name_div.find('h2', class_='text-product')
  return name.text

def get_price_m(data):
  price_div = data.find('h3', class_=re.compile('^best-price'))
  if price_div is not None:
    return price_div.text
  else:
    return None

In [196]:
df = pd.DataFrame(columns=['name', 'price'])

In [197]:
divs = soup_movistar.find_all('div', class_='listItem')
banner_texts = []
for div in divs:
  name = find_name_m(div)  
  price = get_price_m(div)
  if price is not None and name is not None:
    df = df.append({'name': name, 'price': price}, ignore_index=True)
  
df.head()

Unnamed: 0,name,price
0,IPHONE 13 PRO MAX 256GB,$6.399.901
1,IPHONE 13 PRO MAX 128GB,$5.799.901
2,IPHONE 13 PRO 128GB,$5.299.901
3,Samsung Galaxy ZFLIP3 8/256,$4.999.900
4,IPHONE 13 256GB,$4.799.901


In [198]:
df.to_csv('data_tigo.csv')

# **Ktronix**

In [199]:
with open('vendors/ktronix17-2-2022-1.html') as fp1:
    soup_1 = bs.BeautifulSoup(fp1, 'html.parser')
with open('vendors/ktronix17-2-2022-2.html') as fp2:
    soup_2 = bs.BeautifulSoup(fp2, 'html.parser')
with open('vendors/ktronix17-2-2022-3.html') as fp3:
    soup_3 = bs.BeautifulSoup(fp3, 'html.parser')

In [200]:
def find_name_m(data):
  try: 
    name_div = data.find('div', class_='product__information')
    name = name_div.find('a')
  except:
    return None
  return name.text

def get_price_m(data):
  try: 
    price_div = data.find('p', class_='product__price--discounts__price')
  except: 
    return None
  if price_div is not None:
    return price_div.text
  else:
    return None

In [201]:
df = pd.DataFrame(columns=['name', 'price'])

In [202]:
divs = soup_1.find_all('li', class_=re.compile('^product__list--item'))
divs_2 = soup_2.find_all('li', class_=re.compile('^product__list--item'))
banner_texts = []
for div in divs:
  name = find_name_m(div)  
  price = get_price_m(div)
  if price is not None and name is not None:
    df = df.append({'name': name, 'price': price.strip()[:-4]}, ignore_index=True)
  
df.head()

Unnamed: 0,name,price
0,Celular SAMSUNG Galaxy M12 128GB Azul,$619.900
1,Celular MOTOROLA G31 128GB Azul,$799.900
2,Celular XIAOMI REDMI 10 128GB Gris,$796
3,Celular SAMSUNG Galaxy A12 64GB Negro,$599.900
4,Celular XIAOMI REDMI 10 128GB Azul,$796


In [203]:
df.to_csv('ktronix.csv')

***Summary DF, get ram and rom values***

In [315]:
df_claro = pd.read_csv('claro.csv', index_col=0)
df_tigo = pd.read_csv('data_tigo.csv', index_col=0)
df_ktronix =  pd.read_csv('ktronix.csv', index_col=0)
df_claro['vendor'] = 'Claro'
df_tigo['vendor'] = 'Tigo'
df_ktronix['vendor'] = 'Ktronix'
df = df_claro.append(df_tigo, ignore_index=True) \
        .append(df_ktronix, ignore_index=True)
df.head()


Unnamed: 0,name,price,vendor
0,Samsung Galaxy S20 FE 128GB 4,2.464.950,Claro
1,Samsung Galaxy S20 FE 256GB 4,2.699.950,Claro
2,iPhone SE 64GB 4,2.447.960,Claro
3,Nokia G10 64GB 4,708.900,Claro
4,Xiaomi Redmi Note 10S 4,1.224.900,Claro


In [316]:
def delete_quotes(row):
    name = row['name'].replace('"', '')
    return name

In [317]:
df['name'] = df.apply(lambda row: delete_quotes(row), axis=1)

In [318]:
df

Unnamed: 0,name,price,vendor
0,Samsung Galaxy S20 FE 128GB 4,2.464.950,Claro
1,Samsung Galaxy S20 FE 256GB 4,2.699.950,Claro
2,iPhone SE 64GB 4,2.447.960,Claro
3,Nokia G10 64GB 4,708.900,Claro
4,Xiaomi Redmi Note 10S 4,1.224.900,Claro
...,...,...,...
231,iPhone 13 Pro Max 128GB Azul Sierra,$5.849,Ktronix
232,Celular XIAOMI Redmi Note 10S 128GB Gris,$1.099.900,Ktronix
233,Celular SAMSUNG Galaxy S20 FE 256GB Azul,$2.699,Ktronix
234,Celular SAMSUNG M32 128 GB Azul,$1.065.090,Ktronix


In [319]:
def get_memory(row):
    name = row['name']
    memory = re.findall('\d{1,3}(?=GB|Gb|gb| GB| Gb| gb| GB| Gb| gb|/)', name)
    return memory

def get_ram(row):
    memory = get_memory(row)
    for value in memory:
        if int(value)<=12:        
            return value

    return None

def get_rom(row):
    memory = get_memory(row)       
    for value in memory:
        if int(value)>12:        
            return value 

    return None


In [320]:
df['ram'] = df.apply(lambda row: get_ram(row), axis=1)
df['rom'] = df.apply(lambda row: get_rom(row), axis=1)

In [321]:
##re.search('\d{1,3}(?=GB|Gb|gb| GB| Gb| gb|/)', 'Samsung Galaxy S20 FE 128GB 4GB')
re.findall('\d{1,3}(?=GB|Gb|gb| GB| Gb| gb| GB| Gb| gb|/)', 'MOTO E20 2/32 GB')

['2', '32']

In [322]:
brands_dict = {
    'Samsung': 'samsung',
    'iPhone': 'iphone',
    'Xiaomi': 'xiaomi|redmi',
    'Huawei': 'huawei',
    'LG': 'lg',
    'Alcatel': 'alcatel',
    'Motorola': 'motorola|moto',
    'Nokia': 'nokia',
    'Pixel': 'pixel',
    'ZTE': 'zte',
    'Oppo': 'oppo',
    'Honor': 'honor',
    'Vivo': 'vivo',
    'Realme': 'realme'

}

In [323]:
def get_brand(row):
    for k, v in brands_dict.items():
        if re.search(v, row['name'].lower()):
            return k
    return None

In [324]:
df['brand'] = df.apply(lambda row: get_brand(row), axis=1)


In [325]:
s = 'abc'
re.sub(r'a|b|', '', s)

'c'

In [341]:
model_stop_words = '/'
for k,v in brands_dict.items():
    model_stop_words = model_stop_words + '|' + v
for v in ['GB', 'Gb', 'gb']:
    model_stop_words = model_stop_words + '|' + v
model_stop_words = model_stop_words +  '|(?<=con).+|con'

In [342]:
def get_model(row):
    model = re.sub(model_stop_words, '', row['name'].lower()).strip()
    try:
        model = model.replace(row['ram'], '').replace(row['rom'], '')
    except:
        return model
    
    return model

In [343]:
df['model'] = df.apply(lambda row: get_model(row), axis=1)
df.head()

Unnamed: 0,name,price,vendor,ram,rom,brand,model
0,Samsung Galaxy S20 FE 128GB 4,2.464.950,Claro,,128.0,Samsung,galaxy s20 fe 128 4
1,Samsung Galaxy S20 FE 256GB 4,2.699.950,Claro,,256.0,Samsung,galaxy s20 fe 256 4
2,iPhone SE 64GB 4,2.447.960,Claro,,64.0,iPhone,se 64 4
3,Nokia G10 64GB 4,708.900,Claro,,64.0,Nokia,g10 64 4
4,Xiaomi Redmi Note 10S 4,1.224.900,Claro,,,Xiaomi,note 10s 4


In [344]:
df.to_csv('checkpoint.csv')