In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
import bs4 as bs
import nltk
import requests
import re
import certifi

In [2]:
with open('vendors/claro17-2-2022.html') as fp:
    soup = bs.BeautifulSoup(fp, 'lxml')

In [3]:
def find_name(data):
  name_div = data.find('div', class_='product_name')
  name_div = name_div.find_all('a')
  return name_div[0].text

def find_price_div(data):
  price_div = data.find('div', class_='product_price')
  return price_div

def get_price(data):
  price_data = data.find(id=re.compile('^ProductInfoPrice_')) 
  return price_data['value']

In [4]:
df = pd.DataFrame(columns=['name', 'price'])

In [5]:
banner_texts = []
divs = soup.find_all("div", class_='product', id=re.compile('^productContainer_'))
for div in divs:
  name = find_name(div)
  price_div = find_price_div(div)
  price = get_price(price_div)
  df = df.append({'name': name[10:][:-3], 'price': price}, ignore_index=True)
df.head()

Unnamed: 0,name,price
0,Samsung Galaxy S20 FE 128GB 4,2.464.950
1,Samsung Galaxy S20 FE 256GB 4,2.699.950
2,iPhone SE 64GB 4,2.447.960
3,Nokia G10 64GB 4,708.900
4,Xiaomi Redmi Note 10S 4,1.224.900


In [6]:
df.to_csv('claro.csv')

# **Movistar**

In [7]:
with open('vendors/movistar18-2-2022.html') as fp:
    soup_movistar = bs.BeautifulSoup(fp, 'html.parser')


In [8]:
def find_name_m(data):
  name_div = data.find('p', class_=re.compile('.*title.*'))  
  return name_div.text

def find_price_div_m(data):
  price_div = data.find('p', class_=re.compile('.*price.*'))
  return price_div

def get_price_m(data):
  price = data.find('p', class_=re.compile('.*price.*')).text
  return price

In [9]:
df = pd.DataFrame(columns=['name', 'price'])

In [10]:
divs = soup_movistar.find_all('div', class_='c-phone')
banner_texts = []

In [11]:
for div in divs:
  smallers_divs = div.find_all('div', class_='c-phone__box')    
  for smarller_div in smallers_divs:    
    name = find_name_m(smarller_div)  
    price = get_price_m(smarller_div)
    df = df.append({'name': name, 'price': price}, ignore_index=True)
df.head(16)

Unnamed: 0,name,price
0,iPhone 13 mini128 GB,$3.729.929
1,iPhone 13 mini256 GB,$4.259.929
2,iPhone 13 mini512 GB,$5.329.929
3,iPhone 13128 GB,$4.265.929
4,iPhone 13256 GB,$4.799.929
5,iPhone 13512 GB,$5.855.929
6,iPhone 13 Pro128 GB,$5.339.929
7,iPhone 13 Pro256 GB,$5.869.929
8,iPhone 13 Pro512 GB,$6.939.929
9,iPhone 13 Pro1 TB,$7.999.929


In [12]:
df.to_csv('movistar.csv')

# **TIGO**

In [13]:
with open('vendors/tigo17-2-2022.html') as fp:
    soup_movistar = bs.BeautifulSoup(fp, 'html.parser')

In [14]:
def find_name_m(data):
  name_div = data.find('div', class_='item-title')  
  name = name_div.find('h2', class_='text-product')
  return name.text

def get_price_m(data):
  price_div = data.find('h3', class_=re.compile('^best-price'))
  if price_div is not None:
    return price_div.text
  else:
    return None

In [15]:
df = pd.DataFrame(columns=['name', 'price'])

In [16]:
divs = soup_movistar.find_all('div', class_='listItem')
banner_texts = []
for div in divs:
  name = find_name_m(div)  
  price = get_price_m(div)
  if price is not None and name is not None:
    df = df.append({'name': name, 'price': price}, ignore_index=True)
  
df.head()

Unnamed: 0,name,price
0,IPHONE 13 PRO MAX 256GB,$6.399.901
1,IPHONE 13 PRO MAX 128GB,$5.799.901
2,IPHONE 13 PRO 128GB,$5.299.901
3,Samsung Galaxy ZFLIP3 8/256,$4.999.900
4,IPHONE 13 256GB,$4.799.901


In [17]:
df.to_csv('data_tigo.csv')

# **Ktronix**

In [18]:
with open('vendors/ktronix17-2-2022-1.html') as fp1:
    soup_1 = bs.BeautifulSoup(fp1, 'html.parser')
with open('vendors/ktronix17-2-2022-2.html') as fp2:
    soup_2 = bs.BeautifulSoup(fp2, 'html.parser')
with open('vendors/ktronix17-2-2022-3.html') as fp3:
    soup_3 = bs.BeautifulSoup(fp3, 'html.parser')

In [19]:
def find_name_m(data):
  try: 
    name_div = data.find('div', class_='product__information')
    name = name_div.find('a')
  except:
    return None
  return name.text

def get_price_m(data):
  try: 
    price_div = data.find('p', class_='product__price--discounts__price')
  except: 
    return None
  if price_div is not None:
    return price_div.text
  else:
    return None

In [20]:
df = pd.DataFrame(columns=['name', 'price'])

In [21]:
divs = soup_1.find_all('li', class_=re.compile('^product__list--item'))
divs_2 = soup_2.find_all('li', class_=re.compile('^product__list--item'))
banner_texts = []
for div in divs:
  name = find_name_m(div)  
  price = get_price_m(div)
  if price is not None and name is not None:
    df = df.append({'name': name, 'price': price.strip()[:-4]}, ignore_index=True)
  
df.head()

Unnamed: 0,name,price
0,Celular SAMSUNG Galaxy M12 128GB Azul,$619.900
1,Celular MOTOROLA G31 128GB Azul,$799.900
2,Celular XIAOMI REDMI 10 128GB Gris,$796
3,Celular SAMSUNG Galaxy A12 64GB Negro,$599.900
4,Celular XIAOMI REDMI 10 128GB Azul,$796


In [22]:
df.to_csv('ktronix.csv')

***Summary DF, get ram and rom values***

In [55]:
df_claro = pd.read_csv('claro.csv', index_col=0)
df_tigo = pd.read_csv('data_tigo.csv', index_col=0)
df_ktronix =  pd.read_csv('ktronix.csv', index_col=0)
df_movistar = pd.read_csv('movistar.csv', index_col=0)
df_movistar['vendor'] = 'Movistar'
df_claro['vendor'] = 'Claro'
df_tigo['vendor'] = 'Tigo'
df_ktronix['vendor'] = 'Ktronix'
df = df_claro.append(df_tigo, ignore_index=True) \
        .append(df_ktronix, ignore_index=True) \
        .append(df_movistar, ignore_index=True)
df.head()


Unnamed: 0,name,price,vendor
0,Samsung Galaxy S20 FE 128GB 4,2.464.950,Claro
1,Samsung Galaxy S20 FE 256GB 4,2.699.950,Claro
2,iPhone SE 64GB 4,2.447.960,Claro
3,Nokia G10 64GB 4,708.900,Claro
4,Xiaomi Redmi Note 10S 4,1.224.900,Claro


In [56]:
def delete_quotes(row):
    name = row['name'].replace('"', '')
    return name

In [57]:
df['name'] = df.apply(lambda row: delete_quotes(row), axis=1)

In [58]:
df

Unnamed: 0,name,price,vendor
0,Samsung Galaxy S20 FE 128GB 4,2.464.950,Claro
1,Samsung Galaxy S20 FE 256GB 4,2.699.950,Claro
2,iPhone SE 64GB 4,2.447.960,Claro
3,Nokia G10 64GB 4,708.900,Claro
4,Xiaomi Redmi Note 10S 4,1.224.900,Claro
...,...,...,...
392,TCL 20SE128 GB,$709.849,Movistar
393,Realme 7i,$538.950$769.859,Movistar
394,OPPO A16,$699.839,Movistar
395,OPPO RENO 6 LITE,$1.699.839,Movistar


In [59]:
value = 1
possible_memory_values = []
while value < 1025:    
    possible_memory_values.append(value)
    value = value*2

In [60]:
possible_memory_values

[1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]

In [61]:
re.findall('\d{1,4}(?=GB|Gb|gb| GB| Gb| gb| GB| Gb| gb|/|TB|Tb|tb| TB| Tb| tb)', 'iPhone 13 Pro Max1 TB')

['1']

In [62]:
re.search('tb', 'iPhone 13 Pro Max1 TB'.lower()).group(0)

'tb'

In [63]:
def get_memory(row):
    name = row['name']
    memory = re.findall('\d{1,4}(?=GB|Gb|gb| GB| Gb| gb| GB| Gb| gb|/|TB|Tb|tb| TB| Tb| tb)', name)
    return memory

def get_ram(row):
    memory = get_memory(row)    
    for value in memory:
        if int(value)<=12 and (re.search('gb', row['name'].lower()) is not None):        
            return value.lstrip("0")

    return None

def get_rom(row):
    memory = get_memory(row)  
    for value in memory:
        value = value.lstrip("0")
        if int(value)>12 or (re.search('tb', row['name'].lower()) is not None):            
            if int(value) in possible_memory_values:
                return value.lstrip("0")
            else:
                return value[1:].lstrip("0")

    return None


In [64]:
#df.head().apply(lambda row: get_rom(row), axis=1)

In [65]:
df['ram'] = df.apply(lambda row: get_ram(row), axis=1)
df['rom'] = df.apply(lambda row: get_rom(row), axis=1)

In [66]:
##re.search('\d{1,3}(?=GB|Gb|gb| GB| Gb| gb|/)', 'Samsung Galaxy S20 FE 128GB 4GB')
re.findall('\d{1,3}(?=GB|Gb|gb| GB| Gb| gb| GB| Gb| gb|/)', 'MOTO E20 2/32 GB')

['2', '32']

In [67]:
brands_dict = {
    'Samsung': 'samsung',
    'iPhone': 'iphone',
    'Xiaomi': 'xiaomi|redmi',
    'Huawei': 'huawei',
    'LG': 'lg',
    'Alcatel': 'alcatel',
    'Motorola': 'motorola|moto',
    'Nokia': 'nokia',
    'Pixel': 'pixel',
    'ZTE': 'zte',
    'Oppo': 'oppo',
    'Honor': 'honor',
    'Vivo': 'vivo',
    'Realme': 'realme'

}

In [68]:
def get_brand(row):
    for k, v in brands_dict.items():
        if re.search(v, row['name'].lower()):
            return k
    return None

In [69]:
df['brand'] = df.apply(lambda row: get_brand(row), axis=1)


In [70]:
s = 'abc'
re.sub(r'a|b|', '', s)

'c'

In [71]:
model_stop_words = '/'
for k,v in brands_dict.items():
    model_stop_words = model_stop_words + '|' + v
for v in ['GB', 'Gb', 'gb']:
    model_stop_words = model_stop_words + '|' + v
model_stop_words = model_stop_words +  '|(?<=con).+|con'

In [72]:
def get_model(row):
    model = re.sub(model_stop_words, '', row['name'].lower()).strip()
    try:
        model = model.replace('celular', '')
    except:
        pass
    try:
        model = model.replace(row['ram'], '').replace(row['rom'], '')
        model = re.sub(' +', ' ', model)
    except:
        try:
            model = model.replace(row['rom'], '')
            model = re.sub(' +', ' ', model)
        except:
            model = re.sub(' +', ' ', model)
            return model

    
    return model

In [73]:
df['model'] = df.apply(lambda row: get_model(row), axis=1)
df.head()

Unnamed: 0,name,price,vendor,ram,rom,brand,model
0,Samsung Galaxy S20 FE 128GB 4,2.464.950,Claro,,128.0,Samsung,galaxy s20 fe 4
1,Samsung Galaxy S20 FE 256GB 4,2.699.950,Claro,,256.0,Samsung,galaxy s20 fe 4
2,iPhone SE 64GB 4,2.447.960,Claro,,64.0,iPhone,se 4
3,Nokia G10 64GB 4,708.900,Claro,,64.0,Nokia,g10 4
4,Xiaomi Redmi Note 10S 4,1.224.900,Claro,,,Xiaomi,note 10s 4


In [74]:
df.to_csv('checkpoint.csv')

In [75]:
a = '$34$35$'
a.strip('$')

'34$35'

In [76]:
def check_price(row):
    row['price'] = row['price'].strip('$')
    prices = row['price'].split('$')
    row['price'] = prices[1] if len(prices) > 1 and prices[1] < prices[0] else prices[0]
    row['price'] = row['price'].replace('.', '')
    if len(row['price'])<5:
        row['price'] = row['price'] + '000'

    return row['price']

In [77]:
df['price'] = df.apply(lambda row: check_price(row), axis=1)

In [78]:
df['vendor'] = df['vendor'].map({'Claro':1, 'Movistar':2, 'Tigo':3, 'Ktronix':4})
df['brand'] = df['brand'].map({'Samsung':1, 'iPhone':2, 'Xiaomi':3, 'Huawei':4, 'Lg':5, 'Alcatel':6, 'Motorola':7, 'Nokia':8, 'Pixel':9, 'ZTE':10, 'Oppo':11, 'Honor':12, 'Vivo':13, 'Realme':14})


In [79]:
def get_network(name):
    if re.search(' 5| 5g|5g$', name):
        return '5g'
    elif re.search(' 4| 4g|4g$', name):
        return '4g'
    else:
        return None
        

In [80]:
df['name'] =  df['name'].str.strip()

In [81]:
df['network'] = df['name'].apply(lambda name: get_network(name))

In [82]:
df['model'] = df['model'].apply(lambda name: re.sub(' 4| 4g|4g 5| 5g|5g$', '', name))

In [83]:
df['currency'] = 'COP'

In [84]:
df.drop(columns=['name']).to_csv('checkpoint_3.csv')