### Libraries import

In [1]:
import pandas as pd # Dataframes management
from zipfile import ZipFile  # Files compressed management
import os # Files management along OS
import re


In [2]:
from selenium import webdriver # Webscrapping bot
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time

In [3]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

### Data import

In [4]:
# Specifying the name of the zip file
fileZIP = "/items_ordered_2years_V2.zip"
fileCSV = "/items_ordered_2years_V2.csv"

path = "../Inputs/Modificados - Atmira_Pharma_Visualization"
  
# Open the zip file in read mode
with ZipFile(f"{path}{fileZIP}", 'r') as zip: 
    # List all the contents of the zip file
    zip.printdir() 
  
    # Extract all files
    print('extraction...') 
    zip.extractall(path) 
    print('Done!')

#Import CSV to pandas
itemsOrdered = pd.read_csv(f"{path}{fileCSV}")
print("CSV imported to Pandas successfully")

# Remove uncompressed CSV file
os.remove(f"{path}{fileCSV}")
print("Original CSV removed to preserve repo health")

File Name                                             Modified             Size
items_ordered_2years_V2.csv                    2022-02-23 01:28:04    150372498
extraction...
Done!
CSV imported to Pandas successfully
Original CSV removed to preserve repo health


In [5]:
itemsOrdered.head(3)

Unnamed: 0,num_order,item_id,created_at,product_id,qty_ordered,base_cost,price,discount_percent,customer_id,city,zipcode
0,562ba4ba9aaf2a1e926842deca19271d,2bca87c2ab10b07feef290a9be47f316,2017-01-01 00:20:33,6409.0,1.0,13.4204,17.17,8.0,50d7f99947b472cc889d58845b9d23e2,Valencia,46019
1,562ba4ba9aaf2a1e926842deca19271d,53e12ac4b579f0c4b1c4f77d50bf8792,2017-01-01 00:20:33,5133.0,1.0,18.5351,22.68,8.0,50d7f99947b472cc889d58845b9d23e2,Valencia,46019
2,562ba4ba9aaf2a1e926842deca19271d,d19d6df5cd6a1e741408b70aa2767bf4,2017-01-01 00:20:33,5125.0,1.0,18.526,22.68,8.0,50d7f99947b472cc889d58845b9d23e2,Valencia,46019


### USO DE WEBSCRAPPING PARA CÓDIGO POSTAL

In [6]:
def AcentosLimpiador(text):
	acentos = {'á': 'a', 'é': 'e', 'í': 'i', 'ó': 'o', 'ú': 'u', 'Á': 'A', 'E': 'E', 'Í': 'I', 'Ó': 'O', 'Ú': 'U'}
	for ele in acentos:
		if ele in text:
			text = text.replace(ele, acentos[ele])
	return text

In [7]:
def CityCleaner(text):
    stopWordSpanish = set(stopwords.words('spanish'))
    wordTokens = word_tokenize(AcentosLimpiador(text.lower()).rstrip()) 
    filteredSentence = [element for element in wordTokens if not element in stopWordSpanish] 
    return filteredSentence

In [8]:
#Limpieza de zipcodes con RegEx
def num_guion(string):
    """ Get a string with the numbers and hyphens of another string
    
    Args:
        df: string used to extract the string with numbers abd hyphens

    Returns:
        df: the string with numbers and hyphens
    """
    aux = re.match("([\d-]+)", str(string))
    try:
        return str(aux.group())
    except:
        return string


itemsOrdered["zipcode"] = itemsOrdered["zipcode"].apply(lambda x: num_guion(x))

----------------------

In [51]:
city = "Aravaca-Madrid"
zipToAnalyze = "28023"

---------------------

In [26]:
driver = webdriver.Firefox()
    
driver.get("https://worldpostalcode.com/lookup")
driver.set_page_load_timeout(5)

insertZipcode =driver.find_element(By.ID,"search")
insertZipcode.clear()
insertZipcode.send_keys(zipToAnalyze)
clickButtonZipcode =driver.find_element(By.CLASS_NAME,"submit")
clickButtonZipcode.click()
driver.set_page_load_timeout(5)

getGeoInfo = driver.find_element(By.CLASS_NAME,"search_units")
saveGeoInfo = getGeoInfo.text

driver.close()

In [27]:
listTestingZipcode = saveGeoInfo.split("\n")
listTestingZipcode

["AlgeriaM'silaAin Laaleg28023",
 "AlgeriaM'silaSaiat28023",
 "AlgeriaM'silaLoug28023",
 "AlgeriaM'silaEl Bayadh28023",
 "AlgeriaM'silaAin Fares28023",
 "AlgeriaM'silaBir Souid28023",
 'SpainMadridMadrid28023',
 'FranceCentre-Val De Loire, Eure-Et-LoirChartres28023 CEDEX',
 'MexicoColimaJardines De Guadalupe28023',
 'UkraineKirovohradska, OleksandriiskyiPantañ\x97Vka28023',
 'United StatesNorth Carolina, RowanChina Grove28023']

In [28]:
indexMatchRegex = list(map(lambda x: [(m.start(0), m.end(0)) for m in re.finditer(r"[a-z][A-Z0-9]", x)], listTestingZipcode))

In [29]:
len(indexMatchRegex[1])

3

In [30]:
resultScrapClean = []
for pas,resultScrap in enumerate(listTestingZipcode):
    for pos,ele in enumerate(indexMatchRegex[pas]):
        if len(indexMatchRegex[pas])==2:
            if pos ==0:
                txt = resultScrap[:ele[0]+1]+","+resultScrap[ele[1]-1:]
            elif pos ==1:
                txt = txt[:ele[0]+2]+","+txt[ele[1]:]
                resultScrapClean.append(txt)
        elif len(indexMatchRegex[pas])==3:
            if pos ==0:
                txt = resultScrap[:ele[0]+1]+","+resultScrap[ele[1]-1:]
            elif pos ==1:
                txt = txt[:ele[0]+2]+","+txt[ele[1]:]
            elif pos ==2:
                txt = txt[:ele[0]+3]+","+txt[ele[1]+1:]
                resultScrapClean.append(txt)



In [31]:
resultScrapClean

["Algeria,M'sila,Ain Laaleg,28023",
 "Algeria,M'sila,Saiat,28023",
 "Algeria,M'sila,Loug,28023",
 "Algeria,M'sila,El Bayadh,28023",
 "Algeria,M'sila,Ain Fares,28023",
 "Algeria,M'sila,Bir Souid,28023",
 'Spain,Madrid,Madrid,28023',
 'France,Centre-Val De Loire, Eure-Et-Loir,Chartres,28023 CEDEX',
 'Mexico,Colima,Jardines De Guadalupe,28023',
 'Ukraine,Kirovohradska, Oleksandriiskyi,Pantañ\x97Vka,28023',
 'United States,North Carolina, Rowan,China Grove,28023']

In [32]:
resultScrapListed = []
for element in resultScrapClean:
    resultScrapListed.append(element.split(","))

In [33]:
resultScrapListed

[['Algeria', "M'sila", 'Ain Laaleg', '28023'],
 ['Algeria', "M'sila", 'Saiat', '28023'],
 ['Algeria', "M'sila", 'Loug', '28023'],
 ['Algeria', "M'sila", 'El Bayadh', '28023'],
 ['Algeria', "M'sila", 'Ain Fares', '28023'],
 ['Algeria', "M'sila", 'Bir Souid', '28023'],
 ['Spain', 'Madrid', 'Madrid', '28023'],
 ['France', 'Centre-Val De Loire', ' Eure-Et-Loir', 'Chartres', '28023 CEDEX'],
 ['Mexico', 'Colima', 'Jardines De Guadalupe', '28023'],
 ['Ukraine', 'Kirovohradska', ' Oleksandriiskyi', 'Pantañ\x97Vka', '28023'],
 ['United States', 'North Carolina', ' Rowan', 'China Grove', '28023']]

In [34]:
resultScrapRearr = [(element[0], element[1], element[-2], element[-1]) for element in resultScrapListed]

In [35]:
resultScrapRearr

[('Algeria', "M'sila", 'Ain Laaleg', '28023'),
 ('Algeria', "M'sila", 'Saiat', '28023'),
 ('Algeria', "M'sila", 'Loug', '28023'),
 ('Algeria', "M'sila", 'El Bayadh', '28023'),
 ('Algeria', "M'sila", 'Ain Fares', '28023'),
 ('Algeria', "M'sila", 'Bir Souid', '28023'),
 ('Spain', 'Madrid', 'Madrid', '28023'),
 ('France', 'Centre-Val De Loire', 'Chartres', '28023 CEDEX'),
 ('Mexico', 'Colima', 'Jardines De Guadalupe', '28023'),
 ('Ukraine', 'Kirovohradska', 'Pantañ\x97Vka', '28023'),
 ('United States', 'North Carolina', 'China Grove', '28023')]

In [37]:
resultZip = []
for element in resultScrapRearr:
    for element2  in CityCleaner(element[2]):
        if element2 in CityCleaner(cityToAnalyze):
            resultZip.append(element)
            break

resultZip = resultZip[0]  
    

IndexError: list index out of range

In [38]:
resultZip

[]

In [46]:
resultZip = []
for element in resultScrapRearr:
    for element2 in CityCleaner(element[2]):
        for element3 in CityCleaner(cityToAnalyze):
            print(element2,element3)
            if element2.__contains__(element3):
                resultZip.append(element)
                break
resultZip = resultZip

ain aravaca-madrid
laaleg aravaca-madrid
saiat aravaca-madrid
loug aravaca-madrid
bayadh aravaca-madrid
ain aravaca-madrid
fares aravaca-madrid
bir aravaca-madrid
souid aravaca-madrid
madrid aravaca-madrid
chartres aravaca-madrid
jardines aravaca-madrid
guadalupe aravaca-madrid
pantañvka aravaca-madrid
china aravaca-madrid
grove aravaca-madrid


In [41]:
resultZip

[]

In [49]:
resultZip = []
for element in resultScrapRearr:
    for element2 in CityCleaner(element[2]):
        for cityToAnalyze in CityCleaner(element3):
            print(element3, element2)
            if element3.__contains__(element2):
                resultZip.append(element)
                break
resultZip = resultZip

aravaca-madrid ain
aravaca-madrid laaleg
aravaca-madrid saiat
aravaca-madrid loug
aravaca-madrid bayadh
aravaca-madrid ain
aravaca-madrid fares
aravaca-madrid bir
aravaca-madrid souid
aravaca-madrid madrid
aravaca-madrid chartres
aravaca-madrid jardines
aravaca-madrid guadalupe
aravaca-madrid pantañvka
aravaca-madrid china
aravaca-madrid grove


In [59]:
resultZip = []
for element in resultScrapRearr:
    for element2  in CityCleaner(element[2]):
        if element2 in CityCleaner(city):
            resultZip.append(element)
            break
try:
    resultZip = resultZip[0]
except IndexError:
    resultZip = []
    for element in resultScrapRearr:
        for element2 in CityCleaner(element[2]):
            for element3 in CityCleaner(city):
                if element2 .__contains__(element3):
                    resultZip.append(element)
                    break
    try:
        resultZip = resultZip[0]
    except IndexError:
        resultZip = []
        for element in resultScrapRearr:
            for element2 in CityCleaner(element[2]):
                for city in CityCleaner(element3):
                    if element3.__contains__(element2):
                        resultZip.append(element)
                        break
        resultZip = resultZip[0]  

resultZip

('Spain', 'Madrid', 'Madrid', '28023')