# Regex

In [1]:
import pandas as pd 
import numpy as np
import re

In [2]:
# Load the scraped dataset and lower-case every column label in a single chained step.
data = pd.read_csv('./dataset/data_web.csv').rename(columns=str.lower)


In [3]:
# ================================== column transformations ======================================

def _snake_case(value, separators=(' ',)):
    """Lower-case a string and replace each separator with '_'.

    Non-string values (e.g. NaN) are returned unchanged so missing data
    passes through instead of raising.
    """
    if not isinstance(value, str):
        return value
    for sep in separators:
        value = value.replace(sep, '_')
    return value.lower()


def _first_group(value, pattern):
    """Return capture group 1 of `pattern` found in `value`.

    Non-string values are returned unchanged; strings that do not match
    yield NaN rather than leaking the raw text into the derived column.
    """
    if not isinstance(value, str):
        return value
    found = re.search(pattern, value)
    return found.group(1) if found else np.nan


# product_id - rows without an id are discarded; .copy() gives an independent
# frame so the column assignments below cannot trigger SettingWithCopyWarning
data = data.dropna(subset=['product_id']).copy()
data['product_id'] = data['product_id'].astype(int)

# product_name - same guarded normalisation as the other text columns,
# so a missing name no longer raises AttributeError
data['product_name'] = data['product_name'].apply(_snake_case)

# product_price - strip the '$' symbol and convert to float
data['product_price'] = data['product_price'].apply(
    lambda price: price.replace('$', '') if isinstance(price, str) else price
).astype(float)

# scrapy_datetime
# NOTE(review): the format ends with a trailing ':' - confirm the scraped
# timestamps really end with a colon, otherwise parsing will fail.
data['scrapy_datetime'] = pd.to_datetime(data['scrapy_datetime'], format='%Y-%m-%d %H:%M:%S:')

# color_name - both spaces and slashes become underscores
data['color_name'] = data['color_name'].apply(_snake_case, separators=(' ', '/'))

# fit
data['fit'] = data['fit'].apply(_snake_case)

# pocket lining - no transformation is applied to this column here

# size
data['size'] = data['size'].apply(_snake_case)

# size_number - the three digits that precede 'cm' in 'size' (NaN when absent)
data['size_number'] = data['size'].apply(_first_group, pattern=r'(\d{3})cm')

# size_model - the '<digits>/<digits>' token in 'size' (NaN when absent)
data['size_model'] = data['size'].apply(_first_group, pattern=r'(\d+/\d+)')



In [4]:
# ========================== Drop duplicates ==================================================
# Keep only the last occurrence of each product/colour/fit snapshot, then
# renumber the rows so the index runs 0..n-1 again.
data = (
    data
    .drop_duplicates(
        subset=[
            'product_id', 'product_name', 'product_category', 'product_price',
            'scrapy_datetime', 'style_id', 'color_id', 'color_name', 'fit',
        ],
        keep='last',
    )
    .reset_index(drop=True)
)



In [5]:
# ========================== break composition by comma =================================

def _pick_material(parts, material):
    """Return, for every row, the first split part that mentions `material`.

    parts    : DataFrame obtained by splitting the composition text on commas.
    material : name to look for; matched case-insensitively.

    Every column of `parts` is scanned, so the material is found whatever
    position it occupies and however many parts the split produced. Rows in
    which no part mentions the material receive the placeholder
    '<material> 0%'. The result shares the index of `parts`.
    """
    picked = pd.Series(np.nan, index=parts.index, dtype=object)
    for col in parts.columns:
        part = parts[col]
        # only fill rows that are still empty, so the first mention wins
        hit = part.str.contains(material, case=False, na=False) & picked.isna()
        picked = picked.mask(hit, part)
    return picked.fillna(f'{material} 0%')


# one column per comma-separated component of the composition text
df1 = data['composition'].str.split(',', expand=True)

# one text column per material, in the order cotton, spandex, Elastomultiester
df_ref = pd.DataFrame({
    'cotton': _pick_material(df1, 'cotton'),
    'spandex': _pick_material(df1, 'spandex'),
    'Elastomultiester': _pick_material(df1, 'Elastomultiester'),
})

# ========= Concat ================
# any material column left over from a previous run of this cell is replaced
# instead of being duplicated
data = pd.concat([data.drop(columns=df_ref.columns, errors='ignore'), df_ref], axis=1)



In [6]:
# =====================  format composition data =====================================

def _percent_to_fraction(label):
    """Convert a composition label such as 'Cotton 98%' into the fraction 0.98.

    Missing values are returned unchanged; a label that contains no digits
    yields NaN instead of raising AttributeError.
    """
    if pd.isnull(label):
        return label
    digits = re.search(r'\d+', label)
    return int(digits.group(0)) / 100 if digits else np.nan


# the same conversion is applied to every material column
for material in ['cotton', 'spandex', 'Elastomultiester']:
    data[material] = data[material].apply(_percent_to_fraction)



In [7]:
# ============ Check =================================
# Remove the raw text columns that have been replaced by derived ones, drop
# rows that became exact duplicates, and show the resulting dimensions.
data = (
    data
    .drop(columns=['pocket lining', 'size', 'composition'])
    .drop_duplicates()
)
data.shape


(256, 14)

In [9]:
# criando um csv
# dados = 'data_webs_treated.csv'
# data.to_csv(dados, index=False)