In [1]:
import datetime
import math
import os

import cufflinks as cf
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.offline
# import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

from common.data_utils.df_utils import sql_list_to_tuple
from common.logger import logger
from db_tools.data_manager import sql_to_pandas, sql_to_pandas_big
from db_tools.object_repository import ObjectRepository
from db_tools.request_data import RequestData
from python_engine.data.category import CategoryRepository
from python_engine.data.product import ProductRepository
from python_engine.data.sales import SalesRepository
from python_engine.data.stocks import StocksRepository
from python_engine.data.store import StoreRepository
from python_engine.modules.common.dataframes.cartesian_merge import merge_cartesian
from python_engine.modules.features.product.repository import ProductFeaturesRepository
from python_engine.modules.features.transformations.apply_rolling_mean import apply_rolling_mean
from python_engine.modules.features.transformations.fill_dates import fill_dates
from python_engine.pipelines.pipeline_repository import PipelineRepository, BUCKET

cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

In [2]:
pip install spacy

Collecting spacy
  Downloading spacy-3.1.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.4 MB)
[K     |████████████████████████████████| 6.4 MB 29.3 MB/s eta 0:00:01
[?25hCollecting typer<0.4.0,>=0.3.0
  Downloading typer-0.3.2-py3-none-any.whl (21 kB)
Collecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.5-cp36-cp36m-manylinux2014_x86_64.whl (20 kB)
Collecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4
  Downloading pydantic-1.8.2-cp36-cp36m-manylinux2014_x86_64.whl (10.2 MB)
[K     |████████████████████████████████| 10.2 MB 57.7 MB/s eta 0:00:01
Collecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.5-cp36-cp36m-manylinux2014_x86_64.whl (35 kB)
Collecting tqdm<5.0.0,>=4.38.0
  Downloading tqdm-4.62.0-py2.py3-none-any.whl (76 kB)
[K     |████████████████████████████████| 76 kB 12.0 MB/s eta 0:00:01
[?25hCollecting blis<0.8.0,>=0.4.0
  Downloading blis-0.7.4-cp36-cp36m-manylinux2014_x86_64.whl (9.8 MB)
[K     |████████████████████████████████| 9.8 MB 52.7 MB/

In [3]:
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
from spacy.lang.en.stop_words import STOP_WORDS as en_stop

In [4]:
# Target client / environment used to open the database connections.
client, environment = 'pimkie', 'sandbox'
request_data = RequestData(client=client, environment=environment)

# Show both connection objects as the cell output (Aurora first, Redshift second).
(request_data.aurora_con, request_data.redshift_con)

(<sqlalchemy.engine.base.Connection at 0x7f48df5105f8>,
 <sqlalchemy.engine.base.Connection at 0x7f48df176d68>)

In [5]:
# Fetch the distinct switch_product_id values (exposed as `product_id`) of all products
# with family_id = 8, ordered by id.
# The `domain` CTE filters on WHERE TRUE, i.e. it keeps every distinct switch_product_id;
# the family filter is only applied after joining back to `products`.
# NOTE(review): the f-string has no placeholders, so the `f` prefix has no effect.
sql = f'''
        WITH domain as (
        SELECT DISTINCT products.switch_product_id AS switch_product_id
        FROM products
        WHERE TRUE
        )
        SELECT DISTINCT switch_product_id as product_id
        FROM domain
        JOIN products USING(switch_product_id)
        WHERE products.family_id = 8
        ORDER BY product_id
        '''

# Run the query on the Redshift connection; `product_id` is a DataFrame with one
# `product_id` column (see the preview below).
product_id = sql_to_pandas_big(sql, request_data.redshift_con)
product_id.head()

Unnamed: 0,product_id
0,2136
1,2153
2,2158
3,2200
4,2209


In [6]:
# Feature-definition ids of interest, labelled as in the author's original notes.
# Ids that appeared in earlier versions of this list but are no longer selected:
#   24518 = gender, 28302 = collar piece, 24529 = neckline shape, 24528 = neckline depth
# NOTE: this tuple is currently not used -- the feature_definition_id filter in the
# feature query is commented out.
feature_definition_id = (
    24531,  # top silhouette
    28304,  # shoulders
    28299,  # pattern type
    24521,  # embellishments
    24517,  # color
    24520,  # color fallback
    24522,  # pattern types
    24519,  # category type (id/name pair as shown in the feature query output)
)

In [7]:
# Build the SQL IN (...) list explicitly instead of interpolating a Python tuple:
# a one-element tuple renders as "(2136,)" and an empty one as "()", both invalid SQL,
# and numpy scalars may render with their type name on recent numpy versions.
product_ids = [int(pid) for pid in product_id['product_id'].dropna().unique()]
# With no ids, "NULL" keeps the statement valid and makes the IN clause match no row.
product_id_sql = ', '.join(str(pid) for pid in product_ids) or 'NULL'

# One row per (product, text feature): name, style, season, full price and the
# feature value / feature name. Left joins keep products without any feature row.
sql = f'''
        select products.id as product_id, products.name, products.style, feature_item_value_text.feature_definition_id,
        seasons.name as season, products.full_price as price,
        feature_item_value_text.value, feature_definition.name as value_name
        from products
        left join feature_item_value_text on products.id = feature_item_value_text.item_id
        left join feature_definition on feature_definition.id = feature_item_value_text.feature_definition_id
        left join seasons ON seasons.id = products.season_id
        where products.family_id = 8 and products.id in ({product_id_sql})
        order by products.id
        '''
# Optional extra filter, currently disabled:
#feature_item_value_text.feature_definition_id in {feature_definition_id}
names_data = sql_to_pandas_big(sql, request_data.aurora_con)
names_data.head()

Unnamed: 0,product_id,name,style,feature_definition_id,season,price,value,value_name
0,2134,T-shirt à message,T-Shirt standard 45-65cm-Manches courtes-Col r...,24518.0,2018S,4.0,female,gender
1,2134,T-shirt à message,T-Shirt standard 45-65cm-Manches courtes-Col r...,24521.0,2018S,4.0,fabric embellishment,embellishments
2,2134,T-shirt à message,T-Shirt standard 45-65cm-Manches courtes-Col r...,24531.0,2018S,4.0,regular,tops silhouette
3,2134,T-shirt à message,T-Shirt standard 45-65cm-Manches courtes-Col r...,24529.0,2018S,4.0,round,neckline shape
4,2134,T-shirt à message,T-Shirt standard 45-65cm-Manches courtes-Col r...,24519.0,2018S,4.0,top,category type


In [8]:
# Products that have a "category type" feature row whose value is "top".
is_top_row = (names_data['value'] == 'top') & (names_data['value_name'] == 'category type')
top_product_ids = names_data.loc[is_top_row, 'product_id'].unique()
# Blank the feature values of every product that is not in that set.
names_data.loc[~names_data['product_id'].isin(top_product_ids), 'value'] = np.nan

# Round the price to a whole number.
names_data['price'] = names_data['price'].round()

# Season codes look like "2018S" / "2018W": drop the year and spell out the season.
SEASON_LABELS = {'S': 'summer', 'W': 'winter'}


def _season_label(code):
    """Map a season code to its label, keeping only the trailing season letter.

    Non-string values (missing seasons) and empty strings are returned unchanged.
    Values that are already a final label are returned unchanged as well, so the
    cell can be re-run without turning 'summer' into 'r'. Trailing letters other
    than 'S' / 'W' are kept as the bare letter.
    """
    if not isinstance(code, str) or not code or code in SEASON_LABELS.values():
        return code
    return SEASON_LABELS.get(code[-1], code[-1])


# Assign the result back to the column: a chained `.replace(..., inplace=True)` on a
# column selection is not guaranteed to write through to the frame under copy-on-write.
names_data['season'] = [_season_label(code) for code in names_data['season']]
names_data.head()

Unnamed: 0,product_id,name,style,feature_definition_id,season,price,value,value_name
0,2134,T-shirt à message,T-Shirt standard 45-65cm-Manches courtes-Col r...,24518.0,summer,4.0,female,gender
1,2134,T-shirt à message,T-Shirt standard 45-65cm-Manches courtes-Col r...,24521.0,summer,4.0,fabric embellishment,embellishments
2,2134,T-shirt à message,T-Shirt standard 45-65cm-Manches courtes-Col r...,24531.0,summer,4.0,regular,tops silhouette
3,2134,T-shirt à message,T-Shirt standard 45-65cm-Manches courtes-Col r...,24529.0,summer,4.0,round,neckline shape
4,2134,T-shirt à message,T-Shirt standard 45-65cm-Manches courtes-Col r...,24519.0,summer,4.0,top,category type


In [9]:
# Show every feature name present in the data.
display(names_data['value_name'].unique())

# Feature names that are kept; the value of any other feature is blanked out.
wideeyes_features = [
    'embellishments',
    'tops silhouette',
    'shoulders',
    'pattern type fallback',
    'pattern type',
    'category type',
]
is_other_feature = ~names_data['value_name'].isin(wideeyes_features)
names_data.loc[is_other_feature, 'value'] = np.nan

array(['gender', 'embellishments', 'tops silhouette', 'neckline shape',
       'category type', 'sleeve length', 'shoulders', 'top type',
       'pattern type fallback', 'collar piece', 'neckline depth', 'color',
       'color fallback', 'pattern type', 'pants silhouette',
       'bottom type', None, 'knitwear/sweatshirt type', 'jewellery type',
       'one-piece type', 'dress/skirt length', 'dress/skirt silhouette',
       'jacket type', 'pants length', 'shorts type', 'underwear type',
       'accessory type', 'coat type', 'heel height', 'heel style',
       'toe shape', 'shoe shaft height', 'shoe type', 'vest type',
       'swimwear type', 'headwear type', 'shoe closure'], dtype=object)

In [10]:
# Rows carrying the "category type" = "top" feature; their product ids are reused
# later to select the matching rows of the final feature table.
has_top_value = names_data['value'] == 'top'
is_category_type = names_data['value_name'] == 'category type'
reliable_wideeyes = names_data[has_top_value & is_category_type]

display(reliable_wideeyes.head())
display(reliable_wideeyes.shape)

reliable_wideeyes_id = reliable_wideeyes['product_id'].tolist()

Unnamed: 0,product_id,name,style,feature_definition_id,season,price,value,value_name
4,2134,T-shirt à message,T-Shirt standard 45-65cm-Manches courtes-Col r...,24519.0,summer,4.0,top,category type
14,2135,T-shirt brodé,T-Shirt standard 45-65cm-Manches courtes-Col r...,24519.0,summer,4.0,top,category type
25,2136,T-shirt brodé,T-Shirt standard 45-65cm-Manches courtes-Col r...,24519.0,summer,4.0,top,category type
40,2138,T-shirt imprimé,T-Shirt standard 45-65cm-Manches courtes-Col r...,24519.0,summer,4.0,top,category type
56,2144,T-shirt bijoux,T-Shirt standard 45-65cm-Manches courtes-Col r...,24519.0,summer,6.0,top,category type


(853, 8)

In [11]:
# Distinct (rounded) prices present in the data.
display(names_data['price'].unique())

array([ 4.,  6.,  8., 18.,  5.,  9.,  7.,  2., 13., 16.,  3., 12., 10.,
       15., 20., 23., 26., 36., 25.])

In [12]:
# Reshape from one row per (product, feature) to one row per product with one
# column per feature name. Note: unstack() raises if the same product/feature
# combination occurs more than once.
product_keys = ['product_id', 'name', 'style', 'season', 'price', 'value_name']
names_data_unstack = names_data.set_index(product_keys)['value'].unstack().reset_index()

# Rows without a feature name end up in a column whose label is null; drop it.
has_label = names_data_unstack.columns.notnull()
names_data_unstack = names_data_unstack.loc[:, has_label]

display(names_data_unstack.head())
display(names_data_unstack.shape)

value_name,product_id,name,style,season,price,accessory type,bottom type,category type,coat type,collar piece,...,shoe type,shorts type,shoulders,sleeve length,swimwear type,toe shape,top type,tops silhouette,underwear type,vest type
0,2134,T-shirt à message,T-Shirt standard 45-65cm-Manches courtes-Col r...,summer,4.0,,,top,,,...,,,covered shoulders,,,,,regular,,
1,2135,T-shirt brodé,T-Shirt standard 45-65cm-Manches courtes-Col r...,summer,4.0,,,top,,,...,,,covered shoulders,,,,,,,
2,2136,T-shirt brodé,T-Shirt standard 45-65cm-Manches courtes-Col r...,summer,4.0,,,top,,,...,,,covered shoulders,,,,,regular,,
3,2137,T-shirt imprimé,T-Shirt standard 45-65cm-Manches courtes-Col r...,summer,4.0,,,,,,...,,,,,,,,,,
4,2138,T-shirt imprimé,T-Shirt standard 45-65cm-Manches courtes-Col r...,summer,4.0,,,top,,,...,,,covered shoulders,,,,,regular,,


(3041, 41)

In [13]:
# Keep only the categorical columns (season + feature columns) and list the
# distinct values of each one.
non_categorical = ['product_id', 'name', 'style', 'price']
X = names_data_unstack.drop(columns=non_categorical)
for col in X.columns:
    print(col, X[col].unique())

season ['summer' 'winter']
accessory type [nan]
bottom type [nan]
category type ['top' nan]
coat type [nan]
collar piece [nan]
color [nan]
color fallback [nan]
dress/skirt length [nan]
dress/skirt silhouette [nan]
embellishments ['fabric embellishment' nan 'sequins' 'buttons' 'simple' 'tie closure'
 'fringe' 'zippers' 'pockets' 'fur' 'belted']
gender [nan]
headwear type [nan]
heel height [nan]
heel style [nan]
jacket type [nan]
jewellery type [nan]
knitwear/sweatshirt type [nan]
neckline depth [nan]
neckline shape [nan]
one-piece type [nan]
pants length [nan]
pants silhouette [nan]
pattern type [nan 'floral' 'solid color' 'text/numbers/letters' 'graphics' 'stripes'
 'polka dots' 'animal' 'squares/diamonds' 'metallic' 'geometric/tribal'
 'motives']
pattern type fallback ['positional print' nan 'full print' 'solid colors' 'fabric/texture']
shoe closure [nan]
shoe shaft height [nan]
shoe type [nan]
shorts type [nan]
shoulders ['covered shoulders' nan 'wide strap' 'spaghetti strap' 'off sh

In [18]:
# Snapshot of the raw (not yet one-hot encoded) features for EDA: drop the feature
# columns that are empty for every product and write the rest to CSV.
# NOTE(review): this cell has to run before the get_dummies cell that overwrites
# names_data_unstack, otherwise the export contains the encoded columns instead.
raw_features = names_data_unstack.dropna(axis=1, how='all')
display(raw_features.head())

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('EDA_data', exist_ok=True)
raw_features.to_csv('EDA_data/for_EDA.csv', index=False)

value_name,product_id,name,style,season,price,category type,embellishments,pattern type,pattern type fallback,shoulders,tops silhouette
0,2134,T-shirt à message,T-Shirt standard 45-65cm-Manches courtes-Col r...,summer,4.0,top,fabric embellishment,,positional print,covered shoulders,regular
1,2135,T-shirt brodé,T-Shirt standard 45-65cm-Manches courtes-Col r...,summer,4.0,top,fabric embellishment,floral,,covered shoulders,
2,2136,T-shirt brodé,T-Shirt standard 45-65cm-Manches courtes-Col r...,summer,4.0,top,fabric embellishment,,,covered shoulders,regular
3,2137,T-shirt imprimé,T-Shirt standard 45-65cm-Manches courtes-Col r...,summer,4.0,,,,,,
4,2138,T-shirt imprimé,T-Shirt standard 45-65cm-Manches courtes-Col r...,summer,4.0,top,fabric embellishment,floral,full print,covered shoulders,regular


In [14]:
# One-hot encode every categorical column listed in X (season + feature columns).
# The dtype is pinned: pandas >= 2.0 defaults to bool, which would turn the 0/1
# indicator columns (and the exported CSVs) into True/False.
# NOTE: the cell overwrites names_data_unstack, so it cannot be re-run on its own
# output (the original categorical columns no longer exist afterwards).
names_data_unstack = pd.get_dummies(
    data=names_data_unstack,
    columns=list(X.columns),
    dtype=np.uint8,
)
names_data_unstack.head(2)

Unnamed: 0,product_id,name,style,price,season_summer,season_winter,category type_top,embellishments_belted,embellishments_buttons,embellishments_fabric embellishment,...,pattern type fallback_solid colors,shoulders_covered shoulders,shoulders_off shoulder,shoulders_one shoulder,shoulders_spaghetti strap,shoulders_wide strap,tops silhouette_crop top,tops silhouette_loose/relaxed,tops silhouette_regular,tops silhouette_tight
0,2134,T-shirt à message,T-Shirt standard 45-65cm-Manches courtes-Col r...,4.0,1,0,1,0,0,1,...,0,1,0,0,0,0,0,0,1,0
1,2135,T-shirt brodé,T-Shirt standard 45-65cm-Manches courtes-Col r...,4.0,1,0,1,0,0,1,...,0,1,0,0,0,0,0,0,0,0


In [20]:
# def color_to_rgb(color):
#     try:
#         if color in list(mcolors.CSS4_COLORS):
#             rgb_color = mcolors.to_rgb(color)
#         else:
#             matching = [c for c in list(mcolors.CSS4_COLORS) if color in c]
#             rgb_color = mcolors.to_rgb(min((word for word in matching if word), key=len))
#     except Exception:
#         rgb_color = (np.nan, np.nan, np.nan)
#     return rgb_color

In [21]:
# # CHANGING COLOR FOR RGB

# #names_features['color'] = names_features['color'].replace('undefined', np.nan)

# keys_color = list(names_data_unstack.color.unique())
# keys_color_fallback = list(names_data_unstack['color fallback'].unique())
# keys = list(set(keys_color + keys_color_fallback))
# keys = [elem.replace(" ", "") for elem in keys if type(elem)!=float]
# keys.append(np.nan)
# values = []
# [values.append(color_to_rgb(elem)) for elem in keys]

# colors = dict(zip(keys, values))

# names_data_unstack['color'] = names_data_unstack['color'].map(colors)
# names_data_unstack['color fallback'] = names_data_unstack['color fallback'].map(colors)

# names_data_unstack[['color_r', 'color_g', 'color_b']] = pd.DataFrame(names_data_unstack['color'].apply(pd.Series))
# names_data_unstack[['color_fallback_r', 'color_fallback_g', 'color_fallback_b']] = pd.DataFrame(names_data_unstack['color fallback'].apply(pd.Series))

# names_data_unstack.drop(['color', 'color fallback'], axis=1, inplace=True)

# names_data_unstack.head()

# Deal with Style

In [15]:
# Turn the '-'-separated style description into whitespace-separated tokens:
#   * missing styles become the placeholder '0'
#   * everything is lower-cased
#   * hyphens that belong to a token are protected ('45-65cm' -> '45/65cm',
#     't-shirt' -> 'tshirt') and spaces inside a token become '_'
#   * the remaining hyphens are the token separators and become spaces
# NOTE(review): truncated variants such as 'col_batea' / 'col_bateau' stay separate
# tokens; no normalisation of those is applied here.
names_data_unstack['style'] = (
    names_data_unstack['style']
    .fillna('0')
    .str.lower()
    .str.replace('45-65cm', '45/65cm')
    .str.replace('t-shirt', 'tshirt')
    .str.replace(' ', '_')
    .apply(lambda style: style.replace('-', ' '))
)
names_data_unstack.head()

Unnamed: 0,product_id,name,style,price,season_summer,season_winter,category type_top,embellishments_belted,embellishments_buttons,embellishments_fabric embellishment,...,pattern type fallback_solid colors,shoulders_covered shoulders,shoulders_off shoulder,shoulders_one shoulder,shoulders_spaghetti strap,shoulders_wide strap,tops silhouette_crop top,tops silhouette_loose/relaxed,tops silhouette_regular,tops silhouette_tight
0,2134,T-shirt à message,tshirt_standard_45/65cm manches_courtes col_rond,4.0,1,0,1,0,0,1,...,0,1,0,0,0,0,0,0,1,0
1,2135,T-shirt brodé,tshirt_standard_45/65cm manches_courtes col_rond,4.0,1,0,1,0,0,1,...,0,1,0,0,0,0,0,0,0,0
2,2136,T-shirt brodé,tshirt_standard_45/65cm manches_courtes col_rond,4.0,1,0,1,0,0,1,...,0,1,0,0,0,0,0,0,1,0
3,2137,T-shirt imprimé,tshirt_standard_45/65cm manches_courtes col_rond,4.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2138,T-shirt imprimé,tshirt_standard_45/65cm manches_courtes col_rond,4.0,1,0,1,0,0,1,...,0,1,0,0,0,0,0,0,1,0


In [16]:
# Spot-check a single product (id 11002) after the style normalisation.
is_inspected_product = names_data_unstack['product_id'] == 11002
display(names_data_unstack.loc[is_inspected_product])

Unnamed: 0,product_id,name,style,price,season_summer,season_winter,category type_top,embellishments_belted,embellishments_buttons,embellishments_fabric embellishment,...,pattern type fallback_solid colors,shoulders_covered shoulders,shoulders_off shoulder,shoulders_one shoulder,shoulders_spaghetti strap,shoulders_wide strap,tops silhouette_crop top,tops silhouette_loose/relaxed,tops silhouette_regular,tops silhouette_tight
2213,11002,Cache-coeur-Manches courtes-Col V-,cache coeur manches_courtes col_v,16.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Count how often each style token occurs across all products.
style_tokens = names_data_unstack['style'].str.split(expand=True).stack()
# Name the counts explicitly: the default column label of value_counts() differs
# between pandas versions (0 vs 'count'), so renaming column 0 afterwards can
# silently do nothing.
freq = style_tokens.value_counts().rename('frequency').to_frame()
freq.to_csv('frequency of style features.csv')
display(freq)

Unnamed: 0,frequency
tshirt_standard_45/65cm,2625
col_rond,1795
manches_courtes,1414
col_v,670
manches_longues,575
sans_manches,470
bretelles,445
tshirt_court_(<45cm),206
divers,168
col_monta,154


In [18]:
# Multi-hot encode the style tokens: one label list per product, in row order.
# The column is addressed by name (a positional iloc[i, 2] breaks as soon as the
# column order changes) and any non-string value (None / NaN) yields an empty list.
mlb = MultiLabelBinarizer()
labellist = [
    style.split() if isinstance(style, str) else []
    for style in names_data_unstack['style']
]
y = mlb.fit_transform(labellist)
print(mlb.classes_)
print(y.shape)

['0' 'body' 'bretelles' 'cache' 'coeur' 'col_batea' 'col_bateau'
 'col_bénit' 'col_bénitier' 'col_carré' 'col_fendu' 'col_monta'
 'col_montant' 'col_polo' 'col_rapporté' 'col_rond' 'col_roulé' 'col_v'
 'divers' 'dos_nu' 'gilet_court_(<65cm)' 'manches_3/4' 'manches_courtes'
 'manches_longues' 'sans_col' 'sans_manches' 'tshirt_asymétrique'
 'tshirt_bandeau' 'tshirt_court_(<45cm)' 'tshirt_long_(>65cm)'
 'tshirt_standard_45/65cm']
(3035, 31)


In [19]:
# Vectoriser over the fixed style vocabulary; it is only used to print that vocabulary.
countvec = CountVectorizer(vocabulary=freq.index.values)
# get_feature_names() was removed from recent scikit-learn releases in favour of
# get_feature_names_out(); support both.
if hasattr(countvec, 'get_feature_names_out'):
    countvec_tokens = countvec.get_feature_names_out()
else:
    countvec_tokens = countvec.get_feature_names()
print(list(countvec_tokens))

# One indicator column per style token, keyed by product_id.
style_feature = pd.DataFrame(
    y, columns=list(mlb.classes_), index=names_data_unstack.product_id
).reset_index()
# '0' is the placeholder for a missing style; 'coeur' is the second half of
# 'cache-coeur', which was split on its hyphen, so 'cache' alone stands for it.
# errors='ignore' keeps the cell working when one of these tokens is absent.
style_feature = style_feature.drop(columns=['0', 'coeur'], errors='ignore')
style_feature = style_feature.rename(columns={'cache': 'cache_coeur'})
# Preview only; avoid dumping the whole frame into the notebook.
display(style_feature.head(10))

['tshirt_standard_45/65cm', 'col_rond', 'manches_courtes', 'col_v', 'manches_longues', 'sans_manches', 'bretelles', 'tshirt_court_(<45cm)', 'divers', 'col_monta', 'col_carré', 'body', 'manches_3/4', 'col_montant', 'sans_col', 'tshirt_long_(>65cm)', 'tshirt_bandeau', '0', 'col_roulé', 'dos_nu', 'col_batea', 'col_polo', 'tshirt_asymétrique', 'coeur', 'cache', 'col_bénitier', 'col_fendu', 'col_rapporté', 'col_bateau', 'col_bénit', 'gilet_court_(<65cm)']


Unnamed: 0,product_id,body,bretelles,cache_coeur,col_batea,col_bateau,col_bénit,col_bénitier,col_carré,col_fendu,...,manches_3/4,manches_courtes,manches_longues,sans_col,sans_manches,tshirt_asymétrique,tshirt_bandeau,tshirt_court_(<45cm),tshirt_long_(>65cm),tshirt_standard_45/65cm
0,2134,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
1,2135,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
2,2136,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
3,2137,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
4,2138,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
5,2139,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
6,2144,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
7,2145,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
8,2146,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9,2149,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


# Deal with names

In [20]:
# Attach the style indicator columns to the encoded feature table (left join on
# product_id); the raw style text is no longer needed once it is encoded.
names_features = (
    names_data_unstack
    .merge(style_feature, on='product_id', how='left')
    .drop(columns=['style'])
)
print(names_features.shape)
names_features.head()

(3035, 69)


Unnamed: 0,product_id,name,price,season_summer,season_winter,category type_top,embellishments_belted,embellishments_buttons,embellishments_fabric embellishment,embellishments_fringe,...,manches_3/4,manches_courtes,manches_longues,sans_col,sans_manches,tshirt_asymétrique,tshirt_bandeau,tshirt_court_(<45cm),tshirt_long_(>65cm),tshirt_standard_45/65cm
0,2134,T-shirt à message,4.0,1,0,1,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
1,2135,T-shirt brodé,4.0,1,0,1,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
2,2136,T-shirt brodé,4.0,1,0,1,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
3,2137,T-shirt imprimé,4.0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
4,2138,T-shirt imprimé,4.0,1,0,1,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1


In [21]:
# List the columns of the merged table (product info, one-hot features, style tokens).
names_features.columns

Index(['product_id', 'name', 'price', 'season_summer', 'season_winter',
       'category type_top', 'embellishments_belted', 'embellishments_buttons',
       'embellishments_fabric embellishment', 'embellishments_fringe',
       'embellishments_fur', 'embellishments_pockets',
       'embellishments_sequins', 'embellishments_simple',
       'embellishments_tie closure', 'embellishments_zippers',
       'pattern type_animal', 'pattern type_floral',
       'pattern type_geometric/tribal', 'pattern type_graphics',
       'pattern type_metallic', 'pattern type_motives',
       'pattern type_polka dots', 'pattern type_solid color',
       'pattern type_squares/diamonds', 'pattern type_stripes',
       'pattern type_text/numbers/letters',
       'pattern type fallback_fabric/texture',
       'pattern type fallback_full print',
       'pattern type fallback_positional print',
       'pattern type fallback_solid colors', 'shoulders_covered shoulders',
       'shoulders_off shoulder', 'shoulders

In [22]:
# Normalise the product names: placeholder for missing names, lower-case, strip
# punctuation. The pattern is a raw string and regex=True is passed explicitly:
# since pandas 2.0 str.replace treats the pattern as a literal by default, which
# would leave the punctuation in place.
names_features['name'] = (
    names_features['name']
    .fillna('0')
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
    .astype(str)
)

# Stop words: French and English stop words plus every word that already appears in
# a column name (split on whitespace and on '_'), so that name tokens do not
# duplicate existing feature columns.
column_names = list(names_features.columns.values)
stopwords = (
    list(fr_stop)
    + list(en_stop)
    + [word for line in column_names for word in line.split()]
    + [word for line in column_names for word in line.split('_')]
)
# Set copy for O(1) membership tests; `stopwords` itself stays a list.
stopword_set = set(stopwords)

# Keep the tokens that are neither stop words nor purely numeric.
names_features['name_split'] = names_features['name'].apply(
    lambda name: ' '.join(
        token for token in name.split()
        if token not in stopword_set and not token.isdigit()
    )
)

names_features.head()

Unnamed: 0,product_id,name,price,season_summer,season_winter,category type_top,embellishments_belted,embellishments_buttons,embellishments_fabric embellishment,embellishments_fringe,...,manches_courtes,manches_longues,sans_col,sans_manches,tshirt_asymétrique,tshirt_bandeau,tshirt_court_(<45cm),tshirt_long_(>65cm),tshirt_standard_45/65cm,name_split
0,2134,tshirt à message,4.0,1,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,1,message
1,2135,tshirt brodé,4.0,1,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,1,brodé
2,2136,tshirt brodé,4.0,1,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,1,brodé
3,2137,tshirt imprimé,4.0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,imprimé
4,2138,tshirt imprimé,4.0,1,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,1,imprimé


In [23]:
# A name token becomes a feature only if it occurs in at least this many product names.
NAME_TOKEN_MIN_DF = 55
# Tokens that are glued-together style fragments (e.g. 'courtescol'), apparently left
# over when punctuation is stripped from names that contain a style string.
JUNK_NAME_TOKENS = ['4565cmbretellescol', '4565cmmanches', '4565cmsans',
                    'courtescol', 'longuescol', 'manchescol']

vectorizer = CountVectorizer(min_df=NAME_TOKEN_MIN_DF)
name_counts = vectorizer.fit_transform(names_features['name_split'])
# get_feature_names() was removed from recent scikit-learn releases in favour of
# get_feature_names_out(); support both.
if hasattr(vectorizer, 'get_feature_names_out'):
    name_tokens = list(vectorizer.get_feature_names_out())
else:
    name_tokens = vectorizer.get_feature_names()

# Token counts per product, keyed by product_id.
name_feature_real = pd.DataFrame(
    name_counts.toarray(), index=names_features.product_id, columns=name_tokens
).reset_index()
# errors='ignore': a junk token that did not reach min_df is simply not there.
name_feature_real = name_feature_real.drop(columns=JUNK_NAME_TOKENS, errors='ignore')

# Add the name-token columns to the feature table and drop the raw text columns.
names_features_2 = pd.merge(names_features, name_feature_real, on='product_id', how='left')
names_features_2 = names_features_2.drop(columns=['name', 'name_split'])

display(names_features_2.head())
display(name_feature_real.head())

Unnamed: 0,product_id,price,season_summer,season_winter,category type_top,embellishments_belted,embellishments_buttons,embellishments_fabric embellishment,embellishments_fringe,embellishments_fur,...,basique,brodé,cropped,côtelé,dentelle,débardeur,fines,imprimé,maille,message
0,2134,4.0,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2135,4.0,1,0,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
2,2136,4.0,1,0,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
3,2137,4.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,2138,4.0,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


Unnamed: 0,product_id,basique,brodé,cropped,côtelé,dentelle,débardeur,fines,imprimé,maille,message
0,2134,0,0,0,0,0,0,0,0,0,1
1,2135,0,1,0,0,0,0,0,0,0,0
2,2136,0,1,0,0,0,0,0,0,0,0
3,2137,0,0,0,0,0,0,0,1,0,0
4,2138,0,0,0,0,0,0,0,1,0,0


In [24]:
# Diagnostics for the name vectoriser: shape of the token table, the full vocabulary
# (including the tokens dropped afterwards) and the total count of each token.
display(name_feature_real.shape)
# get_feature_names() was removed from recent scikit-learn releases in favour of
# get_feature_names_out(); support both.
if hasattr(vectorizer, 'get_feature_names_out'):
    display(list(vectorizer.get_feature_names_out()))
else:
    display(vectorizer.get_feature_names())
# transform() reuses the fitted vocabulary; there is no need to refit the
# vectoriser just to display the counts.
display(vectorizer.transform(names_features['name_split']).toarray().sum(axis=0))

(3035, 11)

['4565cmbretellescol',
 '4565cmmanches',
 '4565cmsans',
 'basique',
 'brodé',
 'courtescol',
 'cropped',
 'côtelé',
 'dentelle',
 'débardeur',
 'fines',
 'imprimé',
 'longuescol',
 'maille',
 'manchescol',
 'message']

array([ 77, 242,  62, 140,  82, 150, 123, 104, 251, 182,  56, 117,  98,
        58,  80,  72], dtype=int64)

In [49]:
# test = names_data.drop_duplicates(subset = ['product_id']).reset_index()
# test.head()

In [50]:
# all_features = pd.merge(names_features_2,test[['product_id','season','price']],on = 'product_id')
# print(all_features.shape)
# all_features.head()

In [51]:
# all_features.columns

In [25]:
# Final column order: drop the 'category type_top' indicator and move 'price'
# to the last position.
newest_order = list(names_features_2.columns)
for column_to_pull in ('category type_top', 'price'):
    newest_order.remove(column_to_pull)
newest_order += ['price']

all_features = names_features_2[newest_order]
display(all_features.head())

Unnamed: 0,product_id,season_summer,season_winter,embellishments_belted,embellishments_buttons,embellishments_fabric embellishment,embellishments_fringe,embellishments_fur,embellishments_pockets,embellishments_sequins,...,brodé,cropped,côtelé,dentelle,débardeur,fines,imprimé,maille,message,price
0,2134,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,4.0
1,2135,1,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,4.0
2,2136,1,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,4.0
3,2137,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,4.0
4,2138,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,4.0


In [26]:
# Style-token and name-token features only (no one-hot feature columns) ...
style_name_feature = style_feature.merge(name_feature_real, on='product_id')
display(style_name_feature.head())

# ... plus season indicators and price taken from the full feature table.
season_price_columns = ['product_id', 'season_summer', 'season_winter', 'price']
reliable_feature = style_name_feature.merge(all_features[season_price_columns], on='product_id')
display(reliable_feature.head())
display(reliable_feature.shape)

Unnamed: 0,product_id,body,bretelles,cache_coeur,col_batea,col_bateau,col_bénit,col_bénitier,col_carré,col_fendu,...,basique,brodé,cropped,côtelé,dentelle,débardeur,fines,imprimé,maille,message
0,2134,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2135,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,2136,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,2137,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,2138,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


Unnamed: 0,product_id,body,bretelles,cache_coeur,col_batea,col_bateau,col_bénit,col_bénitier,col_carré,col_fendu,...,côtelé,dentelle,débardeur,fines,imprimé,maille,message,season_summer,season_winter,price
0,2134,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,4.0
1,2135,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,4.0
2,2136,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,4.0
3,2137,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,4.0
4,2138,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,4.0


(3035, 43)

In [27]:
# Full feature table restricted to the products listed in reliable_wideeyes_id.
in_reliable_wideeyes = all_features['product_id'].isin(reliable_wideeyes_id)
reliable_feature_wideeyes = all_features.loc[in_reliable_wideeyes].reset_index(drop=True)

display(reliable_feature_wideeyes.head())
display(reliable_feature_wideeyes.shape)

Unnamed: 0,product_id,season_summer,season_winter,embellishments_belted,embellishments_buttons,embellishments_fabric embellishment,embellishments_fringe,embellishments_fur,embellishments_pockets,embellishments_sequins,...,brodé,cropped,côtelé,dentelle,débardeur,fines,imprimé,maille,message,price
0,2134,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,4.0
1,2135,1,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,4.0
2,2136,1,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,4.0
3,2138,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,4.0
4,2144,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.0


(853, 77)

In [28]:
# Export the full feature table (all products, all feature columns).
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('final_dataset', exist_ok=True)
all_features.to_csv('final_dataset/reliable_sparse_withwideeyes_features.csv', index=False)

In [29]:
# Export the style/name-token features with season and price.
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('final_dataset', exist_ok=True)
reliable_feature.to_csv('final_dataset/reliable_features.csv', index=False)

In [30]:
# Export the full feature table restricted to the products in reliable_wideeyes_id.
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('final_dataset', exist_ok=True)
reliable_feature_wideeyes.to_csv('final_dataset/reliable_withwideeyes_features.csv', index=False)