Code by JGIO

# data analysis

## import

In [1]:
from gensim.models import Word2Vec
import polars as pl
import re
import spacy
import numpy as np
%matplotlib widget

### Test W2V model

In [2]:
# Load the saved Word2Vec model from disk.
w2v = Word2Vec.load("wikipedia2vec_eswiki_20231101.model")
# Sanity check with the classic analogy: rey - hombre + mujer.
w2v.wv.most_similar(positive=["rey","mujer"],negative=["hombre"])

[('reina', 0.6377933621406555),
 ('monarca', 0.6222385168075562),
 ('soberana', 0.5336526036262512),
 ('princesa', 0.5175269246101379),
 ('reyes', 0.5049415230751038),
 ('infanta', 0.5036953687667847),
 ('regente', 0.4985811114311218),
 ('soberano', 0.49261191487312317),
 ('corregente', 0.4901221692562103),
 ('trono', 0.4885112941265106)]

### Read data csv

In [3]:
# The file has no header row, so polars names the single column "column_1".
df = pl.read_csv("data.csv", has_header=False)
df

column_1
str
"""Ubicación: Fer…"
"""Historia: Fund…"
"""Productos y Se…"
"""Herramientas M…"
"""Materiales de …"
"""Ferretería en …"
"""Pinturas y Aca…"
"""Asesoramiento …"
"""Compromiso con…"
"""Horario de Ate…"


## Transform and Analyse

### Lemmatize

Load the spaCy model and define a cleaning function that lemmatizes the text.

In [5]:
# Load the Spanish spaCy pipeline with the NER and parser components disabled.
nlp = spacy.load("es_core_news_lg", disable=['ner', 'parser'])
def cleaning(doc):
    """Lemmatize a tokenized document, leaving out stop words.

    Args:
        doc: Iterable of tokens exposing ``lemma_`` and ``is_stop``
            (for example a spaCy ``Doc``).

    Returns:
        The lemmas of the non-stop tokens joined by single spaces, or
        ``None`` when two or fewer lemmas remain.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)
    # Two or fewer remaining lemmas: report that with None instead of text.
    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)
# Try it on the first row: every run of characters outside the listed letters
# becomes a single space, the text is lower-cased, then parsed and lemmatized.
cleaning(nlp(re.sub('[^A-Za-zÁÉÍÓÚÜÑáéíóúüñ]+',' ',df.row(0)[0]).lower()))

'ubicación ferretería patito estratégicamente ubicado corazón ciudad méxico concurrido zona comercial colonia roma dirección exacto avenida insurgente sur colonia roma ciudad méxico'

In [5]:
# Apply the same normalise -> lower-case -> lemmatize steps to every row.
# The resulting single column is named "map" (see the output below).
df1 = df.map_rows(lambda row: (cleaning(nlp(re.sub('[^A-Za-zÁÉÍÓÚÜÑáéíóúüñ]+',' ',row[0]).lower()))))
df1

map
str
"""ubicación ferr…"
"""historia funda…"
"""producto servi…"
"""herramienta ma…"
"""material const…"
"""ferretería gen…"
"""pintura acabad…"
"""asesoramiento …"
"""compromiso cal…"
"""horario atenci…"


### Split the lemmatized text into a list and drop the map column

In [6]:
# Turn each space-separated lemma string into a list of tokens ("split"),
# then discard the original "map" text column.
df1 = (
    df1
    .with_columns(pl.col("map").str.split(by=" ").alias("split"))
    .drop(["map"])
)
df1

split
list[str]
"[""ubicación"", ""ferretería"", … ""méxico""]"
"[""historia"", ""fundado"", … ""ciudad""]"
"[""producto"", ""servicio"", … ""profesional""]"
"[""herramienta"", ""manual"", … ""proyecto""]"
"[""material"", ""construcción"", … ""construcción""]"
"[""ferretería"", ""general"", … ""reparación""]"
"[""pintura"", ""acabado"", … ""espacio""]"
"[""asesoramiento"", ""profesional"", … ""proyecto""]"
"[""compromiso"", ""calidad"", … ""expectativa""]"
"[""horario"", ""atención"", … ""hogar""]"


### Create a doc-to-vec function: the average of the vectors of the words in the doc

In [7]:
def c2v(l, wv=None):
    """Embed a tokenized document as the mean of its word vectors.

    Args:
        l: Iterable of tokens to look up.
        wv: Optional token -> vector lookup supporting ``wv[token]`` and
            raising ``KeyError`` for unknown tokens. When omitted, the
            notebook's global ``w2v.wv`` is used.

    Returns:
        The element-wise mean (``axis=0``) of the vectors that were found.
        If no token is found, the result is NumPy's mean of an empty array
        (NaN, emitted with a RuntimeWarning).
    """
    vectors = w2v.wv if wv is None else wv
    found = []
    for token in l:
        try:
            found.append(vectors[token])
        except KeyError:
            # Only an out-of-vocabulary token is skipped; any other error
            # (bad input type, interrupt, ...) is allowed to surface.
            print("skipped")
    return np.mean(np.array(found), axis=0)
# Try it on the token list of the first document.
c2v(df1.row(0)[0])

array([ 2.12836012e-01, -1.27560198e-01, -4.52510148e-01, -4.02413979e-02,
       -5.87529913e-02, -5.56848586e-01, -2.52582669e-01, -1.62145481e-01,
       -6.31863534e-01,  4.13512111e-01, -1.66260108e-01,  3.57398111e-03,
       -2.59342939e-01, -1.95746034e-01,  1.68473497e-01, -5.25520481e-02,
       -3.26525062e-01, -1.96536973e-01, -1.72310010e-01, -1.01338744e-01,
        2.88965762e-01,  5.22659361e-01,  2.63483226e-01, -1.47314772e-01,
       -5.58546185e-01,  3.66457365e-02, -7.03997374e-01, -3.93181503e-01,
        2.49967158e-01, -8.30279067e-02, -4.23828401e-02,  1.02340090e+00,
        2.46616691e-01, -1.99524656e-01,  1.71442851e-01,  7.16329552e-03,
       -3.90876293e-01,  2.55525172e-01, -2.23956004e-01,  3.00640404e-01,
       -1.65765628e-01,  5.22411950e-02, -4.04426277e-01,  2.37454429e-01,
        4.78758782e-01, -7.07135618e-01, -1.57686606e-01, -8.59083772e-01,
        3.02236915e-01, -2.16300577e-01, -1.26846731e-01, -3.14361542e-01,
       -6.67401969e-01,  

In [8]:
# Replace each token list by its averaged vector; the returned tuple is spread
# over one column per dimension (column_0 ... column_299 in the output below).
# NOTE(review): df1 is rebound here, so re-running this cell on its own will
# no longer find the "split" token lists - re-run from the lemmatize cell.
df1 = df1.map_rows(lambda row: (tuple(c2v(row[0]))))
df1

skipped
skipped


column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11,column_12,column_13,column_14,column_15,column_16,column_17,column_18,column_19,column_20,column_21,column_22,column_23,column_24,column_25,column_26,column_27,column_28,column_29,column_30,column_31,column_32,column_33,column_34,column_35,column_36,…,column_263,column_264,column_265,column_266,column_267,column_268,column_269,column_270,column_271,column_272,column_273,column_274,column_275,column_276,column_277,column_278,column_279,column_280,column_281,column_282,column_283,column_284,column_285,column_286,column_287,column_288,column_289,column_290,column_291,column_292,column_293,column_294,column_295,column_296,column_297,column_298,column_299
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.212836,-0.12756,-0.45251,-0.040241,-0.058753,-0.556849,-0.252583,-0.162145,-0.631864,0.413512,-0.16626,0.003574,-0.259343,-0.195746,0.168473,-0.052552,-0.326525,-0.196537,-0.17231,-0.101339,0.288966,0.522659,0.263483,-0.147315,-0.558546,0.036646,-0.703997,-0.393182,0.249967,-0.083028,-0.042383,1.023401,0.246617,-0.199525,0.171443,0.007163,-0.390876,…,-0.116024,0.053837,-0.215412,0.318292,0.058482,-0.030807,-0.179667,-0.098336,0.567621,0.296195,-0.085612,-0.073767,-0.089834,-0.205213,-0.611677,-0.227301,0.029142,0.291956,-0.066367,-0.002605,0.251722,0.465745,0.623218,-0.078883,0.259047,-0.190617,-0.762434,-0.094988,0.102185,-0.167386,-0.354981,-0.008689,-0.455456,0.042842,0.072888,-0.051383,0.027874
0.423871,0.025214,0.012068,-0.223127,-0.202673,-0.393712,-0.374561,0.259974,-0.375474,0.06919,-0.166603,-0.134075,-0.216289,-0.189332,-0.060694,-0.030486,0.194823,-0.116378,-0.23559,-0.259037,0.487899,0.427689,-0.060725,-0.160337,-0.401157,0.066193,-0.329237,-0.123573,0.203745,0.069274,-0.137278,0.664172,0.19144,0.035487,0.096587,0.262722,-0.819545,…,0.267579,0.319991,-0.283283,0.020059,-0.031595,0.381906,-0.10924,-0.185706,0.140849,0.76805,0.039422,-0.261649,-0.275816,-0.011501,-0.567118,-0.219702,0.128224,-0.286524,-0.320208,-0.145584,0.290893,0.140684,0.475552,-0.257042,-0.119533,-0.028789,-0.290617,0.111265,-0.381478,0.116171,-0.124392,0.09486,-0.237941,-0.333296,-0.013143,0.089247,0.146553
0.385859,0.035165,0.033237,-0.557369,-0.579002,-0.485783,-0.877546,0.242791,-0.705961,0.402078,0.10023,-0.173728,-0.509713,0.049718,-0.323029,0.215364,-0.32701,-0.495029,-0.169137,-0.423384,0.628343,0.513872,-0.604525,-0.628208,-0.175458,0.023092,-0.372065,0.229564,0.205592,0.203918,-0.505088,0.472427,0.121915,0.179966,0.326367,0.073849,-0.992093,…,0.516963,0.472586,-0.286367,-0.176856,-0.47359,0.40081,0.153803,-0.199136,-0.087543,0.523957,-0.087257,-0.010219,-0.898598,-0.019129,-0.566014,-0.239108,-0.191422,-0.797167,-0.207087,0.112941,0.868247,-0.092758,0.433557,-0.189287,-0.035938,0.469664,0.090938,-0.036988,-0.16287,0.181958,0.165856,0.176685,-0.436765,-0.665546,-0.239671,0.18433,0.093376
0.406118,0.302412,0.036468,-0.50005,-0.131052,0.002493,-0.054389,0.72176,-0.524406,0.606892,0.128333,-0.136146,-0.278787,0.468067,-0.310626,0.253619,-0.132123,-0.324798,-0.075846,-0.545838,0.582537,0.21807,-0.396565,-0.138196,-0.415168,-0.063165,-0.273881,0.257613,0.096426,0.149001,-0.198695,0.069611,0.242321,-0.137488,0.053384,-0.184186,-0.629436,…,0.379404,0.059307,-0.275136,-0.114843,-0.393695,0.361433,0.257503,-0.198394,-0.191552,0.34365,-0.197258,-0.053532,-0.647032,0.17516,-0.095757,-0.259961,0.152357,-0.237122,-0.124926,0.305039,0.662504,-0.040117,0.388982,0.139257,-0.198855,0.37803,-0.324647,-0.206529,-0.521431,0.201104,0.012297,0.27739,-0.32314,-0.437672,0.221411,-0.216267,-0.10492
-0.222472,-0.005717,0.01755,-0.152431,0.082138,-0.45164,-0.665867,0.165432,-0.534628,0.498674,0.236038,0.094743,-0.362782,0.106141,-0.252238,-0.034119,-0.275458,-0.599559,-0.095541,-0.391178,0.712369,0.563902,-0.332539,-0.381367,-1.092756,-0.329965,-0.449016,-0.025966,-0.097124,0.115147,-0.745745,0.39677,0.390946,-0.203624,0.530506,0.317422,-1.065472,…,0.479295,0.197936,-0.287452,-0.013139,-0.389713,0.430592,0.335071,-0.380843,0.295854,0.423726,-0.225374,-0.008997,-0.849865,0.346232,-0.605132,0.107615,-0.361664,-0.381799,-0.208291,0.080691,0.642929,-0.099032,0.289986,-0.093268,-0.340186,0.355064,0.017071,-0.071143,-0.547553,0.31032,0.310858,0.056379,-0.703524,-0.392997,0.362656,0.057918,-0.109119
-0.175948,0.251044,0.57868,-0.18834,-0.134889,-0.674244,-0.575561,0.596989,-0.457235,0.327338,-0.155796,-0.347386,-0.030644,0.294292,-0.489563,0.245185,-0.291255,-0.444115,-0.018272,-0.428581,0.361373,0.204568,-0.167897,-0.371942,-0.465282,-0.053389,-0.552517,0.244529,0.130768,0.177434,-0.778337,0.366748,0.278908,-0.230686,0.251298,0.522265,-0.693394,…,0.422927,0.448021,-0.464064,-0.356252,-0.192774,0.453984,0.569944,-0.287805,-0.046933,0.214438,-0.467071,-0.366539,-0.498214,-0.211967,-0.563618,0.142657,0.076739,-0.506741,-0.293737,0.22705,0.485774,0.017933,0.501325,-0.172824,0.022931,0.570613,-0.155529,-0.140879,-0.513497,0.158303,-0.383541,0.091375,-0.15397,-0.578252,-0.189774,0.058405,0.032852
0.178135,0.022982,-0.213848,-0.133932,-0.852305,-0.104335,-0.79042,-0.073752,-0.671372,0.704523,0.182869,0.044848,-0.675904,0.246938,-0.094934,-0.008603,-0.457433,-0.445713,-0.627258,-0.3387,0.520883,0.579376,-0.401504,-0.198822,-0.288183,0.074724,-0.552751,0.093068,-0.066773,0.287742,-0.416837,0.224975,0.255587,0.045755,0.200477,0.336002,-0.642228,…,0.352373,0.339299,-0.560981,-0.674634,-0.310045,0.19487,0.309852,-0.43044,0.129791,-0.055731,-0.274468,-0.363785,-0.830623,-0.102053,-0.580149,-0.296471,-0.080918,-0.57889,-0.538502,-0.042045,0.373307,-0.347435,0.662217,0.044096,0.038719,0.321522,-0.109339,-0.389375,-0.469735,0.111092,0.039666,0.227299,-0.457352,-0.4216,-0.079483,-0.201947,0.242044
0.599185,0.025005,0.356816,-0.978445,-0.614492,-0.609616,-0.604118,0.605126,-0.815449,0.662975,-0.076576,0.013116,-0.258014,-0.021262,-0.556723,0.222125,-0.398456,-0.281881,-0.231095,-0.2412,0.892776,0.855575,-0.629748,-0.613585,-0.21441,-0.448102,0.011035,-0.124304,0.442128,0.012933,-0.339237,0.132372,-0.17417,0.627024,0.054415,-0.438037,-0.620077,…,0.257782,0.306392,-0.281901,-0.171799,-0.144764,0.13587,0.487954,0.373235,0.056123,0.543401,-0.376073,0.424472,-0.599846,-0.005143,-0.274084,-0.557398,-0.475633,-0.241881,-0.11527,0.134698,0.305313,-0.025863,0.545038,-0.462512,-0.004291,0.423122,-0.359964,-0.287656,-0.107858,-0.265118,-0.084689,0.029696,-0.032281,-0.457986,-0.23115,0.613272,0.015541
0.236955,0.260596,0.040948,-0.410093,-0.455945,-0.404472,-0.930851,0.227978,-0.649143,0.368169,-0.080791,0.121134,-0.302288,-0.250972,-0.10699,-0.127647,-0.374245,-0.393302,-0.075611,-0.194209,0.815483,0.415962,-0.537307,-0.121029,-0.408175,-0.314397,-0.017073,-0.124826,0.399553,-0.055004,-0.494988,0.658587,-0.188438,0.664433,0.30551,-0.005952,-1.108427,…,0.372246,0.528815,-0.388967,-0.200894,-0.200967,0.362474,0.061914,-0.283254,-0.0909,0.887724,0.196602,0.154666,-0.512238,0.026876,-0.140718,-0.105519,-0.424078,-0.525434,-0.437513,-0.095445,0.798837,-0.19612,0.437806,-0.171648,-0.03646,0.203062,0.192346,0.034439,-0.083275,0.113247,-0.162199,-0.222194,-0.356362,-0.290594,-0.165772,0.217507,0.070869
-0.32656,0.241863,-0.010343,-0.071194,-0.114975,-0.455396,-0.167113,-0.05485,-0.779437,-0.039307,0.181978,-0.217534,0.256694,0.301415,0.105846,-0.276178,0.230826,-0.159156,-0.141713,-0.120152,0.33321,0.715146,-0.028786,-0.30931,-0.811367,0.310113,-0.166017,0.015034,0.602633,-0.40989,-0.073203,0.629144,-0.005542,0.343271,-0.250643,0.042482,-0.521132,…,0.272225,0.162943,0.110473,0.100124,0.276004,-0.10689,0.312849,0.103457,0.050042,0.12044,-0.148665,-0.104692,-0.149209,-0.05543,-0.352111,0.021126,-0.226647,0.065913,0.018696,-0.13437,0.339083,0.260341,0.185274,-0.263954,-0.266613,-0.25394,-0.158179,0.130954,-0.379301,-0.009433,0.06357,0.629039,-0.254832,-0.057408,-0.091928,0.18714,-0.01406


### Name each document after the word most similar to its vector

In [9]:
# Words whose vectors are most similar to the first document's averaged vector.
w2v.wv.most_similar(df1.to_numpy()[0])

[('ciudad', 0.8020497560501099),
 ('zona', 0.6590033769607544),
 ('colonia', 0.658691942691803),
 ('norte', 0.6529296040534973),
 ('centro', 0.6522804498672485),
 ('alrededor', 0.6347556114196777),
 ('ubicación', 0.6186294555664062),
 ('barrio', 0.610630989074707),
 ('oriente', 0.6105065941810608),
 ('inmediación', 0.6083475351333618)]

In [10]:
# For every document vector, keep only the words (not the scores) of its
# most similar vocabulary entries.
[
    [word for word, *_ in w2v.wv.most_similar(doc_vector)]
    for doc_vector in df1.to_numpy()
]

[['ciudad',
  'zona',
  'colonia',
  'norte',
  'centro',
  'alrededor',
  'ubicación',
  'barrio',
  'oriente',
  'inmediación'],
 ['negocio',
  'trabajador',
  'establecimiento',
  'llegar',
  'asimismo',
  'momento',
  'pobre',
  'tener',
  'tiempo',
  'empleado'],
 ['producto',
  'equipamiento',
  'servicio',
  'mantenimiento',
  'adecuado',
  'herramienta',
  'material',
  'necesario',
  'mejora',
  'necesidad'],
 ['herramienta',
  'permitir',
  'eléctrico',
  'taladro',
  'utilizar',
  'accesorio',
  'dispositivo',
  'instrumento',
  'máquina',
  'equipamiento'],
 ['material',
  'construcción',
  'cemento',
  'hormigón',
  'estructura',
  'concreto',
  'existente',
  'necesario',
  'revestimiento',
  'cantidad'],
 ['reparación',
  'construcción',
  'necesario',
  'refacción',
  'maquinaria',
  'tornillo',
  'cerradura',
  'reparar',
  'utillaje',
  'mantenimiento'],
 ['acabado',
  'pintura',
  'ejemplo',
  'objeto',
  'particular',
  'igualmente',
  'producto',
  'permitir',
  'c

Keep only the single most similar word for each document.

In [11]:
# One category name per document: the single word most similar to the
# document vector. topn=1 asks for just that word instead of building the
# default top-10 list and discarding nine entries.
categories = [w2v.wv.most_similar(doc_vector, topn=1)[0][0] for doc_vector in df1.to_numpy()]
categories

['ciudad',
 'negocio',
 'producto',
 'herramienta',
 'material',
 'reparación',
 'acabado',
 'asesoramiento',
 'calidad',
 'lunes',
 'obstante',
 'producto',
 'compra',
 'cliente',
 'precio',
 'contenido',
 'cliente',
 'privacidad']

Count the distinct category names in the list.

In [12]:
# Number of distinct category names; fewer than len(categories) means some
# documents share a name.
len(set(categories))

16

Get the row indexes whose category name appears more than once in the list.

In [13]:
# Group the row indexes by category name in a single pass, then keep only the
# groups with more than one row. Groups come out in order of first appearance,
# so the result does not depend on set iteration order.
category_rows = {}
for row_index, category in enumerate(categories):
    category_rows.setdefault(category, []).append(row_index)
[rows for rows in category_rows.values() if len(rows) > 1]

[[2, 11], [13, 16]]

WIP: automate the concatenation of the rows at the duplicated indexes found above.

### import "edited" data

In [14]:
# Load the edited data set; like data.csv it is read without a header row.
df2 = pl.read_csv("edited_data.csv", has_header=False)
df2

column_1
str
"""Ubicación: Fer…"
"""Historia: Fund…"
"""Productos y Se…"
"""Herramientas M…"
"""Materiales de …"
"""Ferretería en …"
"""Pinturas y Aca…"
"""Asesoramiento …"
"""Compromiso con…"
"""Horario de Ate…"


Clean and lemmatize the edited data.

In [15]:
# Same cleaning as for the original data. The character class also lists the
# upper-case accented letters because the substitution runs before .lower();
# without them a letter such as "Á" or "Ñ" would be replaced by a space.
df2 = df2.map_rows(lambda row: (cleaning(nlp(re.sub('[^A-Za-zÁÉÍÓÚÜÑáéíóúüñ]+',' ',row[0]).lower()))))
df2

map
str
"""ubicación ferr…"
"""historia funda…"
"""producto servi…"
"""herramienta ma…"
"""material const…"
"""ferretería gen…"
"""pintura acabad…"
"""asesoramiento …"
"""compromiso cal…"
"""horario atenci…"


Split the lemmatized text into a list and drop the map column.

In [16]:
# Turn each space-separated lemma string into a list of tokens ("split"),
# then discard the original "map" text column.
df2 = (
    df2
    .with_columns(pl.col("map").str.split(by=" ").alias("split"))
    .drop(["map"])
)
df2

split
list[str]
"[""ubicación"", ""ferretería"", … ""méxico""]"
"[""historia"", ""fundado"", … ""ciudad""]"
"[""producto"", ""servicio"", … ""resultar""]"
"[""herramienta"", ""manual"", … ""proyecto""]"
"[""material"", ""construcción"", … ""construcción""]"
"[""ferretería"", ""general"", … ""reparación""]"
"[""pintura"", ""acabado"", … ""espacio""]"
"[""asesoramiento"", ""profesional"", … ""proyecto""]"
"[""compromiso"", ""calidad"", … ""expectativa""]"
"[""horario"", ""atención"", … ""hogar""]"


In [17]:
# Replace each token list by its averaged vector, one column per dimension.
# NOTE(review): df2 is rebound here, so this cell cannot be re-run on its own.
df2 = df2.map_rows(lambda row: (tuple(c2v(row[0]))))
df2

skipped
skipped


column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11,column_12,column_13,column_14,column_15,column_16,column_17,column_18,column_19,column_20,column_21,column_22,column_23,column_24,column_25,column_26,column_27,column_28,column_29,column_30,column_31,column_32,column_33,column_34,column_35,column_36,…,column_263,column_264,column_265,column_266,column_267,column_268,column_269,column_270,column_271,column_272,column_273,column_274,column_275,column_276,column_277,column_278,column_279,column_280,column_281,column_282,column_283,column_284,column_285,column_286,column_287,column_288,column_289,column_290,column_291,column_292,column_293,column_294,column_295,column_296,column_297,column_298,column_299
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.212836,-0.12756,-0.45251,-0.040241,-0.058753,-0.556849,-0.252583,-0.162145,-0.631864,0.413512,-0.16626,0.003574,-0.259343,-0.195746,0.168473,-0.052552,-0.326525,-0.196537,-0.17231,-0.101339,0.288966,0.522659,0.263483,-0.147315,-0.558546,0.036646,-0.703997,-0.393182,0.249967,-0.083028,-0.042383,1.023401,0.246617,-0.199525,0.171443,0.007163,-0.390876,…,-0.116024,0.053837,-0.215412,0.318292,0.058482,-0.030807,-0.179667,-0.098336,0.567621,0.296195,-0.085612,-0.073767,-0.089834,-0.205213,-0.611677,-0.227301,0.029142,0.291956,-0.066367,-0.002605,0.251722,0.465745,0.623218,-0.078883,0.259047,-0.190617,-0.762434,-0.094988,0.102185,-0.167386,-0.354981,-0.008689,-0.455456,0.042842,0.072888,-0.051383,0.027874
0.423871,0.025214,0.012068,-0.223127,-0.202673,-0.393712,-0.374561,0.259974,-0.375474,0.06919,-0.166603,-0.134075,-0.216289,-0.189332,-0.060694,-0.030486,0.194823,-0.116378,-0.23559,-0.259037,0.487899,0.427689,-0.060725,-0.160337,-0.401157,0.066193,-0.329237,-0.123573,0.203745,0.069274,-0.137278,0.664172,0.19144,0.035487,0.096587,0.262722,-0.819545,…,0.267579,0.319991,-0.283283,0.020059,-0.031595,0.381906,-0.10924,-0.185706,0.140849,0.76805,0.039422,-0.261649,-0.275816,-0.011501,-0.567118,-0.219702,0.128224,-0.286524,-0.320208,-0.145584,0.290893,0.140684,0.475552,-0.257042,-0.119533,-0.028789,-0.290617,0.111265,-0.381478,0.116171,-0.124392,0.09486,-0.237941,-0.333296,-0.013143,0.089247,0.146553
0.348836,0.035179,-0.045416,-0.575901,-0.594321,-0.454019,-0.839649,0.231388,-0.653599,0.377517,0.057821,-0.193992,-0.450284,0.024695,-0.286494,-0.041422,-0.350436,-0.452235,-0.02499,-0.396185,0.67974,0.504568,-0.70397,-0.675819,-0.242816,-0.068937,-0.318696,0.171014,0.257769,0.311708,-0.55574,0.413056,0.074943,0.202789,0.2601,0.07812,-1.044546,…,0.477848,0.491257,-0.264699,-0.211598,-0.494662,0.304034,0.33598,-0.147852,-0.220011,0.565224,-0.083292,0.01169,-0.838337,0.06128,-0.422726,-0.29855,-0.215033,-0.786702,-0.289516,0.173903,0.794086,-0.067954,0.442392,-0.148371,-0.174001,0.416275,0.113394,-0.065195,-0.169659,0.20363,0.207764,0.122176,-0.473829,-0.636293,-0.257456,0.11705,-0.01146
0.406118,0.302412,0.036468,-0.50005,-0.131052,0.002493,-0.054389,0.72176,-0.524406,0.606892,0.128333,-0.136146,-0.278787,0.468067,-0.310626,0.253619,-0.132123,-0.324798,-0.075846,-0.545838,0.582537,0.21807,-0.396565,-0.138196,-0.415168,-0.063165,-0.273881,0.257613,0.096426,0.149001,-0.198695,0.069611,0.242321,-0.137488,0.053384,-0.184186,-0.629436,…,0.379404,0.059307,-0.275136,-0.114843,-0.393695,0.361433,0.257503,-0.198394,-0.191552,0.34365,-0.197258,-0.053532,-0.647032,0.17516,-0.095757,-0.259961,0.152357,-0.237122,-0.124926,0.305039,0.662504,-0.040117,0.388982,0.139257,-0.198855,0.37803,-0.324647,-0.206529,-0.521431,0.201104,0.012297,0.27739,-0.32314,-0.437672,0.221411,-0.216267,-0.10492
-0.222472,-0.005717,0.01755,-0.152431,0.082138,-0.45164,-0.665867,0.165432,-0.534628,0.498674,0.236038,0.094743,-0.362782,0.106141,-0.252238,-0.034119,-0.275458,-0.599559,-0.095541,-0.391178,0.712369,0.563902,-0.332539,-0.381367,-1.092756,-0.329965,-0.449016,-0.025966,-0.097124,0.115147,-0.745745,0.39677,0.390946,-0.203624,0.530506,0.317422,-1.065472,…,0.479295,0.197936,-0.287452,-0.013139,-0.389713,0.430592,0.335071,-0.380843,0.295854,0.423726,-0.225374,-0.008997,-0.849865,0.346232,-0.605132,0.107615,-0.361664,-0.381799,-0.208291,0.080691,0.642929,-0.099032,0.289986,-0.093268,-0.340186,0.355064,0.017071,-0.071143,-0.547553,0.31032,0.310858,0.056379,-0.703524,-0.392997,0.362656,0.057918,-0.109119
-0.175948,0.251044,0.57868,-0.18834,-0.134889,-0.674244,-0.575561,0.596989,-0.457235,0.327338,-0.155796,-0.347386,-0.030644,0.294292,-0.489563,0.245185,-0.291255,-0.444115,-0.018272,-0.428581,0.361373,0.204568,-0.167897,-0.371942,-0.465282,-0.053389,-0.552517,0.244529,0.130768,0.177434,-0.778337,0.366748,0.278908,-0.230686,0.251298,0.522265,-0.693394,…,0.422927,0.448021,-0.464064,-0.356252,-0.192774,0.453984,0.569944,-0.287805,-0.046933,0.214438,-0.467071,-0.366539,-0.498214,-0.211967,-0.563618,0.142657,0.076739,-0.506741,-0.293737,0.22705,0.485774,0.017933,0.501325,-0.172824,0.022931,0.570613,-0.155529,-0.140879,-0.513497,0.158303,-0.383541,0.091375,-0.15397,-0.578252,-0.189774,0.058405,0.032852
0.178135,0.022982,-0.213848,-0.133932,-0.852305,-0.104335,-0.79042,-0.073752,-0.671372,0.704523,0.182869,0.044848,-0.675904,0.246938,-0.094934,-0.008603,-0.457433,-0.445713,-0.627258,-0.3387,0.520883,0.579376,-0.401504,-0.198822,-0.288183,0.074724,-0.552751,0.093068,-0.066773,0.287742,-0.416837,0.224975,0.255587,0.045755,0.200477,0.336002,-0.642228,…,0.352373,0.339299,-0.560981,-0.674634,-0.310045,0.19487,0.309852,-0.43044,0.129791,-0.055731,-0.274468,-0.363785,-0.830623,-0.102053,-0.580149,-0.296471,-0.080918,-0.57889,-0.538502,-0.042045,0.373307,-0.347435,0.662217,0.044096,0.038719,0.321522,-0.109339,-0.389375,-0.469735,0.111092,0.039666,0.227299,-0.457352,-0.4216,-0.079483,-0.201947,0.242044
0.599185,0.025005,0.356816,-0.978445,-0.614492,-0.609616,-0.604118,0.605126,-0.815449,0.662975,-0.076576,0.013116,-0.258014,-0.021262,-0.556723,0.222125,-0.398456,-0.281881,-0.231095,-0.2412,0.892776,0.855575,-0.629748,-0.613585,-0.21441,-0.448102,0.011035,-0.124304,0.442128,0.012933,-0.339237,0.132372,-0.17417,0.627024,0.054415,-0.438037,-0.620077,…,0.257782,0.306392,-0.281901,-0.171799,-0.144764,0.13587,0.487954,0.373235,0.056123,0.543401,-0.376073,0.424472,-0.599846,-0.005143,-0.274084,-0.557398,-0.475633,-0.241881,-0.11527,0.134698,0.305313,-0.025863,0.545038,-0.462512,-0.004291,0.423122,-0.359964,-0.287656,-0.107858,-0.265118,-0.084689,0.029696,-0.032281,-0.457986,-0.23115,0.613272,0.015541
0.236955,0.260596,0.040948,-0.410093,-0.455945,-0.404472,-0.930851,0.227978,-0.649143,0.368169,-0.080791,0.121134,-0.302288,-0.250972,-0.10699,-0.127647,-0.374245,-0.393302,-0.075611,-0.194209,0.815483,0.415962,-0.537307,-0.121029,-0.408175,-0.314397,-0.017073,-0.124826,0.399553,-0.055004,-0.494988,0.658587,-0.188438,0.664433,0.30551,-0.005952,-1.108427,…,0.372246,0.528815,-0.388967,-0.200894,-0.200967,0.362474,0.061914,-0.283254,-0.0909,0.887724,0.196602,0.154666,-0.512238,0.026876,-0.140718,-0.105519,-0.424078,-0.525434,-0.437513,-0.095445,0.798837,-0.19612,0.437806,-0.171648,-0.03646,0.203062,0.192346,0.034439,-0.083275,0.113247,-0.162199,-0.222194,-0.356362,-0.290594,-0.165772,0.217507,0.070869
-0.32656,0.241863,-0.010343,-0.071194,-0.114975,-0.455396,-0.167113,-0.05485,-0.779437,-0.039307,0.181978,-0.217534,0.256694,0.301415,0.105846,-0.276178,0.230826,-0.159156,-0.141713,-0.120152,0.33321,0.715146,-0.028786,-0.30931,-0.811367,0.310113,-0.166017,0.015034,0.602633,-0.40989,-0.073203,0.629144,-0.005542,0.343271,-0.250643,0.042482,-0.521132,…,0.272225,0.162943,0.110473,0.100124,0.276004,-0.10689,0.312849,0.103457,0.050042,0.12044,-0.148665,-0.104692,-0.149209,-0.05543,-0.352111,0.021126,-0.226647,0.065913,0.018696,-0.13437,0.339083,0.260341,0.185274,-0.263954,-0.266613,-0.25394,-0.158179,0.130954,-0.379301,-0.009433,0.06357,0.629039,-0.254832,-0.057408,-0.091928,0.18714,-0.01406


In [18]:
# One category name per edited document: the single word most similar to the
# document vector. topn=1 asks for just that word instead of building the
# default top-10 list and discarding nine entries.
cat2 = [w2v.wv.most_similar(doc_vector, topn=1)[0][0] for doc_vector in df2.to_numpy()]
cat2

['ciudad',
 'negocio',
 'producto',
 'herramienta',
 'material',
 'reparación',
 'acabado',
 'asesoramiento',
 'calidad',
 'lunes',
 'obstante',
 'compra',
 'cliente',
 'precio',
 'contenido',
 'privacidad']

In [19]:
# Number of distinct category names; equal to len(cat2) when no name repeats.
len(set(cat2))

16

In [None]:
# Save the document vectors of the edited data set as CSV.
df2.write_csv("edited_data_vectors.csv")