In [1]:
import config as cfg
import pandas as pd

In [2]:
# Load the cleaned dataset and give it a fresh 0..n-1 index
# (drop=True discards the old index instead of keeping it as a column).
df = pd.read_pickle(cfg.PATHS.CLEAN_DATA_PICKLE).reset_index(drop=True)

# categories_list

In [3]:
# Collect every distinct category name found in the ", "-separated
# `categories_list_string` column and map each one to its own empty list.
# Keys are kept in first-seen order.
categories = {
    name: []
    for joined in df["categories_list_string"]
    for name in joined.split(", ")
}

In [4]:
# Turn the per-row category strings into 0/1 indicator lists: after this cell
# categories[name][i] is 1 when row i lists `name` and 0 otherwise.
#
# Each list is rebuilt from scratch (pre-filled with 0), so re-running the
# cell cannot append a second set of flags to the existing ones. Only the
# categories a row actually contains are touched, instead of scanning every
# known category for every row.
n_rows = len(df)
categories = {name: [0] * n_rows for name in categories}
for row_pos, joined in enumerate(df["categories_list_string"]):
    for name in joined.split(", "):
        categories[name][row_pos] = 1

In [5]:
# Sanity check: report every category whose flag list does not have exactly
# one entry per row of df. Nothing is printed when all lengths match.
expected_len = df.shape[0]
for name, flags in categories.items():
    if len(flags) == expected_len:
        continue
    print(name, len(flags))

In [6]:
# The category strings now live in `categories`; remove the raw column.
df = df.drop(columns=["categories_list_string"])

# text

## Tokenize the text and remove stop words

In [None]:
# Plain Python list of the raw review texts, in row order.
texts = df["text"].to_list()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a sparse TF-IDF document-term matrix from the raw texts.
# stop_words="english" turns on scikit-learn's built-in English stop-word
# list; with the default (stop_words=None) no stop words are removed, which
# contradicts what this section sets out to do.
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(texts)

In [None]:
# Remove the raw text column now that `texts` holds a copy of it.
# NOTE(review): the spacy and bert cells further down read df["text"];
# running them after this cell raises KeyError.
df = df.drop(columns=["text"])

In [None]:
# Append the TF-IDF features to df as extra columns.
# X is kept sparse: X.toarray() would allocate a dense rows x vocabulary
# matrix. from_spmatrix wraps it column by column in pandas sparse dtypes;
# columns keep the default integer labels, as before. index=df.index keeps
# the rows aligned with df for the concat.
tfidf_frame = pd.DataFrame.sparse.from_spmatrix(X, index=df.index)
df = pd.concat([df, tfidf_frame], axis=1)

## embeddings

### spacy

In [7]:
import spacy
from tqdm import tqdm

nlp = spacy.load("en_core_web_lg")

# Convert all texts into spaCy docs.
# nlp.pipe streams the texts through the pipeline in batches, which is
# faster than calling nlp(text) once per row. It returns a generator, so
# tqdm needs an explicit total to show progress.
# NOTE(review): the next cell only reads doc.vector; if nothing else needs
# the full annotations, disabling unused pipeline components would cut the
# runtime further. Confirm before changing.
docs = list(tqdm(nlp.pipe(df["text"], batch_size=256), total=len(df)))

 40%|███▉      | 556213/1398049 [2:43:30<4:56:06, 47.38it/s]  

: 

: 

In [None]:
# Pull the document-level vector out of every spaCy doc, preserving order.
embeddings = []
for doc in tqdm(docs):
    embeddings.append(doc.vector)

100%|██████████| 139805/139805 [00:28<00:00, 4938.47it/s]


In [None]:
# Remove the raw text column once the spaCy docs have been built from it.
# NOTE(review): the bert cell below reads df["text"]; running it after this
# cell raises KeyError.
df = df.drop(columns=["text"])

### gensim (TODO: not implemented yet)

### bert

In [3]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("bert-base-nli-mean-tokens")

# Encode every review text into a sentence embedding.
# A plain list of strings is passed instead of the pandas Series, so the
# encoder's positional indexing cannot be confused with the Series' labels.
embeddings = model.encode(df["text"].tolist())

Downloading (…)821d1/.gitattributes:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)8d01e821d1/README.md:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading (…)d1/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)01e821d1/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)821d1/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading (…)8d01e821d1/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)1e821d1/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

## bag of words (TODO: not implemented yet)

# review_count

In [None]:
# Add the natural log of review_count as a new feature column.
import numpy as np

# np.log is applied to the whole column at once instead of once per row
# through a Python lambda; the resulting values are the same.
df["log_review_count"] = np.log(df["review_count"])

# city

In [None]:
# One-hot encode `city`: the column is replaced by one indicator column per
# distinct value.
df = pd.get_dummies(data=df, columns=["city"])

# Put it all back together

In [None]:
# Wrap the embeddings and the category indicators in DataFrames.
# Both are pinned to df's index: the later column-wise concat matches rows on
# index, so a feature block with a different number of rows now raises here
# instead of silently yielding NaN-padded rows.
df_embeddings = pd.DataFrame(embeddings, index=df.index)
df_categories = pd.DataFrame(categories, index=df.index)

In [None]:
# Join the base features, the text embeddings and the category indicators
# side by side (rows are matched on index).
frames = [df, df_embeddings, df_categories]
df_new = pd.concat(frames, axis="columns")

In [None]:
# Show the shape of each piece and of the combined frame.
tuple(frame.shape for frame in (df_embeddings, df_categories, df, df_new))

((139805, 300), (139805, 1221), (139805, 931), (139805, 2452))

In [None]:
# Persist the assembled feature frame. `df_new` is the concatenation of df,
# the embeddings and the category indicators; saving `df` would write the
# base features only and drop the other two blocks.
df_new.to_pickle(cfg.PATHS.TRANSFORMED_DATA_PICKLE)