In [1]:
import config as cfg
import pandas as pd

In [2]:
# Read the cleaned dataset from the configured pickle path and replace its
# index with a fresh 0..n-1 range (the previous index is discarded).
df = pd.read_pickle(cfg.PATHS.CLEAN_DATA_PICKLE).reset_index(drop=True)

# categories_list

In [3]:
# Map every distinct category name (the ", "-separated pieces of each row's
# `categories_list_string`) to its own empty list.
categories = {
    name: []
    for row_value in df["categories_list_string"]
    for name in row_value.split(", ")
}

In [4]:
# Build one 0/1 indicator list per category, with one entry per row of `df`.
#
# The lists are emptied first so that re-running this cell does not append a
# second set of flags (which would double every list's length and break the
# later `pd.DataFrame(categories)` / length check).
for flags in categories.values():
    flags.clear()

for cat in df["categories_list_string"]:
    # Categories present in this row; a set gives O(1) membership checks.
    present = set(cat.split(", "))
    # Exactly one append per category per row: 1 if present, 0 otherwise.
    for name, flags in categories.items():
        flags.append(1 if name in present else 0)

In [5]:
# Sanity check: print any category whose indicator list does not have
# exactly one entry per row of `df`. No output means every list lines up.
expected_rows = len(df)
for name, flags in categories.items():
    n_flags = len(flags)
    if n_flags != expected_rows:
        print(name, n_flags)

In [6]:
# Remove the raw category string column from `df` in place.
df.drop(labels=["categories_list_string"], axis="columns", inplace=True)

# text

## Tokenize the text and remove stop words

In [None]:
# Pull the `text` column out of the frame as a plain Python list.
texts = df.loc[:, "text"].to_list()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on `texts` and transform them in the same call;
# `X` holds the resulting document-term matrix.
# NOTE(review): the vectorizer is built with no arguments, so no stop-word
# option is passed here even though the section heading mentions stop-word
# removal — confirm whether `stop_words=...` was intended.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(raw_documents=texts)

In [None]:
# Remove the raw `text` column from `df` in place.
df.drop(labels=["text"], axis="columns", inplace=True)

In [None]:
# Append the TF-IDF features to `df` column-wise.
#
# `X` is kept sparse: `X.toarray()` would allocate a dense
# (n_rows x vocabulary_size) array, which is almost entirely zeros and can
# easily exhaust memory. The sparse-backed frame holds the same values, and
# its columns are still the integer positions 0..n_features-1.
# Passing `index=df.index` makes the row alignment with `df` explicit.
tfidf_features = pd.DataFrame.sparse.from_spmatrix(X, index=df.index)
df = pd.concat([df, tfidf_features], axis=1)

## embeddings

### spacy

In [7]:
import spacy
from tqdm import tqdm

nlp = spacy.load("en_core_web_lg")

# Convert all texts into spaCy docs.
# `nlp.pipe` streams the texts through the pipeline in batches instead of
# invoking the full pipeline once per row with `nlp(text)`, which is the
# slow path for large collections. The resulting docs are the same.
# tqdm wraps the input, so the bar advances as texts are fed to the pipeline.
# Requires the `text` column to still be present in `df`.
docs = list(nlp.pipe(tqdm(df["text"])))

 40%|███▉      | 556213/1398049 [2:43:30<4:56:06, 47.38it/s]  

: 

: 

In [None]:
# Collect each doc's `.vector` into a list, in the same order as `docs`.
embeddings = []
for parsed_doc in tqdm(docs):
    embeddings.append(parsed_doc.vector)

100%|██████████| 139805/139805 [00:28<00:00, 4938.47it/s]


In [8]:
# The raw text is no longer needed in the frame; remove the column in place.
df.drop(labels=["text"], axis="columns", inplace=True)

### gensim

### bert

In [7]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("bert-base-nli-mean-tokens")

# Encode every text into a sentence embedding.
# `encode` is given a plain list of strings rather than the pandas Series:
# positional lookups on a Series go through its index labels, so passing the
# Series only works when the index happens to be exactly 0..n-1.
# Requires the `text` column to still be present in `df`.
embeddings = model.encode(df["text"].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/6974 [00:00<?, ?it/s]

## bag of words

# review_count

In [9]:
# Add a column holding the natural log of `review_count`.
import numpy as np

# np.log is applied to the whole column at once (element-wise).
df["log_review_count"] = np.log(df["review_count"])

# city

In [10]:
# One-hot encode `city`: the column is replaced by one indicator column per
# distinct value, and every other column is carried over unchanged.
dummy_source_columns = ["city"]
df = pd.get_dummies(data=df, columns=dummy_source_columns)

# Put it all back together

In [11]:
# Turn the {category name: 0/1 list} mapping into a frame with one column
# per category.
df_categories = pd.DataFrame(data=categories)

In [12]:
# Place the category indicator columns next to the existing features.
# Rows are matched on index labels.
df_new = pd.concat((df, df_categories), axis="columns")

In [13]:
# Show the (rows, columns) shape of the category frame, the feature frame,
# and the combined frame, in that order.
tuple(frame.shape for frame in (df_categories, df, df_new))

((223152, 1250), (223152, 946), (223152, 2196))

In [15]:
import pickle

# Persist the transformed features and the text embeddings.
#
# The combined frame `df_new` (features plus category indicators) is saved;
# pickling `df` here would silently leave out every category column that was
# just concatenated in.
# NOTE(review): confirm downstream readers expect the combined frame.
df_new.to_pickle(cfg.PATHS.TRANSFORMED_DATA_PICKLE)

# The context manager guarantees the file is flushed and closed even if
# pickling raises, instead of leaving an open handle behind.
with open(cfg.PATHS.EMBEDDINGS, "wb") as embeddings_file:
    pickle.dump(embeddings, embeddings_file)