In [1]:
import pandas as pd
import json

df_flats_to_buy = pd.read_excel('flats_to_buy.xlsx')

print(df_flats_to_buy.columns)


Index(['Url', 'TranactionType', 'Title', 'Object_price', 'Object_currency',
       'LivingSpace', 'Rooms', 'ConstructionYear', 'Object_features',
       'Address', 'Price', 'MediaItems', 'ContactData', 'BasicInfo'],
      dtype='object')


In [12]:

first_transaction_type = df_flats_to_buy['TranactionType'].iloc[0]
print(first_transaction_type)

{"EstateTypeGerman": "WOHNUNG", "DistributionTypeGerman": "ZUM_KAUF", "EstateType": "APARTMENT", "DistributionType": "BUY"}


In [2]:
import preprocessing_methods as pm

df_flats_to_buy = pm.extract_TranactionType_columns(df_flats_to_buy)

print(df_flats_to_buy.columns)

Index(['Url', 'Title', 'Object_price', 'Object_currency', 'LivingSpace',
       'Rooms', 'ConstructionYear', 'Object_features', 'Address', 'Price',
       'MediaItems', 'ContactData', 'BasicInfo', 'EstateTypeGerman',
       'DistributionTypeGerman', 'EstateType', 'DistributionType'],
      dtype='object')


In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Create an instance of CountVectorizer
vectorizer = CountVectorizer(binary=True)

# Fit the vectorizer on your data
corpus = df_flats_to_buy['Object_features'].tolist()
X = vectorizer.fit_transform(corpus)

# Convert the sparse matrix to an array
X = X.toarray()

# Get the feature names from the fitted CountVectorizer object
feature_names = vectorizer.get_feature_names_out()

# Create a DataFrame with the BoW representation
bow_df = pd.DataFrame(X, columns=feature_names)


In [7]:
print(bow_df.columns)

Index(['1945', 'abstellraum', 'als', 'altbau', 'bad', 'balkon', 'barriefrei',
       'be', 'bis', 'block', 'carport', 'dachgeschoss', 'denkmalgeschuetzt',
       'dielen', 'dsl', 'duplex', 'dusche', 'einbauk', 'elektro', 'entl',
       'erdgeschoss', 'erlaubt', 'erstbezug', 'estrich', 'etagenheizung',
       'fenster', 'ferienimmobilie', 'fern', 'ferne', 'fliesen', 'fluessiggas',
       'frei', 'fu', 'gaestewc', 'garage', 'garten', 'gartennutzung', 'gas',
       'geeignet', 'gepflegt', 'getrennt', 'haustiere', 'holzfenster',
       'kable_sat_tv', 'kamera', 'kamin', 'kapitalanlage', 'kelleranteil',
       'kfw40', 'kfw55', 'kfw70', 'kontrollierte', 'kunststoff',
       'kunststofffenster', 'laminat', 'linoleum', 'loggia', 'luftwp',
       'massivhaus', 'moebliert', 'neubau', 'neuwertig', 'oel', 'offene',
       'pantry', 'parkett', 'pellet', 'personenaufzug', 'provisionsfrei',
       'reinigung', 'renoviert', 'renovierungsbed', 'rollstuhlgerecht', 'sat',
       'sauna', 'souterrain', '

In [11]:

# Get the unique tokens from the "Object_features" column
vocabulary = set()
for entry in df_flats_to_buy['Object_features']:
    vocabulary.update(entry)

# Convert the vocabulary set to a list
vocabulary = list(vocabulary)

# Print the vocabulary
print(vocabulary)

['C', 'R', 'f', 'S', 'O', 'G', 'n', '4', 'e', '"', 'W', '\\', 'I', 'y', 'g', 't', 'p', 'd', 'r', ' ', ',', '(', 'K', '-', 'u', '9', 'v', 'Z', 'l', 'B', 'L', 'b', 'A', '5', 'D', ']', '[', '6', 'o', ')', 'E', 'i', 'w', '0', 'N', '/', 'H', '1', 'x', 'V', 'c', 'T', 'U', 'h', 'a', 'z', 'k', '_', '7', 's', 'F', 'm', 'M', 'P']
