In [1]:
from zipfile import ZipFile

import pandas as pd
import numpy as np

from LinearRegressionRecSys import LinearRegressionRecSys

# Fix NumPy's global random seed so the np.random.choice sampling further down is repeatable.
np.random.seed(42)

Read datasets:

In [2]:
%%time
# Load the full market dataset straight out of its zip archive.
# Both the archive and the member are managed by the `with` statement: closing only
# the member would leave the archive's own file handle open.
with ZipFile("../data/estaticos_market.csv.zip") as archive, archive.open("estaticos_market.csv") as dataset:
    # The first CSV column is used as the row index.
    market_df = pd.read_csv(dataset, index_col=0)

Wall time: 7.84 s


In [3]:
%%time
# Read the eight bz2-compressed partitions of the company profile table and
# concatenate them into a single frame indexed by the "id" column.
dataset_files = [
    pd.read_csv("../output/" + f"companies_profile_{file_idx}.bz2", compression="bz2")
    for file_idx in range(8)
]
companies_profile = pd.concat(dataset_files, axis=0, ignore_index=True).set_index("id")

Wall time: 52.6 s


In [4]:
# Cluster labels (the first CSV column is used as the row index).
cluster_labels = pd.read_csv("../output/" + "cluster_labels.zip", compression="zip", index_col=0)

# Portfolios: only the "id" column is loaded from each file.
portfolio1, portfolio2, portfolio3 = (
    pd.read_csv("../data/" + portfolio_file, usecols=["id"])
    for portfolio_file in (
        "estaticos_portfolio1.csv",
        "estaticos_portfolio2.csv",
        "estaticos_portfolio3.csv",
    )
)

# Stack the three portfolios into one frame listing every client id.
all_clients = pd.concat([portfolio1, portfolio2, portfolio3], axis=0, ignore_index=True)

Size of datasets:

In [5]:
# Summarise the full cluster-label table: row count, column dtypes and memory usage.
cluster_labels.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 462298 entries, 0 to 462297
Data columns (total 1 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   cluster  462298 non-null  int64
dtypes: int64(1)
memory usage: 7.1 MB


In [6]:
# Summarise the full market table: row count, column dtypes and memory usage.
market_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 462298 entries, 0 to 462297
Columns: 181 entries, id to qt_filiais
dtypes: bool(9), float64(144), int64(1), object(27)
memory usage: 614.1+ MB


In [7]:
# Summarise the full company-profile table: row count, column dtypes and memory usage.
companies_profile.info()

<class 'pandas.core.frame.DataFrame'>
Index: 462298 entries, a6984c3ae395090e3bee8ad63c3758b110de096d5d819583a784a113726db849 to 3d43e934e150b86be1e67524f5ba1018b27da9ef25566d9c0607623ae7f25e3a
Columns: 190 entries, PC_1 to PC_190
dtypes: float64(190)
memory usage: 673.7+ MB


In [8]:
# Build the recommender from the client ids, the company profiles and the cluster labels.
recsys = LinearRegressionRecSys(all_clients, companies_profile, cluster_labels)
rating_df = recsys.rating_df # rating table exposed by the recommender
clients_mask = rating_df["client"] # "client" column, used below as a row mask for clients
# NOTE(review): `~` is only a logical NOT if the "client" column is boolean — TODO confirm its dtype.
not_clients_mask = ~clients_mask # complement of the client mask (companies that are not clients)


Testing Portfolio . . .

Database size: 462298
Portfolio size: 1386
Portfolios' ids are in the database



In [9]:
# Number of randomly drawn non-client companies kept per client in the sample.
NOT_CLIENTS_PER_CLIENT = 25

# Positional row numbers of the clients.
clients_index = list(np.where(clients_mask)[0])
# Draw, without replacement, NOT_CLIENTS_PER_CLIENT non-client rows for every client row.
not_clients_rnd_index = list(
    np.random.choice(
        np.where(not_clients_mask)[0],
        size=NOT_CLIENTS_PER_CLIENT * len(clients_index),
        replace=False,
    )
)
# Merge both groups of positions and put them back in ascending row order.
sample_idx = sorted(clients_index + not_clients_rnd_index)

# NOTE(review): the same positions are applied to all three tables, which assumes they
# share the row order of rating_df — TODO confirm.
cluster_labels_sample = cluster_labels.iloc[sample_idx].reset_index(drop=True)
market_df_sample = market_df.iloc[sample_idx].reset_index(drop=True)
# No reset_index here, so the sampled profiles keep their existing index.
companies_profile_sample = companies_profile.iloc[sample_idx]

Sample Sizes:

In [10]:
# Summarise the sampled cluster-label table: row count, column dtypes and memory usage.
cluster_labels_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33228 entries, 0 to 33227
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   cluster  33228 non-null  int64
dtypes: int64(1)
memory usage: 259.7 KB


In [11]:
# Summarise the sampled market table: row count, column dtypes and memory usage.
market_df_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33228 entries, 0 to 33227
Columns: 181 entries, id to qt_filiais
dtypes: bool(9), float64(144), int64(1), object(27)
memory usage: 43.9+ MB


In [12]:
# Summarise the sampled company-profile table: row count, column dtypes and memory usage.
companies_profile_sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 33228 entries, 8bf37c178778e537b108e58bf20552abe10c3a7fae129b04b52506188e7176da to 84db0f17467cf4f02439e01ade57150b6ac59d9dae53edb2f43d68b0e55fb766
Columns: 190 entries, PC_1 to PC_190
dtypes: float64(190)
memory usage: 48.4+ MB


Save samples:

In [13]:
# Write the sampled cluster labels (index included) to a CSV file in the output folder.
cluster_labels_sample.to_csv("../output/cluster_labels_sample.csv")

In [14]:
# Write the sampled market data as a zipped CSV. The archive member is named explicitly
# so the file inside the zip carries a ".csv" name rather than one derived from the
# archive path.
market_df_sample.to_csv(
    path_or_buf="../output/estaticos_market_sample.zip",
    compression={"method": "zip", "archive_name": "estaticos_market_sample.csv"},
)

In [15]:
# Write the sampled company profiles as a zipped CSV. The archive member is named
# explicitly so the file inside the zip carries a ".csv" name rather than one derived
# from the archive path.
companies_profile_sample.to_csv(
    path_or_buf="../output/companies_profile_sample.zip",
    compression={"method": "zip", "archive_name": "companies_profile_sample.csv"},
)