In [1]:
# Mount Google Drive inside the Colab runtime; the data file read later in this
# notebook lives under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split

In [13]:
# Load the interaction table from the mounted Drive. The following cells read its
# `user_id`, `product_id` and `rating` columns.
df = pd.read_parquet('/content/drive/MyDrive/ecommerce/2019-Oct-EDA.parquet')

In [4]:
# Distinct raw ids, in the order returned by Series.unique().
user_unique = df.user_id.unique()
product_unique = df.product_id.unique()

# Lookup tables: raw id -> position of that id in the matching *_unique array.
# These positions are what later cells use as sparse-matrix row/column indices.
user_to_idx = dict(zip(user_unique, range(len(user_unique))))
product_to_idx = dict(zip(product_unique, range(len(product_unique))))

In [5]:
# Translate raw user ids into their dense index. Ids missing from the lookup
# come back as None and are removed by dropna(), so a shorter result means
# "not every row could be translated" and the column is left as it was.
# NOTE(review): that skip is silent -- if it triggers, the raw ids stay in
# the column and are later used as matrix indices. It also triggers when this
# cell is re-run on an already translated frame; confirm which case is intended.
temp_user_data = df.user_id.map(user_to_idx.get).dropna()
if temp_user_data.shape[0] == df.shape[0]:
    df['user_id'] = temp_user_data

# Same translation for product ids.
temp_product_data = df.product_id.map(product_to_idx.get).dropna()
if temp_product_data.shape[0] == df.shape[0]:
    df['product_id'] = temp_product_data

In [7]:
# Matrix dimensions: one row per distinct user id, one column per distinct product id.
shape = tuple(len(lookup) for lookup in (user_to_idx, product_to_idx))

In [8]:
# Hold out 30% of the rows for testing.
# random_state is fixed so that a fresh "Restart & Run All" yields the same
# partition; without it every run shuffles differently and the matrices built
# from df_train / df_test change from run to run.
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)

In [9]:
# Give both parts a fresh 0..len-1 index, discarding the row labels inherited from df.
df_train, df_test = (part.reset_index(drop=True) for part in (df_train, df_test))

In [10]:
# Sparse user x product matrix of the training ratings: each row of df_train
# contributes `rating` at position (user_id, product_id).
# NOTE(review): scipy adds up the values of repeated (row, col) pairs when
# building the matrix -- confirm that is wanted if df_train can hold the same
# user/product pair more than once.
csr_train = csr_matrix(
    (df_train['rating'], (df_train['user_id'], df_train['product_id'])),
    shape=shape,
)
csr_train

<2962411x166084 sparse matrix of type '<class 'numpy.float64'>'
	with 17753619 stored elements in Compressed Sparse Row format>

In [11]:
# Sparse user x product matrix of the held-out ratings, built with the same
# `shape` as the training matrix so both share one index space.
# NOTE(review): scipy adds up the values of repeated (row, col) pairs when
# building the matrix -- confirm that is wanted if df_test can hold the same
# user/product pair more than once.
csr_test = csr_matrix(
    (df_test['rating'], (df_test['user_id'], df_test['product_id'])),
    shape=shape,
)
csr_test

<2962411x166084 sparse matrix of type '<class 'numpy.float64'>'
	with 9313085 stored elements in Compressed Sparse Row format>

In [3]:
def sparse_matrix_split(fpath, n, random_state=None):
  """Load an interaction table and return train/test rating matrices.

  The parquet file must provide ``user_id``, ``product_id`` and ``rating``
  columns. Raw ids are re-encoded as dense 0-based codes (numbered in order
  of first appearance), the rows are randomly partitioned, and each part is
  turned into a sparse matrix. Both matrices share the same shape
  ``(number of distinct users, number of distinct products)``.

  Parameters
  ----------
  fpath
      Location of the parquet file, handed to ``pd.read_parquet``.
  n
      Forwarded to ``train_test_split`` as ``test_size`` (e.g. ``0.3`` holds
      out 30% of the rows).
  random_state
      Forwarded to ``train_test_split``. The default ``None`` keeps the
      previous behaviour (a different shuffle on every call); pass an int to
      get a reproducible split.

  Returns
  -------
  tuple
      ``(csr_train, csr_test)``, two ``scipy.sparse.csr_matrix`` objects.

  Raises
  ------
  ValueError
      If ``user_id`` or ``product_id`` contains missing values, because such
      rows cannot be given a matrix position.
  """
  # Only these three columns are used, so do not load the rest of the file.
  df = pd.read_parquet(fpath, columns=['user_id', 'product_id', 'rating'])

  # factorize() numbers values 0, 1, 2, ... in order of first appearance --
  # the same numbering as enumerate(series.unique()) -- without a Python-level
  # lookup per row. Missing values are reported with the sentinel code -1.
  user_idx, user_unique = pd.factorize(df['user_id'])
  product_idx, product_unique = pd.factorize(df['product_id'])

  # Fail loudly instead of silently carrying unusable ids into the matrices.
  if (user_idx < 0).any() or (product_idx < 0).any():
    raise ValueError(
        'user_id / product_id contain missing values; '
        'cannot assign matrix indices to those rows')

  shape = (len(user_unique), len(product_unique))

  # Split the three aligned arrays together: every array is partitioned with
  # the same row selection, and no copy of the full DataFrame is needed.
  (user_train, user_test,
   product_train, product_test,
   rating_train, rating_test) = train_test_split(
      user_idx, product_idx, df['rating'].to_numpy(),
      test_size=n, random_state=random_state)

  # NOTE(review): scipy adds up the values of repeated (row, col) pairs --
  # confirm that is wanted if the file can hold the same user/product pair
  # more than once.
  csr_train = csr_matrix((rating_train, (user_train, product_train)), shape=shape)
  csr_test = csr_matrix((rating_test, (user_test, product_test)), shape=shape)

  return csr_train, csr_test

In [4]:
# Parquet file on the mounted Drive that is handed to sparse_matrix_split.
file_path = '/content/drive/MyDrive/ecommerce/2019-Oct-EDA.parquet'

In [6]:
# Build the train/test matrices. 0.3 is passed as `n`, which
# sparse_matrix_split forwards to train_test_split as test_size.
train, test = sparse_matrix_split(file_path, 0.3)

In [7]:
# Summary of the training matrix: shape, dtype and number of stored elements.
train

<2962411x166084 sparse matrix of type '<class 'numpy.float64'>'
	with 17750343 stored elements in Compressed Sparse Row format>

In [8]:
# Summary of the held-out matrix: shape, dtype and number of stored elements.
test

<2962411x166084 sparse matrix of type '<class 'numpy.float64'>'
	with 9313738 stored elements in Compressed Sparse Row format>