## Imports

In [1]:
import os
import numpy as np
import pandas as pd

## Fetch Data

In [2]:
data_dir = os.path.join(os.getcwd(), 'data')

In [3]:
data_path = os.path.join(data_dir, 'ratings_Electronics.csv')

In [4]:
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,AKM1MP6P0OYPR,0132793040,5.0,1365811200
0,A2CX7LUOHB2NDG,321732944,5.0,1341100800
1,A2NWSAGRHCP8N5,439886341,1.0,1367193600
2,A2WNBOD3WNDNKT,439886341,3.0,1374451200
3,A1GI0U4ZRJA8WN,439886341,1.0,1334707200
4,A1QGNMC6O1VW39,511189877,5.0,1397433600


## Cleaning Data

- The column header is seems like an observation; this has to be dropped as a row in the dataframe
- columns should be renamed appropriately
- drop duplicates
- ensure data types are appropriate for each column

In [5]:
def clean_data(data):
	temp = (
		data
		.pipe(lambda df_: pd.concat([
			pd.DataFrame(df_.columns.values.reshape(1, -1)),
			df_.set_axis([0, 1, 2, 3], axis=1)
		], axis=0))
		.set_axis(['user_id', 'product_id', 'rating', 'timestamp'], axis=1)
		.drop_duplicates()
	)
	return temp.assign(
		user_id=lambda df_: df_.user_id.str.strip(),
		product_id=lambda df_: df_.product_id.str.strip(),
		rating=lambda df_: pd.to_numeric(df_.rating),
		timestamp=lambda df_: pd.to_numeric(df_.timestamp)
	)

In [6]:
df_cleaned = clean_data(df)
df_cleaned

Unnamed: 0,user_id,product_id,rating,timestamp
0,AKM1MP6P0OYPR,0132793040,5.0,1365811200
0,A2CX7LUOHB2NDG,0321732944,5.0,1341100800
1,A2NWSAGRHCP8N5,0439886341,1.0,1367193600
2,A2WNBOD3WNDNKT,0439886341,3.0,1374451200
3,A1GI0U4ZRJA8WN,0439886341,1.0,1334707200
...,...,...,...,...
7824476,A2YZI3C9MOHC0L,BT008UKTMW,5.0,1396569600
7824477,A322MDK0M89RHN,BT008UKTMW,5.0,1313366400
7824478,A1MH90R0ADMIK0,BT008UKTMW,4.0,1404172800
7824479,A10M2KEFPEQDHN,BT008UKTMW,4.0,1297555200


In [7]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7824482 entries, 0 to 7824480
Data columns (total 4 columns):
 #   Column      Dtype  
---  ------      -----  
 0   user_id     object 
 1   product_id  object 
 2   rating      float64
 3   timestamp   int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 298.5+ MB


## Split into Train and Test subsets

- Dataset has about 8 million observations
- Train: 95% of data
- Test: 5% of data

In [8]:
n_rows = df_cleaned.shape[0]
n_rows

7824482

In [9]:
random_indicies = np.random.permutation(n_rows)

In [10]:
n_train = int(0.95 * n_rows)
n_train

7433257

In [11]:
df_train = df_cleaned.iloc[random_indicies[:n_train]]
df_test = df_cleaned.iloc[random_indicies[:n_train]]

In [12]:
print(f"Train data shape: {df_train.shape}")
print(f"Test data shape: {df_test.shape}")

Train data shape: (7433257, 4)
Test data shape: (7433257, 4)


## Export Train and Test data

In [13]:
train_data_path = os.path.join(data_dir, 'train.parquet')
test_data_path = os.path.join(data_dir, 'test.parquet')

In [14]:
df_train.to_parquet(train_data_path, index=False)
df_test.to_parquet(test_data_path, index=False)