# A sparse matrix is a matrix in which most of the elements are zero. Conversely, a matrix in which the majority of elements are non-zero is called dense.

In [None]:
import pandas as pd

In [None]:
# Load the tab-separated data file. header=None tells pandas the file has no
# header row, so the columns get integer labels until they are renamed.
data = pd.read_csv(
    'mlexample/data.csv',
    sep="\t",
    header=None,
    engine='python',
)

In [None]:
# Give the four columns of the frame descriptive names.
data.columns = [
    "user_id",
    "item_id",
    "rating",
    "timestamp",
]

In [None]:
# Remove the timestamp column in place.
data.drop(columns=["timestamp"], inplace=True)

In [None]:
# Preview the first five rows of the data frame.
display(data.head(n=5))

In [None]:
# Show the data frame's dimensions as (rows, columns).
display(data.shape)

In [None]:
# Multiplier that converts a byte count into megabytes (1 MB = 10**6 bytes).
BYTES_TO_MB_DIV = 0.000001


def print_memory_usage_of_data_frame(df):
    """
    Print the total memory usage of a data frame in megabytes.

    The per-column (and index) byte counts reported by ``df.memory_usage()``
    are summed, converted to MB and rounded to three decimal places.

    :param df: pandas data frame
    :return: None
    """
    total_bytes = df.memory_usage().sum()
    megabytes = round(total_bytes * BYTES_TO_MB_DIV, 3)
    print(f"Memory usage is {megabytes!s} MB")

In [None]:
# Report the memory footprint of the raw data frame as a baseline.
print_memory_usage_of_data_frame(data)

In [None]:
# get_dummies converts categorical variables into indicator variables: every
# distinct user_id and item_id value becomes its own column.
data_one_hot = pd.get_dummies(
    data,
    columns=['user_id', 'item_id'],
)
display(data_one_hot.head(n=5))
display(data_one_hot.shape)
print_memory_usage_of_data_frame(data_one_hot)

# Pandas Sparse Structures: Pandas provides data structures for efficient storage of sparse data. In these structures, zero values (or any other specified value) are not actually stored in the array.

In [None]:
def convert_to_sparse_pandas(df, exclude_columns=None):
    """
    Converts columns of a data frame into SparseArrays and returns the data frame with transformed columns.
    Use exclude_columns to specify columns to be excluded from transformation.

    The input data frame is left untouched; the conversion is applied to a copy.
    Every converted column is cast to ``uint8``.

    :param df: pandas data frame
    :param exclude_columns: iterable of column names, or None
        Columns not to be converted to sparse. None (the default) converts every column.
    :return: pandas data frame
    """
    # A None default avoids sharing one mutable list object between calls.
    excluded = set(exclude_columns) if exclude_columns is not None else set()
    df = df.copy()

    # DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
    for column_name, column_data in df.items():
        if column_name in excluded:
            continue
        # pd.arrays.SparseArray is the supported location of the class (the
        # top-level pd.SparseArray alias was removed in pandas 1.0). With an
        # integer dtype the default fill value is 0, so only the non-zero
        # entries of the one-hot encoded columns are actually stored.
        df[column_name] = pd.arrays.SparseArray(column_data.values, dtype='uint8')

    return df

In [None]:
# Store every one-hot encoded column sparsely, keeping 'rating' as a regular column.
data_one_hot_sparse = convert_to_sparse_pandas(data_one_hot, exclude_columns=['rating'])
display(data_one_hot_sparse.dtypes)
# Use the memory helper defined in this notebook; it reports via
# DataFrame.memory_usage(), which also works for sparse columns.
print_memory_usage_of_data_frame(data_one_hot_sparse)

#### It is possible to create a sparse data frame directly by using the `sparse` parameter of pandas `get_dummies`. This parameter defaults to False. If True, the encoded columns are returned as SparseArrays. By setting `sparse=True` we obtain the sparse data frame without first holding a dense one-hot encoded data frame in memory.

In [None]:
# sparse=True asks get_dummies to return the indicator columns as sparse
# columns straight away.
data_one_hot_sparse = pd.get_dummies(
    data,
    columns=['user_id', 'item_id'],
    sparse=True,
)
display(data_one_hot_sparse.dtypes)
print_memory_usage_of_data_frame(data_one_hot_sparse)