In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import scipy
import gc
from collections import Counter
import warnings
from matplotlib import rcParams

config = {
    "font.family":'Times New Roman',  # 设置字体类型
}

rcParams.update(config)
warnings.filterwarnings(action="ignore")

%matplotlib inline

In [7]:
# 问题：如何优化读入数据的内存占用情况？

# 解释内存
def reduce_mem(df):
    """对于数值类型的数据进行内存节省"""
    
    starttime = time.time()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2  # 统计内存使用情况
    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if pd.isnull(c_min) or pd.isnull(c_max):
                continue
            if str(col_type)[:3] == 'int':
                # 装换数据类型
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
                    
    end_mem = df.memory_usage().sum() / 1024**2
    print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(end_mem,100*(start_mem-end_mem)/start_mem,(time.time()-starttime)/60))
    return df


In [8]:
# user_log = reduce_mem(pd.read_csv("/home/songwq/Documents/workspace/Datasets/data_format1/user_log_format1.csv"))
user_info = reduce_mem(pd.read_csv("/home/songwq/Documents/workspace/Datasets/data_format1/user_info_format1.csv"))
train_data = reduce_mem(pd.read_csv("/home/songwq/Documents/workspace/Datasets/data_format1/train_format1.csv"))
test_data = reduce_mem(pd.read_csv("/home/songwq/Documents/workspace/Datasets/data_format1/test_format1.csv"))

-- Mem. usage decreased to  3.24 Mb (66.7% reduction),time spend:0.00 min
-- Mem. usage decreased to  1.74 Mb (70.8% reduction),time spend:0.00 min
-- Mem. usage decreased to  3.49 Mb (41.7% reduction),time spend:0.00 min


In [10]:
# 问题：如何在pandas读取大批量的数据?

# 数据量过大，采用迭代方法
reader = pd.read_csv("/home/songwq/Documents/workspace/Datasets/data_format1/user_log_format1.csv", iterator=True)
# try:
#     df = reader.get_chunk(100000)
# except StopIteration:
#     print("Iteration is stopped.")
loop = True
chunkSize = 100000
chunks = []

while loop:
    try:
        chunk = reader.get_chunk(chunkSize)
        chunks.append(chunk)
    except StopIteration:
        loop = False
        print("Iteration is stopped.")
        
df = pd.concat(chunks, ignore_index=True)
user_log = reduce_mem(df)
user_log

Iteration is stopped.
-- Mem. usage decreased to 890.48 Mb (69.6% reduction),time spend:0.06 min


Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type
0,328862,323294,833,2882,2660.0,829,0
1,328862,844400,1271,2882,2660.0,829,0
2,328862,575153,1271,2882,2660.0,829,0
3,328862,996875,1271,2882,2660.0,829,0
4,328862,1086186,1271,1253,1049.0,829,0
...,...,...,...,...,...,...,...
54925325,208016,107662,898,1346,7996.0,1110,0
54925326,208016,1058313,898,1346,7996.0,1110,0
54925327,208016,449814,898,983,7996.0,1110,0
54925328,208016,634856,898,1346,7996.0,1110,0


In [11]:
user_log["user_id"].nunique()

424170

In [12]:
user_log.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54925330 entries, 0 to 54925329
Data columns (total 7 columns):
 #   Column       Dtype  
---  ------       -----  
 0   user_id      int32  
 1   item_id      int32  
 2   cat_id       int16  
 3   seller_id    int16  
 4   brand_id     float16
 5   time_stamp   int16  
 6   action_type  int8   
dtypes: float16(1), int16(3), int32(2), int8(1)
memory usage: 890.5 MB
