In [5]:
import numpy as np 
import pandas as pd 
import random

# Relative location of the raw CSV export.
path = './steam-200k.csv'

# header=None makes pandas treat the first line as data, so the five column
# names are supplied explicitly here.
df = pd.read_csv(
    path,
    header=None,
    names=['UserID', 'Game', 'Action', 'Hours', 'Not Needed'],
)

# Sanity check: preview the first rows, then report (rows, columns).
print('show the first 5 data info', df.head(), sep='\n')
print('show the data set size', df.shape, sep='\n')


show the first 5 data info
      UserID                        Game    Action  Hours  Not Needed
0  151603712  The Elder Scrolls V Skyrim  purchase    1.0           0
1  151603712  The Elder Scrolls V Skyrim      play  273.0           0
2  151603712                   Fallout 4  purchase    1.0           0
3  151603712                   Fallout 4      play   87.0           0
4  151603712                       Spore  purchase    1.0           0
show the data set size
(200000, 5)


In [17]:
# Build a single Hours_Played column that stands in for the Action/Hours pair:
# 0 means the game was only purchased, a value above 0 is the recorded play time.
df['Hours_Played'] = df['Hours'].astype('float32')

# Rows with Action == 'purchase' and Hours == 1.0 are reset to 0 hours played.
df.loc[df['Action'].eq('purchase') & df['Hours'].eq(1.0), 'Hours_Played'] = 0
print(df['Hours_Played'], '增加了Hours_Played字段后，数据大小', df.shape, sep='\n')

# Sort ascending by user, game and hours; the row index labels are carried
# along with their rows, so the index is no longer in 0..n-1 order afterwards.
df['UserID'] = df['UserID'].astype('int')
df = df.sort_values(by=['UserID', 'Game', 'Hours_Played'])
print("======", df, sep='\n')

# Collapse each (UserID, Game) pair to one row. Because of the sort above, the
# last row of every pair carries the largest Hours_Played, so keep='last'
# retains the play-time row rather than the zeroed purchase row.
clean_df = df.drop_duplicates(subset=['UserID', 'Game'], keep='last')
print("--------", clean_df, sep='\n')

# Drop the columns that are no longer needed: Action, Hours, Not Needed.
clean_df = clean_df.drop(columns=['Action', 'Hours', 'Not Needed'])
print('删除重复项后的数据集：', clean_df, sep='\n')
print("*********************", clean_df.head(0), "*********************", sep='\n')

# Basic characteristics of the cleaned data set: distinct users and games.
# dropna=False counts a missing value as its own key, as len(unique()) would.
n_users = clean_df['UserID'].nunique(dropna=False)
n_games = clean_df['Game'].nunique(dropna=False)
print('数据集中包含了 {0} 玩家，{1} 游戏'.format(n_users, n_games))
print(len(clean_df), "i m shape[0]")

# Fill ratio of the user x game matrix: observed (user, game) rows divided by
# the number of possible pairs. The variable is named "sparsity", but a larger
# value means a denser matrix.
sparsity = len(clean_df) / float(n_users * n_games)
print('用户行为矩阵的稀疏性（填充比例）为{:.2%} '.format(sparsity))




65429       0.0
65430       4.9
65423       0.0
65424     144.0
65435       0.0
          ...  
170025      0.0
10221       0.0
10222       0.7
129084      0.0
129085      0.2
Name: Hours_Played, Length: 200000, dtype: float32
增加了Hours_Played字段后，数据大小
(200000, 6)
           UserID             Game    Action  Hours  Not Needed  Hours_Played
65429        5250      Alien Swarm  purchase    1.0           0           0.0
65430        5250      Alien Swarm      play    4.9           0           4.9
65423        5250  Cities Skylines  purchase    1.0           0           0.0
65424        5250  Cities Skylines      play  144.0           0         144.0
65435        5250   Counter-Strike  purchase    1.0           0           0.0
...           ...              ...       ...    ...         ...           ...
170025  309812026        Robocraft  purchase    1.0           0           0.0
10221   309824202           Dota 2  purchase    1.0           0           0.0
10222   309824202           Dota 2 