<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#1-准备数据" data-toc-modified-id="1-准备数据-1">1 准备数据</a></span><ul class="toc-item"><li><span><a href="#1.1-加载数据集" data-toc-modified-id="1.1-加载数据集-1.1">1.1 加载数据集</a></span></li><li><span><a href="#1.2-统计每个商品被评价次数" data-toc-modified-id="1.2-统计每个商品被评价次数-1.2">1.2 统计每个商品被评价次数</a></span></li><li><span><a href="#1.3-统计只被评价过一次的商品数量" data-toc-modified-id="1.3-统计只被评价过一次的商品数量-1.3">1.3 统计只被评价过一次的商品数量</a></span></li><li><span><a href="#1.4-按商品被评价次数对数据集重新排列" data-toc-modified-id="1.4-按商品被评价次数对数据集重新排列-1.4">1.4 按商品被评价次数对数据集重新排列</a></span></li><li><span><a href="#1.5-将只被评价过一次的商品从数据集中拆分出来" data-toc-modified-id="1.5-将只被评价过一次的商品从数据集中拆分出来-1.5">1.5 将只被评价过一次的商品从数据集中拆分出来</a></span></li><li><span><a href="#2-对被评2次及以上的数据集进行拆分" data-toc-modified-id="2-对被评2次及以上的数据集进行拆分-1.6">2 对被评2次及以上的数据集进行拆分</a></span></li></ul></li><li><span><a href="#3-合并数据集" data-toc-modified-id="3-合并数据集-2">3 合并数据集</a></span></li><li><span><a href="#4-检查训练集中是否包含全部用户数和商品种类" data-toc-modified-id="4-检查训练集中是否包含全部用户数和商品种类-3">4 检查训练集中是否包含全部用户数和商品种类</a></span></li><li><span><a href="#4-打乱数据集顺序" data-toc-modified-id="4-打乱数据集顺序-4">4 打乱数据集顺序</a></span></li><li><span><a href="#4-存入本地" data-toc-modified-id="4-存入本地-5">4 存入本地</a></span></li></ul></div>

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# 1 准备数据
## 1.1 加载数据集

In [2]:
df = pd.read_csv('ratings.dat', sep='::', header=None, names=['user','item','rating','timestamp'])

  """Entry point for launching an IPython kernel.


In [3]:
df.head(2)

Unnamed: 0,user,item,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109


In [4]:
df.shape

(1000209, 4)

## 1.2 统计每个商品被评价次数

In [5]:
items_rated_times = df['item'].value_counts()

In [6]:
items_rated_times[:2]

2858    3428
260     2991
Name: item, dtype: int64

## 1.3 统计只被评价过一次的商品数量

In [7]:
single = items_rated_times[items_rated_times==1]
single.shape # 114个商品只被评价过一次

(114,)

## 1.4 按商品被评价次数对数据集重新排列

In [8]:
# 把item列设为index，并保留原列
dfnew = df.set_index(keys=['item'], drop=False)
dfnew.head(1)

Unnamed: 0_level_0,user,item,rating,timestamp
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1193,1,1193,5,978300760


In [9]:
# 按商品被评价次数由高到底重新排列数据集
dfnew = dfnew.loc[items_rated_times.index]
dfnew.head(1)

Unnamed: 0_level_0,user,item,rating,timestamp
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2858,2,2858,4,978298434


## 1.5 将只被评价过一次的商品从数据集中拆分出来

In [10]:
# 商品被评价过2次及以上的数据集
more = dfnew.iloc[:-114]

# 商品只被评价过1次的数据集
single = dfnew.iloc[-114:]

more.shape, single.shape

((1000095, 4), (114, 4))

## 2 对被评2次及以上的数据集进行拆分

In [11]:
x_train, x_test, y_train, y_test = train_test_split(more[['user', 'item', 'timestamp']], 
                                                    more['rating'],
                                                    test_size=0.25, 
                                                    stratify=more['item'],
                                                    random_state=30
                                                   )

In [12]:
x_train.shape, x_test.shape

((750071, 3), (250024, 3))

In [13]:
x_train['rating'] = y_train
x_test['rating'] = y_test

# 3 合并数据集

In [14]:
x_train.head(1)

Unnamed: 0_level_0,user,item,timestamp,rating
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2942,5239,2942,961443267,3


In [15]:
single.head(1)

Unnamed: 0_level_0,user,item,rating,timestamp
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1815,1858,1815,2,974695341


In [16]:
single.tail(1)

Unnamed: 0_level_0,user,item,rating,timestamp
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
133,4486,133,1,965013057


In [17]:
X_train = pd.concat((single, x_train))
X_train.iloc[113:115]

Unnamed: 0_level_0,user,item,rating,timestamp
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
133,4486,133,1,965013057
2942,5239,2942,3,961443267


In [18]:
# 拆分好的数据集形状
X_train.shape, x_test.shape

((750185, 4), (250024, 4))

# 4 检查训练集中是否包含全部用户数和商品种类

In [19]:
X_train['user'].unique().shape

(6040,)

In [20]:
x_test['user'].unique().shape

(6040,)

In [21]:
X_train['item'].unique().shape

(3706,)

In [22]:
x_test['item'].unique().shape

(3503,)

In [23]:
df['user'].unique().shape, df['item'].unique().shape

((6040,), (3706,))

# 4 打乱数据集顺序

In [24]:
X_train.head()

Unnamed: 0_level_0,user,item,rating,timestamp
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1815,1858,1815,2,974695341
2039,2106,2039,1,974756684
3220,1470,3220,2,974839243
3881,2885,3881,5,972452947
989,1915,989,5,974693867


In [25]:
X_train.values[:2]

array([[     1858,      1815,         2, 974695341],
       [     2106,      2039,         1, 974756684]])

In [26]:
# 打乱行间顺序，行内数据不变
np.random.shuffle(X_train.values)
np.random.shuffle(x_test.values)

In [27]:
X_train.head()

Unnamed: 0_level_0,user,item,rating,timestamp
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1815,3148,2542,4,968962072
2039,4618,3623,4,964126577
3220,3283,750,5,968121461
3881,5042,1419,3,962656673
989,3900,110,5,965845788


# 4 存入本地

In [28]:
x_test = x_test[['user','item','rating','timestamp']]
x_test.head(1)

Unnamed: 0_level_0,user,item,rating,timestamp
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2372,543,1732,5,976233639


In [29]:
X_train.to_csv('./ratings_train.csv', 
               index=False,  # 不存储index
               header=False) # 不存储标题

In [30]:
x_test.to_csv('./ratings_test.csv', index=False, header=False)