### 读取数据

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

In [2]:
# 读取Test数据集
dpath = "../Data/"
testTemp = pd.read_csv(dpath+"test.csv", iterator=True)
# 设置分块参数
loop = True
chunkSize = 100000
chunks = []

while loop:
    try:
        chunk = testTemp.get_chunk(chunkSize)
        chunks.append(chunk)
    except StopIteration:
        loop = False
        print("Iteration is stopped.")
test = pd.concat(chunks, ignore_index=True)
print(len(test))

Iteration is stopped.
2556790


In [3]:
# 读取members数据集
members = pd.read_csv(dpath+"members.csv")
print(len(members))

34403


In [4]:
test_members = pd.merge(test, members, how='outer')
test_members = test_members.drop(["id"], axis=1)
test_members.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,city,bd,gender,registered_via,registration_init_time,expiration_date
0,V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=,WmHKgKMlp1lQMecNdNvDMkvIycZYHnFwDT72I5sIssc=,my library,Local playlist more,local-library,1,0,,7,20160219,20170918
1,V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=,y/rsZ9DC7FwK5F2PK2D5mj+aOBUJAjuu3dZ14NgE0vM=,my library,Local playlist more,local-library,1,0,,7,20160219,20170918
2,V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=,A4Xu2KK/i4wlarqf8SOQ61S/XTIDEaNtKSKKHWGca4w=,my library,Local playlist more,local-library,1,0,,7,20160219,20170918
3,V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=,YJKHI4Asj1oM5vJftbISis2QYliVSExtWiWnwb7TOgo=,my library,Local playlist more,local-library,1,0,,7,20160219,20170918
4,V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=,pcxJbwDNuVwQhMfbiZdKvw/KMlUEeYmoXVbC7v/78SY=,my library,Local playlist more,local-library,1,0,,7,20160219,20170918


### 对空值的粗处理

In [5]:
# 查看数据组成
test_members.info()
# 寻找空值
print(test_members.isnull().sum())
print(test_members.shape)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2566062 entries, 0 to 2566061
Data columns (total 11 columns):
msno                      object
song_id                   object
source_system_tab         object
source_screen_name        object
source_type               object
city                      int64
bd                        int64
gender                    object
registered_via            int64
registration_init_time    int64
expiration_date           int64
dtypes: int64(5), object(6)
memory usage: 234.9+ MB
msno                            0
song_id                      9272
source_system_tab           17714
source_screen_name         172155
source_type                 16569
city                            0
bd                              0
gender                    1059136
registered_via                  0
registration_init_time          0
expiration_date                 0
dtype: int64
(2566062, 11)


In [6]:
# 删除空值比较少的字段
filterData = ['song_id', 'source_system_tab', 'source_type']
TestMembers = test_members.dropna(subset = filterData)
print(TestMembers.isnull().sum())

msno                            0
song_id                         0
source_system_tab               0
source_screen_name         153477
source_type                     0
city                            0
bd                              0
gender                    1047088
registered_via                  0
registration_init_time          0
expiration_date                 0
dtype: int64


### 对数量比较少的数据类型进行合并

In [7]:
# source_screen_name中的量少的数据进行合并
otherScrName = ["Discover New", "Search Trends", "Search Home", "My library_Search",
 "Self profile more", "Concert", "Payment"]
col_origin = TestMembers.columns

column_np = np.array(TestMembers)
cols=column_np.tolist()
for col in cols:
    if col[3] in otherScrName:
        col[3] = "Other"
TestMembers = pd.DataFrame(columns = col_origin, data = cols)
TestMembers["source_screen_name"].value_counts()

Local playlist more     844887
Online playlist more    529785
Radio                   211201
Album more              175744
Search                  121955
Artist more             110999
Discover Feature         93226
Others profile more      90437
Discover Chart           78930
Discover Genre           41612
Explore                  27852
My library               25230
Unknown                  23620
Other                    17134
People local                13
People global                1
Name: source_screen_name, dtype: int64

In [8]:
# 将source_screen_name中的nan值转化为Unknown
values = {'source_screen_name': "Unknown"}
TestMembers.fillna(value = values)
# 只是对之后的数据进行修改，这个是对原数据进行修改
TestMembers["source_screen_name"].fillna("Unknown",inplace=True)
print(TestMembers["source_screen_name"].value_counts())
TestMembers.isnull().sum()

Local playlist more     844887
Online playlist more    529785
Radio                   211201
Unknown                 177097
Album more              175744
Search                  121955
Artist more             110999
Discover Feature         93226
Others profile more      90437
Discover Chart           78930
Discover Genre           41612
Explore                  27852
My library               25230
Other                    17134
People local                13
People global                1
Name: source_screen_name, dtype: int64


msno                            0
song_id                         0
source_system_tab               0
source_screen_name              0
source_type                     0
city                            0
bd                              0
gender                    1047088
registered_via                  0
registration_init_time          0
expiration_date                 0
dtype: int64

In [9]:
# 对年龄进行离散化
column_np = np.array(TestMembers)
cols=column_np.tolist()
for col in cols:
    if col[6] < 3 or col[6] > 80:
        col[6] = 0
    if col[6] >= 3 and col[6] < 23:
        col[6] = "3~23"
    elif col[6] >=23 and col[6] < 27:
        col[6] = "23~27"
    elif col[6] >=27 and col[6] < 33:
        col[6] = "27~33"
    elif col[6] >=33 and col[6] <= 78:
        col[6] = "33~78"
    elif col[6] == 0:
        col[6] = np.nan
TestMembers = pd.DataFrame(columns = col_origin, data = cols)

In [10]:
print(TestMembers["bd"].value_counts())
print(TestMembers["source_type"].value_counts())

27~33    421244
33~78    396566
3~23     348798
23~27    338025
Name: bd, dtype: int64
online-playlist           772629
local-library             581641
local-playlist            294384
radio                     214696
album                     195163
top-hits-for-artist       179348
song                      129108
song-based-playlist        87132
listen-with                84470
topic-article-playlist      5082
my-daily-playlist           2022
artist                       428
Name: source_type, dtype: int64


In [11]:
otherSouType = ["listen-with", "topic-article-playlist", "artist", "my-daily-playlist"]
col_origin = TestMembers.columns

column_np = np.array(TestMembers)
cols=column_np.tolist()
for col in cols:
    if col[4] in otherSouType:
        col[4] = "Other"
TestMembers = pd.DataFrame(columns = col_origin, data = cols)
TestMembers["source_type"].value_counts()

online-playlist        772629
local-library          581641
local-playlist         294384
radio                  214696
album                  195163
top-hits-for-artist    179348
song                   129108
Other                   92002
song-based-playlist     87132
Name: source_type, dtype: int64

In [12]:
for col in TestMembers.columns:
    print(TestMembers[col].value_counts())

KGXNZ/H3VxvET/+rGxlrAe7Gpz2eKMXyuSg3xh8Ij1M=    2475
yBXKYfcmQ+TtIEvkLQXabAY2uwudIGn21dEZYAwCDCg=    2389
MXIMDXO0j3UpaT7FvOSGW6Y5zfhlh+xYjTqGoUdMzEE=    2075
y+PDdTzWxEVgd0r1BH38ib0SuyWzgd1FAV5iMl53Uw0=    2068
y3IT85vrzY2iBxFg6nsh4Cmg+mV3oqR73TPekK7x1Rc=    2058
o+5RNlSWrzvrphgBNGIo1FLkGxBgyICns6qXj3nS7Pk=    1781
FGqBebsinq4fszaFAyN4WqDL6vnhMlMe248eN5EgnWI=    1762
mGDObQQojFOJfK2rJKcme282huuk0qDxBzWL22/qYbc=    1675
BqnomjS+KJtzWrGRkqLEoO3I7KapwN4B8H9in4Fsf+I=    1629
6zb1JIozg37wsF7TsVd/U/ZT3peCtosRl/KpxyZvILs=    1614
2pUzjoYYUf9j1xpJOGmzu0Zp+52MdLKsVH0eFLePBbs=    1613
Hm3iH5wb4xg6YJa4HKRhTN2OfOecobNmHfxCAl4T374=    1582
FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=    1564
8G9kFHqzaGUu0BRbHV87yYhrWKEaGkuE6beClg2buAc=    1559
K1BJCDeN15EDF2kmNJ4pVKNp6kAhNOyiNOdivzOVe+Q=    1540
x1staqrE1JMIUlvyEVgMYgDfZhlTGfpA3qtPSj+N77g=    1480
cqjRBV/jWN2ujhc+z/4tz+Mj6xEfflAAt6qBXCqxKvw=    1438
pForpfrz+EtnctFemGiaGXRER8TXx3C63pD13pGXwPo=    1434
38LUs5mVWNKfyjL7ilf9JCnWZLjnno2obLXr69JJKcw=  

### 将类别型特征进行LabelEncoder处理

In [13]:
"""
# 实例化对象
LE = preprocessing.LabelEncoder()
# 将msno字符串转换为数字
LE.fit(TestMembers["msno"].values)
leMsno = LE.transform(TestMembers["msno"])
# 将转换后的结果添加到TestMembers里
le_msno = pd.Series(data=leMsno, name="LE_msno")

TestMembers = pd.concat([le_msno, TestMembers],axis=1)
# 将原msno剔除
TestMembers = TestMembers.drop(["msno"], axis=1)
TestMembers.head()
"""

'\n# 实例化对象\nLE = preprocessing.LabelEncoder()\n# 将msno字符串转换为数字\nLE.fit(TestMembers["msno"].values)\nleMsno = LE.transform(TestMembers["msno"])\n# 将转换后的结果添加到TestMembers里\nle_msno = pd.Series(data=leMsno, name="LE_msno")\n\nTestMembers = pd.concat([le_msno, TestMembers],axis=1)\n# 将原msno剔除\nTestMembers = TestMembers.drop(["msno"], axis=1)\nTestMembers.head()\n'

In [14]:
# leSongid = LE.fit_transform(TestMembers["song_id"])
# le_songid = pd.Series(data=leSongid, name="LE_song_id")
# TestMembers = pd.concat([le_songid, TestMembers], axis=1)
# TestMembers = TestMembers.drop(["song_id"], axis=1)
# TestMembers.head()

In [15]:
"""
# 把target放入最后一列
Target = TestMembers['target']
TestMembers = TestMembers.drop(['target'],axis=1)

TestMembers.insert(12,'target',Target)
TestMembers.head()
"""

"\n# 把target放入最后一列\nTarget = TestMembers['target']\nTestMembers = TestMembers.drop(['target'],axis=1)\n\nTestMembers.insert(12,'target',Target)\nTestMembers.head()\n"

### 将数据进行One-hot编码

In [16]:
# New["gender"].astype("object")
# genderCat = pd.get_dummies(New["gender"], prefix="gender")
# TestMembers = [New, genderCat]
# TestMembers = pd.concat(TestMembers, axis=1).drop(['gender'], axis=1)
# TestMembers.head()

In [17]:
# # 将bd进行One-hot
# New["bd"].astype("object")
# BdCat = pd.get_dummies(New["bd"], prefix="bd")
# TestMembers = [New, BdCat]
# TestMembers = pd.concat(TestMembers, axis=1).drop(['bd'], axis=1)
# TestMembers.head()

### 将这些数据存起来

In [18]:
TestMembers.to_csv(dpath +'TestMembers.csv',index=False,header=True)