In [None]:
import pandas as pd
import re
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import scipy.spatial.distance as dist # 距离计算

In [None]:
# Load the raw data; the CSV file is read with the GBK codec.
tipdm_initdata = pd.read_csv(
    'tipdm.initdata.csv',
    encoding='gbk',
)

In [None]:
# Preview the first rows of the loaded table.
tipdm_initdata.head()

In [None]:
# (rows, columns) of the loaded table.
tipdm_initdata.shape

In [None]:
# Column dtypes as inferred by read_csv.
tipdm_initdata.dtypes

In [None]:
# Column selection: keep only the fields used by the rest of the notebook.
# .copy() makes tipdm_data an independent frame, so the column assignment
# performed on it later does not operate on a flagged selection of
# tipdm_initdata (pandas reports that as SettingWithCopyWarning, which the
# global warnings filter in this notebook would silently hide).
tipdm_data = tipdm_initdata[['content_id','page_path','userid','sessionid']].copy()

# 数据预处理

## 网址信息处理

1. 网址补全

In [None]:
# Inspect how often each path occurs (evaluated but not rendered, because
# only the last expression of a cell is displayed).
tipdm_data['page_path'].value_counts()
# Prefix every path with the site host. Paths that already start with the
# host are left untouched, so re-running this cell does not prepend it twice.
site_host = 'www.tipdm.org'
tipdm_data['page_path'] = tipdm_data['page_path'].apply(
    lambda path: path if path.startswith(site_host) else site_host + path)

2. 保留.jhtml后缀网址

In [None]:
# Inspect rows by URL suffix (these intermediate selections are evaluated
# but not rendered, since they are not the last expression of the cell).
ends_with_jhtml = tipdm_data['page_path'].apply(lambda url: url.endswith('jhtml'))
ends_with_htm = tipdm_data['page_path'].apply(lambda url: url.endswith('htm'))
tipdm_data[ends_with_jhtml]
tipdm_data[ends_with_htm]

# Keep only the rows whose URL ends with 'jhtml'.
jhtml_page = tipdm_data[ends_with_jhtml]

In [None]:
# Display the rows kept by the suffix filter.
jhtml_page

3. 删除主页网址

In [None]:
# Remove home-page visits.
# A literal prefix test replaces the previous regular expression, whose
# unescaped dots matched any character; the prefix semantics of re.match
# are kept. The mask is computed once and reused.
home_page_url = 'www.tipdm.org/index.jhtml'
is_home_page = jhtml_page['page_path'].apply(lambda url: url.startswith(home_page_url))

tmp_index = jhtml_page[is_home_page].index
jhtml_page = jhtml_page.drop(tmp_index)

In [None]:
# Display the table after the home-page rows were dropped.
jhtml_page

4. 删除包含%的网址

In [None]:
# Inspect a few URLs that contain '%'.
# head() replaces the former hard-coded row label, which raised KeyError
# whenever that particular row was absent; tolist() shows the full strings.
jhtml_page.loc[jhtml_page['page_path'].apply(lambda x: re.search(r'%',x) !=None),
               'page_path'].head().tolist()

In [None]:
# Drop every row whose URL contains a '%' character.
contains_percent = jhtml_page['page_path'].map(lambda url: '%' in url)
tmp_index = jhtml_page.index[contains_percent.values]
jhtml_page = jhtml_page.drop(index=tmp_index)

In [None]:
# Display the table after the '%' URLs were dropped.
jhtml_page

5. 网址变换

In [None]:
# Inspect URLs containing a doubled slash (evaluated but not rendered,
# since it is not the last expression of the cell).
jhtml_page[jhtml_page['page_path'].map(lambda url: '//' in url)]


def _normalise_url(url):
    """Apply the literal URL substitutions one after another, in this order."""
    for old, new in (('//', '/'), ('/.jhtml', '.jhtml'), ('jmg/', 'jmgj/')):
        url = url.replace(old, new)
    return url


jhtml_page['page_path'] = jhtml_page['page_path'].apply(_normalise_url)

## 内容ID处理

1. 删除content_id值不在100-999之间或者为空的记录

In [None]:
# Keep only records whose content_id lies in the closed range [100, 999].
# Series.between is inclusive on both ends and evaluates to False for
# missing values, so records with an empty content_id are removed as well
# (the previous "< 100 or > 999" test was False for NaN and kept them).
# A new frame is built instead of dropping in place, so re-running the cell
# is safe and later column assignments act on an independent copy.
has_valid_content_id = jhtml_page['content_id'].between(100, 999)
jhtml_page = jhtml_page[has_valid_content_id].copy()

2. 将网址与内容ID一一对应

In [None]:
# 1. Unique (content_id, page_path) pairs
content_page = jhtml_page.loc[:, ['content_id', 'page_path']].drop_duplicates()
# 2. Number of distinct URLs recorded for each content_id
content_count = content_page.groupby('content_id')['page_path'].count()

# 3. content_ids that are associated with more than one URL
has_several_urls = (content_count > 1).values
morethan1_id = content_count.index[has_several_urls]
morethan1_id.shape
morethan1_id

In [None]:
# 4. For every content_id with several URLs, overwrite all of its rows with
#    the URL found on the first of those rows.
for content_id in morethan1_id:
    row_labels = jhtml_page.index[(jhtml_page['content_id'] == content_id).values]
    first_url = jhtml_page.loc[row_labels[0], 'page_path']
    jhtml_page.loc[row_labels, 'page_path'] = first_url

## 构建特征与数据拆分

1. 特征1：网址类别特征

In [None]:
# URL categories: each list holds the path segments that belong to a category.
# 1. Competition
competition_label = [
    "jszz", "sm", "stpj", "qk", "notice", "yxzp",
    "firsttipdm", "secondtipdm", "thirdtipdm", "fourthtipdm",
]
# 2. Teaching resources
teaching_label = [
    "zytj", "jmgj", "jxsp", "ts", "information",
]
# 3. Enterprise cooperation
enterprise_label = [
    "xtxm", "wjxq", "zxns", "cgal", "kjxm", "qyal", "zzszl",
]
# 4. News
notice_label = [
    "notices", "sj", "news",
]

In [None]:
# Extract the path segment that follows the host (second '/'-separated part).
jhtml_page['网址拆'] = jhtml_page['page_path'].map(lambda url: url.split('/', 2)[1])

In [None]:
# Build the category label column.
# The column is created with object dtype and the same default value 0 as
# before, so assigning the string labels below does not have to upcast an
# integer column (recent pandas versions deprecate that silent upcast).
jhtml_page['label'] = pd.Series(0, index=jhtml_page.index, dtype=object)
# Later categories are applied after earlier ones, matching the original order.
for category_name, path_segments in (('竞赛', competition_label),
                                     ('教学资源', teaching_label),
                                     ('企业合作', enterprise_label),
                                     ('新闻动态', notice_label)):
    jhtml_page.loc[jhtml_page['网址拆'].isin(path_segments), 'label'] = category_name

In [None]:
# Number of records per category label (0 marks rows that matched no category list).
jhtml_page['label'].value_counts()

2. 特征2：用户属性构建

In [None]:
# Count the missing userid values (evaluated but not rendered).
jhtml_page['userid'].isnull().sum()
# Author's note: userid is almost always empty.

jhtml_page.columns
# Frequency of each sessionid; only this last expression is rendered.
jhtml_page['sessionid'].value_counts()

In [None]:
# Derive a numeric user id from sessionid. Tied values share their average
# rank, so all rows of one session receive the same number.
session_rank = jhtml_page['sessionid'].rank(method='average')
jhtml_page['user_id'] = session_rank

In [None]:
# Keep only the columns needed for modelling.
model_columns = ['user_id', 'page_path', 'label']
model_data = jhtml_page.loc[:, model_columns]

3. 数据拆分

In [26]:
# Split the modelling data by category label.
input_columns = ['user_id', 'page_path']
is_competition = model_data['label'] == '竞赛'
is_teaching = model_data['label'] == '教学资源'

competition_input = model_data.loc[is_competition, input_columns]
teaching_input = model_data.loc[is_teaching, input_columns]

In [27]:
# Save both inputs; the row index is written as the first (unnamed) column.
for frame, target_file in ((competition_input, 'competition_input.csv'),
                           (teaching_input, 'teaching_input.csv')):
    frame.to_csv(target_file, encoding='utf-8-sig')

# 模型构建

## 以竞赛数据为基础，进行推荐

In [28]:
# Reload the competition records; the saved row index comes back as the
# 'Unnamed: 0' column, which the pivot step below uses as its value column.
competition_input = pd.read_csv('competition_input.csv')
# De-duplicate on the (user, page) pair only. Comparing whole rows never
# removed anything, because the 'Unnamed: 0' column is different on every row.
competition_input = competition_input.drop_duplicates(subset=['user_id', 'page_path'])

1. 构建用户—物品矩阵

In [29]:
# Count the records per (user, page), then cap the counts at 1 so the matrix
# only says whether a user visited a page.
visit_counts = competition_input.pivot_table(
    index='user_id',
    columns='page_path',
    values='Unnamed: 0',
    aggfunc='count',
    fill_value=0,
)
competition_matrix = visit_counts.clip(upper=1)

2. 物品相似度矩阵

In [30]:
# Item-item Jaccard similarity between pages.
# scipy's 'jaccard' metric is a distance (0 = identical visitor sets), so it
# is converted with 1 - distance; storing the raw distance under the name
# `sim` made the interest score favour the most dissimilar pages.
# All pairwise distances are computed by a single pdist call instead of one
# call per pair of columns.
item_columns = competition_matrix.columns
item_hits = competition_matrix.values.T.astype(bool)   # one row per page
n_items = len(item_columns)
if n_items < 2:
    # pdist needs at least two observations; a lone page is only similar to itself.
    sim_values = np.ones((n_items, n_items))
else:
    sim_values = 1.0 - dist.squareform(dist.pdist(item_hits, 'jaccard'))
if n_items:
    # Self-similarity is set to 0 so a visited page does not vote for itself.
    np.fill_diagonal(sim_values, 0.0)
sim = pd.DataFrame(sim_values, index=item_columns, columns=item_columns)

3. 依据用户点击网址，进行推荐

In [31]:
# Worked example for the first user of the matrix.
user = competition_matrix.iloc[[0]]

# Interest score of every page for this user.
interest_row = np.dot(user, sim)
p = pd.DataFrame(interest_row, index=['interest'], columns=user.columns).T
ranked_pages = p.sort_values(by='interest', ascending=False)
ranked_pages.index[0:4]

# Pages this user visited (the only expression rendered by this cell).
user_as_column = user.T
user_as_column[user_as_column[user.index[0]] > 0].index.values

array(['www.tipdm.org/thirdtipdm/678.jhtml'], dtype=object)

In [32]:
# For all users: prepare an empty result table with one row per user.
result_columns = ['访问网址', '推荐1', '推荐2', '推荐3', '推荐4']
re_list = pd.DataFrame(columns=result_columns, index=competition_matrix.index)

In [None]:
# Fill the recommendation table for every user.
# Changes compared with the cell-by-cell version:
#   * pages the user already visited are excluded from the ranking;
#   * rows are padded with None when fewer candidates than recommendation
#     columns exist (the fixed-width assignment used to fail in that case);
#   * the table is built once from a list of rows instead of writing an
#     array into individual cells.
page_names = competition_matrix.columns.values
visit_flags = competition_matrix.values > 0
n_recommend = len(re_list.columns) - 1   # every column except the visited-pages one
# Cells of `sim` may be plain numbers or one-element arrays; coerce to a float matrix.
sim_values = np.asarray(sim.values.tolist(), dtype=float).reshape(sim.shape)
# interest[u, j] = sum over pages i of matrix[u, i] * sim[i, j]
interest = np.dot(competition_matrix.values.astype(float), sim_values)

rows = []
for user_flags, user_interest in zip(visit_flags, interest):
    candidates = np.flatnonzero(~user_flags)
    # Stable sort keeps the column order among equally scored pages.
    order = np.argsort(-user_interest[candidates], kind='stable')
    recommend = list(page_names[candidates[order[:n_recommend]]])
    recommend += [None] * (n_recommend - len(recommend))
    rows.append([page_names[user_flags]] + recommend)

re_list = pd.DataFrame(rows, index=competition_matrix.index, columns=re_list.columns)

In [None]:
# Display the recommendation table for the competition pages.
re_list

## 以教学资源数据为基础，进行推荐

In [None]:
# Reload the teaching-resource records from the file written earlier in this
# notebook. The save step writes 'teaching_input.csv' to the working
# directory, so reading from 'tmp/' failed on a fresh run.
teaching_input = pd.read_csv('teaching_input.csv')
# De-duplicate on the (user, page) pair only. Comparing whole rows never
# removed anything, because the 'Unnamed: 0' column is different on every row.
teaching_input = teaching_input.drop_duplicates(subset=['user_id', 'page_path'])

1. 构建用户—物品矩阵

In [None]:
def user_item_matrix(model):
    """Build a binary user-item matrix from visit records.

    Parameters
    ----------
    model : pandas.DataFrame
        Visit records with at least the columns 'user_id' and 'page_path'.
        Any other column (for example the 'Unnamed: 0' column produced by a
        CSV round trip) is ignored, so the frame no longer has to be
        reloaded from disk before calling this function.

    Returns
    -------
    pandas.DataFrame
        One row per user_id and one column per page_path; a cell is 1 when
        the user has at least one record for the page and 0 otherwise.
    """
    # crosstab counts the records of every (user, page) pair.
    visit_counts = pd.crosstab(model['user_id'], model['page_path'])
    # Only presence matters, so cap the counts at 1.
    return visit_counts.clip(upper=1)

In [None]:
# Build the user-item matrix for the teaching-resource records.
teaching_matrix = user_item_matrix(teaching_input)

In [None]:
# Display the teaching-resource user-item matrix.
teaching_matrix

2. 物品相似度矩阵

In [None]:
def similarity(matrix, zero_diagonal=True):
    """Compute the item-item Jaccard similarity of a user-item matrix.

    scipy's 'jaccard' metric is a distance (0 for identical columns), so the
    similarity is obtained as ``1 - distance``. Returning the raw distance
    made downstream interest scores favour the most dissimilar items.

    Parameters
    ----------
    matrix : pandas.DataFrame
        User-item matrix with one column per item; non-zero cells count as
        a visit.
    zero_diagonal : bool, default True
        When True the self-similarity of every item is set to 0, so that a
        visited item does not contribute to its own score.

    Returns
    -------
    pandas.DataFrame
        Square float frame indexed and labelled by ``matrix.columns``.
    """
    items = matrix.columns
    n_items = len(items)
    # One boolean row per item, as expected by pdist.
    item_hits = matrix.values.T.astype(bool)
    if n_items < 2:
        # pdist needs at least two observations; a lone item is only similar to itself.
        sim_values = np.ones((n_items, n_items))
    else:
        # A single pdist call yields every pairwise distance.
        sim_values = 1.0 - dist.squareform(dist.pdist(item_hits, 'jaccard'))
    if zero_diagonal and n_items:
        np.fill_diagonal(sim_values, 0.0)
    return pd.DataFrame(sim_values, index=items, columns=items)

In [None]:
# Item-item matrix for the teaching-resource pages.
teaching_sim = similarity(teaching_matrix)

3. 依据用户点击网址，进行推荐

In [None]:
# 针对所有用户
def recommend_list(matrix, sim):
    # 创建数据框
    re_list = pd.DataFrame(index=matrix.index, 
                           columns=['访问网址','推荐1','推荐2','推荐3','推荐4'])
    for i in range(len(matrix)):
        # 提取用户的点击记录
        user = matrix.iloc[[i]]
        # 计算兴趣程度
        p = pd.DataFrame(np.dot(user, sim),columns=user.columns, index=['interest']).T
        # 提取前4个推荐
        recommend = p.sort_values(by='interest', ascending=False).index[0:4]
        # 存储
        re_list.iloc[i,1:] = recommend

        # 访问的网址
        re_list.iloc[i,0] = user.T[user.T[user.index[0]]>0].index.values
    return re_list

In [None]:
# Recommendation table for the teaching-resource pages.
teaching_recommend = recommend_list(teaching_matrix, teaching_sim)

In [None]:
# Display the teaching-resource recommendation table.
teaching_recommend

In [None]:
# Save the recommendation table. The file name previously started with a
# doubled 't' ('tteaching_recommend.csv'), which looks like a typo and did
# not match the naming of the other files written by this notebook.
teaching_recommend.to_csv('teaching_recommend.csv', encoding='utf-8-sig')