# 推荐系统

In [26]:
# Text normalization helpers (jieba tokenization, punctuation and stop-word removal)
import re
import string
import jieba

# Prefix for data files. It is joined to file names by plain string
# concatenation (e.g. DATA_PATH + 'stop_words.txt'), so the trailing slash is required.
DATA_PATH = "../data/"

class Normalization:
    """Text-normalization helpers built on jieba word segmentation.

    Attributes:
        stopword_list: Stop words read from ``DATA_PATH + 'stop_words.txt'``,
            one list entry per line of that file.
    """

    # Matches any single character from string.punctuation. Compiled once
    # instead of on every call.
    # NOTE(review): string.punctuation only contains ASCII punctuation, so
    # full-width CJK punctuation is left untouched by this pattern.
    _PUNCTUATION_PATTERN = re.compile('[{}]'.format(re.escape(string.punctuation)))

    def __init__(self):
        """Load the stop-word list from the data directory."""
        with open(DATA_PATH + 'stop_words.txt', 'r', encoding='utf-8') as f:
            self.stopword_list = f.read().split('\n')

    def tokenize_text(self, text):
        """Segment ``text`` with jieba and return the non-blank tokens.

        Args:
            text: The string to segment.

        Returns:
            A list of tokens with surrounding whitespace stripped. Tokens
            that consist only of whitespace are dropped rather than being
            returned as empty strings.
        """
        stripped = (token.strip() for token in jieba.lcut(text))
        return [token for token in stripped if token]

    def remove_special_characters(self, text):
        """Strip ASCII punctuation from ``text``.

        Args:
            text: The string to clean.

        Returns:
            The remaining tokens joined by single spaces; tokens that become
            empty after punctuation removal are omitted.
        """
        cleaned = (
            self._PUNCTUATION_PATTERN.sub('', token)
            for token in self.tokenize_text(text)
        )
        return ' '.join(token for token in cleaned if token)

    def remove_stopwords(self, text):
        """Remove every token of ``text`` that appears in ``stopword_list``.

        Args:
            text: The string to filter.

        Returns:
            The surviving tokens concatenated without a separator.
        """
        # Build a set once per call so each membership test is O(1) instead
        # of a scan over the whole stop-word list.
        stopwords = set(self.stopword_list)
        return ''.join(
            token for token in self.tokenize_text(text) if token not in stopwords
        )

    def normalize_corpus(self, corpus):
        """Segment every document and re-join its tokens with spaces.

        Args:
            corpus: An iterable of strings.

        Returns:
            A list with one space-separated token string per input document,
            in the same order as ``corpus``.
        """
        return [' '.join(jieba.lcut(text)) for text in corpus]

# Shared instance used by the later cells; constructing it reads the
# stop-word file from DATA_PATH.
normalization = Normalization()

## 训练模型

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from database import *

# Prefix for data files; it is concatenated directly with file names, so the
# trailing slash is required.
DATA_PATH = '../data/'

# Stop words, one per line of the file; passed to TfidfVectorizer when training.
with open(DATA_PATH + 'stop_words.txt', 'r', encoding='utf-8') as f:
    stop_words = f.read().split('\n')

# ORDER BY id makes the row order deterministic, and with it the position of
# each document in `data`. Cluster labels are later matched to rows purely by
# position, so the order must be stable.
sql1 = "select * from movies order by id"
sql2 = "select * from reviews order by id"


def _joined_text(row, first, second):
    """Concatenate two columns of a result row, treating SQL NULL (None) as ''."""
    left = '' if row[first] is None else row[first]
    right = '' if row[second] is None else row[second]
    return left + right


# Movie documents: columns 1 and 7 of each row.
# NOTE(review): with `select *` these indices depend on the table's column
# order; presumably they are the title and a long text field -- confirm
# against the schema.
cursor.execute(sql1)
items = cursor.fetchall()
lenMovie = len(items)
data = [_joined_text(row, 1, 7) for row in items]


# Review documents: columns 3 and 7 of each row, appended after the movies so
# that positions [lenMovie, lenMovie + lenReviews) of `data` are reviews.
cursor.execute(sql2)
items = cursor.fetchall()
lenReviews = len(items)
data += [_joined_text(row, 3, 7) for row in items]


In [28]:
# Clustering hyper-parameters, named so they are easy to find and tune.
N_CLUSTERS = 20
KMEANS_SEED = 0
# Pinned explicitly: the scikit-learn default for n_init changed from 10 to
# 'auto' across releases, which would silently alter the clustering result.
KMEANS_N_INIT = 10

# Segment every document with jieba and re-join the tokens with spaces so the
# vectorizer can split them on whitespace.
print("normalizing...")
normData = normalization.normalize_corpus(data)

print("training...")
# Convert the segmented text into TF-IDF vectors.
# NOTE(review): TfidfVectorizer's default token_pattern only keeps tokens of
# two or more word characters, so single-character words are discarded.
vectorizer = TfidfVectorizer(stop_words=stop_words)
X = vectorizer.fit_transform(normData)

# Cluster the TF-IDF vectors with K-means.
kmeans = KMeans(
    n_clusters=N_CLUSTERS,
    random_state=KMEANS_SEED,
    n_init=KMEANS_N_INIT,
).fit(X)

# Show the cluster label assigned to each document.
print(kmeans.labels_)



normalizing...
training...




[15  1  5 ...  0  1  1]


In [29]:
from collections import Counter
# One cluster label per document that was passed to KMeans.fit.
clusters = kmeans.labels_

# Tally how many documents fell into each cluster and show the counts.
c = Counter()
c.update(clusters)
print(c.items())



dict_items([(15, 591), (1, 8693), (5, 926), (9, 364), (3, 312), (2, 240), (13, 1146), (0, 4029), (4, 1084), (18, 952), (6, 721), (7, 2620), (14, 366), (12, 398), (8, 506), (16, 379), (11, 1971), (10, 292), (19, 278), (17, 150)])


## 保存模型

In [30]:
import pickle

# Persist the fitted clusterer and vectorizer next to the other data files.
for filename, artifact in (('kmeans.pkl', kmeans), ('vectorizer.pkl', vectorizer)):
    with open(DATA_PATH + filename, 'wb') as f:
        pickle.dump(artifact, f)

## 调整数据库

In [31]:
# One-time schema change: run these once, before the first update, if the
# `cluster` columns do not exist yet.
# sql = "alter table movies add cluster int(11)"
# cursor.execute(sql)
# sql = "alter table reviews add cluster int(11)"
# cursor.execute(sql)


def _store_clusters(select_ids_sql, update_sql, labels):
    """Write one cluster label per table row, using the rows' real ids.

    Labels are paired with the ids returned by ``select_ids_sql`` in order,
    instead of assuming that row ``i`` has ``id == i + 1``; that assumption
    breaks as soon as ids have gaps or do not start at 1.

    NOTE(review): ``labels`` follow the order in which rows were fetched when
    the corpus was built. That matches ascending id order only if the loading
    query returned rows sorted by id -- verify.

    Args:
        select_ids_sql: Query returning the table's ids in ascending order.
        update_sql: Parameterized ``update ... set cluster=%s where id=%s``.
        labels: Sequence of cluster labels, one per row.

    Raises:
        ValueError: If the number of rows differs from the number of labels,
            which means the table changed after the model was trained.
    """
    cursor.execute(select_ids_sql)
    ids = [row[0] for row in cursor.fetchall()]
    if len(ids) != len(labels):
        raise ValueError(
            "row count ({}) does not match label count ({}); "
            "retrain before updating".format(len(ids), len(labels))
        )
    # int() converts numpy integers into plain Python ints, which every
    # DB-API driver can bind as a parameter.
    params = [(int(label), row_id) for label, row_id in zip(labels, ids)]
    if params:
        cursor.executemany(update_sql, params)


try:
    # Movies occupy the first lenMovie positions of `clusters`.
    _store_clusters(
        "select id from movies order by id",
        "update movies set cluster=%s where id=%s",
        clusters[:lenMovie],
    )
    # Reviews follow immediately after the movies.
    _store_clusters(
        "select id from reviews order by id",
        "update reviews set cluster=%s where id=%s",
        clusters[lenMovie:lenMovie + lenReviews],
    )
except Exception:
    # Leave the tables untouched if anything goes wrong part-way through.
    db.rollback()
    raise
else:
    db.commit()