## 定义文本分类器类（jieba 分词 + TF-IDF + 朴素贝叶斯）

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import StratifiedKFold
import numpy as np
import jieba
import re
from joblib import dump, load
import pandas as pd

In [5]:
class Age_cls():
    """Text classifier: noise removal + jieba segmentation, TF-IDF features, pluggable estimator.

    Nothing in the class is specific to age; the labels handed to ``fit`` decide
    what is predicted.
    """

    # Noise stripped from a raw line before segmentation: "http..." runs (optionally
    # preceded by ':'), full-width "ｗｗｗ" runs up to the next tab, and
    # digits-then-'@' tokens. Compiled once instead of on every call; raw strings
    # avoid the invalid-escape warnings that "\S" / "\d" trigger in plain literals.
    _NOISE_PATTERN = re.compile("|".join([r"http\S+", r":http\S+", "ｗｗｗ.+?\t", r"\d+\@\S+"]))

    def __init__(self, file_stop_words='./Data/stopwords.txt', classifier=None):
        """Create the vectorizer, store the estimator and load the stop words.

        Args:
            file_stop_words: path of the stop-word file read by ``stop_words``.
            classifier: estimator exposing ``fit``/``predict``/``score``. ``None``
                (the default) creates a fresh ``MultinomialNB`` for this instance;
                a ``MultinomialNB()`` written directly in the signature would be
                built once and shared by every default-constructed instance.
        """
        if classifier is None:
            classifier = MultinomialNB()
        self.classifier = classifier
        self.vec = TfidfVectorizer(analyzer='word', max_features=4000)
        self.stop_words(file_stop_words)

    def stop_words(self, file_stoppath):
        """Load the stop-word list (one entry per line) from ``file_stoppath``.

        Sets ``self.stopwords`` (numpy array, as before) and a private set copy
        used for constant-time membership tests in ``process_data_line``.
        """
        stopwords = pd.read_csv(file_stoppath, index_col=False, quoting=3, sep="\t",
                                names=['stopword'], encoding='utf-8')
        self.stopwords = stopwords['stopword'].values
        self._stopword_set = set(self.stopwords)

    def process_data_line(self, data_line):
        """Clean one raw text line and return its kept tokens joined by single spaces.

        Steps: remove noise matches, strip, segment with ``jieba.lcut``, then drop
        tokens of length <= 1 and tokens present in the stop-word list.
        """
        clean_text = self._NOISE_PATTERN.sub("", data_line)
        # Fall back to building the set when the instance was created without
        # going through stop_words() (e.g. via __new__).
        stopword_set = getattr(self, '_stopword_set', None)
        if stopword_set is None:
            stopword_set = set(self.stopwords)
        segs = [seg for seg in jieba.lcut(clean_text.strip())
                if len(seg) > 1 and seg not in stopword_set]
        return " ".join(segs)

    # Feature construction
    def features(self, X):
        """Transform an iterable of texts with the (already fitted) TF-IDF vectorizer."""
        return self.vec.transform(X)

    # Fit the data
    def fit(self, X, y, shuffle=True, n_folds=5):
        """Run stratified k-fold training and return the per-fold scores.

        ``shuffle`` and ``n_folds`` are forwarded to ``stratifiedkfold_cv``
        (they were previously accepted but silently replaced by True / 5).
        """
        return self.stratifiedkfold_cv(np.array(X), np.array(y), shuffle=shuffle, n_folds=n_folds)

    def stratifiedkfold_cv(self, x, y, shuffle=True, n_folds=5):
        """Fit the vectorizer on ``x``, then train/score the classifier once per fold.

        Args:
            x: numpy array of texts.
            y: numpy array of labels, also used to stratify the folds.
            shuffle: passed to ``StratifiedKFold``.
            n_folds: number of splits.

        Returns:
            list with one ``classifier.score`` value per fold.

        NOTE(review): the vectorizer is fitted on all of ``x`` before splitting, so
        its vocabulary/IDF statistics include the held-out folds; and after the
        loop ``self.classifier`` holds the model trained on the last fold's
        training split only. Behaviour kept as-is.
        """
        stratifiedk_fold = StratifiedKFold(n_splits=n_folds, shuffle=shuffle)
        history_score = []
        self.vec.fit(x)
        for train_index, test_index in stratifiedk_fold.split(x, y):
            X_train, X_test = x[train_index], x[test_index]
            y_train, y_test = y[train_index], y[test_index]
            self.classifier.fit(self.vec.transform(X_train), y_train)
            history_score.append(self.classifier.score(self.vec.transform(X_test), y_test))
        return history_score

    # Predict the class
    def predict(self, x):
        """Predict the label of a single text ``x``; returns the estimator's output for ``[x]``."""
        return self.classifier.predict(self.features([x]))

    # Score on a test set
    def score(self, X, y):
        """Return ``classifier.score`` for texts ``X`` against labels ``y``."""
        return self.classifier.score(self.features(X), y)

    # Persist the model
    def save_model(self, path):
        """Dump the ``(classifier, vectorizer)`` pair to ``path`` with joblib."""
        dump((self.classifier, self.vec), path)

    # Load the model
    def load_model(self, path):
        """Replace classifier and vectorizer with the pair stored at ``path``.

        NOTE: joblib files are pickles; only load files from a trusted source.
        """
        self.classifier, self.vec = load(path)
    
   

In [12]:
# Build the classifier used for the Age label with the constructor defaults
# (stop words from './Data/stopwords.txt', MultinomialNB estimator).
age_text_cls = Age_cls()

In [2]:
file_train_path = './Data/train.csv'
# Fields are separated by the literal marker "###__###". pandas treats any
# multi-character separator as a regular expression, which only the Python
# parser supports; naming the engine explicitly avoids the ParserWarning that
# the implicit fallback from the C engine emits.
traindf = pd.read_csv(file_train_path, sep="###__###", header=None,
                      encoding='utf-8', engine='python')
traindf.head()

  


Unnamed: 0,0,1,2,3,4
0,22DD920316420BE2DF8D6EE651BA174B,1,1,4,柔和双沟\t女生\t中财网首页 财经\thttp://pan.baidu.com/s/1pl...
1,43CC3AF5A8D6430A3B572337A889AFE4,2,1,3,"广州厨宝烤箱\t世情薄,人情恶,雨送黄昏花易落,晓风干,泪痕\t厦门酒店用品批发市场\t我只..."
2,E97654BFF5570E2CCD433EA6128EAC19,4,1,0,钻石之泪耳机\t盘锦到沈阳\t旅顺公交\t辽宁阜新车牌\tbaidu\tk715\tk716...
3,6931EFC26D229CCFCEA125D3F3C21E57,4,2,3,最受欢迎狗狗排行榜\t舶怎么读\t场景描 写范例\t三维绘图软件\t枣和酸奶能一起吃吗\t好...
4,E780470C3BB0D340334BD08CDCC3C71A,2,2,4,干槽症能自愈吗\t太太万岁叶舒心去没去美国\t干槽症\t右眼皮下面一直跳是怎么回事\t麦当劳...


In [4]:
# Show the raw query string of the first row. Positional access (row 0, fifth
# column) works whether or not the columns have been renamed yet; the label
# 'Query List' only exists after the rename cell that follows in the notebook.
traindf.iloc[0, 4]

'柔和双沟\t女生\t中财网首页 财经\thttp://pan.baidu.com/s/1plpjtn9\t周公解梦大全查询2345\t曹云金再讽郭德纲\t总裁大人行行好\t中财网第一财经传媒\t教师节全文\t男子砸毁15墓碑\t黄岩岛最新填海图\t引起的疲\t缘来未迟落跑甜心不好惹\t梁朝伟与替身同框\t笑傲江湖电视剧任贤齐\t小起名字女孩名字\t海运行李到堪培拉\t确定\t诱爱99天 司少的天价宝贝\t什么是遥控魔棒\t徽信表情动态搞笑图片\t教师节征文100字\t安微联通网上营业厅\t甜宠百分百:校草的萌萌未婚妻\t豪门重生之暖爱成婚\tnikehypershift和kd5哪个好看\t韭菜炒鸡蛋\t陈赫玩王者荣耀\t虎牙楚河\t三国演义小说txt下载\t威县欧派\t炒馍花怎么做好吃\t黄岩岛最新消息2016年\t中秋节诗句大全祝福\t教师节征文\t菜谱\t柔和双沟卖的怎么样\t七位数开奖结果\t以色列停车场坍塌\t天龙家庭农场\t7.22什么星座\t新旧约圣经和合本下载\t4π\twifi万能钥匙\t威灵仙图片\t临泉长官天龙家庭农场\t早安总统大人\t百合\t莲藕的做法\t花街\t无锡\t蚬壳胃散怎么吃\t触手忆寒\t中秋节的诗句\t孟州电信 电子发票\t鸡丝汤的做法\t我等你\t临泉长官镇桥口李小刚农场\t朋仇\t全民k歌\t炸葱花\t蒜苔炒肉\t冰川的图片\tkd5\t…\t若风\t好奇纸尿裤\t清蒸鱼\t189.8是谁的平方\t重庆餐馆发生爆炸\t捡手机被失主抢劫\thttps://yunpan.cn/ocsqfgtfya2ewj\t炒馍花的家常做法\t三国演义小说百度云\t总裁掠爱小舅别太坏\t:https://yunpan.cn/cmh8tmeyraiww\t周公解梦\t查坦克冰川\t凉拌藕片的做法\t投票\t鸡丝炒什么好吃\t被时光掩埋的秘密小说下载\t中国电信电子发票\t张续豪\t关于月亮的诗句\t用酵母蒸馒头的方法\t赵丽颖碧瑶坐\t触手兵长\t图集 下载腾讯新闻,看街头混战武警\t厦门航空\t蚬壳胃散\t炒茄子做法\t身份类别怎么填\t最好的我们里面的方特在哪里\t牢里面的生活是怎样的\t强迫症有哪些表现\t白袍法师暖暖图片\t朋仇广场舞\t小宇热游\t蒸馒头的方法\t狡滑的意思\t黄石大冶东岳派出所服务电话\t三国演义小说下载txt\th

In [3]:
# Name the five columns of the header-less frame; 'Query List' is the text
# column and Age / Gender / Education are the label columns used below.
columns_values = ['ID','Age','Gender','Education','Query List']
traindf.columns = columns_values

In [13]:
# Clean and segment every query string with process_data_line.
# NOTE(review): the raw column is overwritten in place, so re-running this cell
# feeds already segmented text through the preprocessing again.
traindf['Query List'] = traindf['Query List'].map(age_text_cls.process_data_line)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\SEELE\AppData\Local\Temp\jieba.cache
Loading model cost 0.555 seconds.
Prefix dict has been built successfully.


In [14]:
# Preview the frame after preprocessing: 'Query List' now holds space-joined tokens.
traindf.head()

Unnamed: 0,ID,Age,Gender,Education,Query List
0,22DD920316420BE2DF8D6EE651BA174B,1,1,4,柔和 双沟 女生 中财网 首页 财经 周公 解梦 大全 查询 2345 曹云金 再讽 郭德纲...
1,43CC3AF5A8D6430A3B572337A889AFE4,2,1,3,广州 厨宝 烤箱 世情 人情 雨送 黄昏 花易落 风干 泪痕 厦门 酒店用品 批发市场 不想...
2,E97654BFF5570E2CCD433EA6128EAC19,4,1,0,钻石 之泪 耳机 盘锦 沈阳 旅顺 公交 辽宁 阜新 车牌 baidu k715 k716 ...
3,6931EFC26D229CCFCEA125D3F3C21E57,4,2,3,受欢迎 狗狗 排行榜 场景 范例 三维 绘图 软件 酸奶 壮观 衣服 网站 动漫 绘图 软件...
4,E780470C3BB0D340334BD08CDCC3C71A,2,2,4,干槽症 自愈 太太 万岁 舒心 美国 干槽症 眼皮 怎么回事 麦当劳 旋风 勺子 吉林市 鹿...


In [9]:
# Age task: keep the label and text columns for rows whose Age is not 0
# (0 is presumably the "unknown" label — TODO confirm against the data description).
agedf = traindf.loc[traindf['Age'] != 0, ['Age', 'Query List']]
X_age = agedf['Query List'].tolist()
Y_age = agedf['Age'].tolist()

In [16]:
# Stratified k-fold training on the age data; the return value is the list of per-fold scores.
age_history_score = age_text_cls.fit(X_age,Y_age)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [17]:
# Per-fold scores of the age classifier.
print(age_history_score)

[0.5425288525090244, 0.54128533658735, 0.5386179895256012, 0.5402959271876748, 0.5437115394395565]


In [18]:
# Persist the fitted (classifier, vectorizer) pair.
# NOTE(review): assumes the ./Model directory already exists — confirm before running.
age_text_cls.save_model('./Model/age_textcls.model')

## 加载已保存的模型并测试

In [6]:
# Fresh instance (this loads the stop words); its unfitted classifier and
# vectorizer are replaced by the saved pair when load_model is called.
load_model_age = Age_cls()

In [7]:
# Restore the classifier and vectorizer saved for the age task.
load_model_age.load_model('./Model/age_textcls.model')

In [21]:
# Smoke test of the reloaded model on the first 20 samples.
# NOTE(review): X_age / Y_age are the data the model was fitted on, so this is
# not a held-out evaluation.
load_model_age.score(X_age[0:20],Y_age[0:20])

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


0.3

In [10]:
# Predict the age label of one already preprocessed sample; predict() wraps it
# in a list, so the result is an array with a single element.
result = load_model_age.predict(X_age[0])

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [13]:
# The single predicted label.
print(result[0])

1


## 教育

In [26]:
# Education task: keep the label and text columns for rows whose Education is not 0
# (0 is presumably the "unknown" label — TODO confirm against the data description).
educationdf = traindf.loc[traindf['Education'] != 0, ['Education', 'Query List']]
X_education = educationdf['Query List'].tolist()
Y_education = educationdf['Education'].tolist()

In [27]:
# Age_cls contains no age-specific logic, so it is reused as-is for the Education label.
education_cls = Age_cls()

In [28]:
# Stratified k-fold training on the education data; returns the per-fold scores.
educa_history = education_cls.fit(X_education,Y_education)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [29]:
# Persist the fitted (classifier, vectorizer) pair for the education task.
# NOTE(review): assumes the ./Model directory already exists — confirm before running.
education_cls.save_model('./Model/education_textcls.model')

## 性别

In [57]:
class Third_Sample_Cls():
    """Text classifier whose cross-validation folds are stratified by a second label array.

    Pipeline: noise removal + jieba segmentation, TF-IDF features, pluggable
    estimator. ``fit`` trains on ``y`` while the fold split is stratified on
    ``y_flg``.
    """

    # Noise stripped from a raw line before segmentation: "http..." runs (optionally
    # preceded by ':'), full-width "ｗｗｗ" runs up to the next tab, and
    # digits-then-'@' tokens. Compiled once instead of on every call; raw strings
    # avoid the invalid-escape warnings that "\S" / "\d" trigger in plain literals.
    _NOISE_PATTERN = re.compile("|".join([r"http\S+", r":http\S+", "ｗｗｗ.+?\t", r"\d+\@\S+"]))

    def __init__(self, file_stop_words='./Data/stopwords.txt', classifier=None):
        """Create the vectorizer, store the estimator and load the stop words.

        Args:
            file_stop_words: path of the stop-word file read by ``stop_words``.
            classifier: estimator exposing ``fit``/``predict``/``score``. ``None``
                (the default) creates a fresh ``MultinomialNB`` for this instance;
                a ``MultinomialNB()`` written directly in the signature would be
                built once and shared by every default-constructed instance.
        """
        if classifier is None:
            classifier = MultinomialNB()
        self.classifier = classifier
        self.vec = TfidfVectorizer(analyzer='word', max_features=4000)
        self.stop_words(file_stop_words)

    def stop_words(self, file_stoppath):
        """Load the stop-word list (one entry per line) from ``file_stoppath``.

        Sets ``self.stopwords`` (numpy array, as before) and a private set copy
        used for constant-time membership tests in ``process_data_line``.
        """
        stopwords = pd.read_csv(file_stoppath, index_col=False, quoting=3, sep="\t",
                                names=['stopword'], encoding='utf-8')
        self.stopwords = stopwords['stopword'].values
        self._stopword_set = set(self.stopwords)

    def process_data_line(self, data_line):
        """Clean one raw text line and return its kept tokens joined by single spaces.

        Steps: remove noise matches, strip, segment with ``jieba.lcut``, then drop
        tokens of length <= 1 and tokens present in the stop-word list.
        """
        clean_text = self._NOISE_PATTERN.sub("", data_line)
        # Fall back to building the set when the instance was created without
        # going through stop_words() (e.g. via __new__).
        stopword_set = getattr(self, '_stopword_set', None)
        if stopword_set is None:
            stopword_set = set(self.stopwords)
        segs = [seg for seg in jieba.lcut(clean_text.strip())
                if len(seg) > 1 and seg not in stopword_set]
        return " ".join(segs)

    # Feature construction
    def features(self, X):
        """Transform an iterable of texts with the (already fitted) TF-IDF vectorizer."""
        return self.vec.transform(X)

    # Fit the data
    def fit(self, X, y, y_flg, shuffle=True, n_folds=5):
        """Run stratified k-fold training and return the per-fold scores.

        Args:
            X: texts.
            y: labels the classifier is trained and scored on.
            y_flg: labels used only to stratify the folds.
            shuffle, n_folds: forwarded to ``skfold_gender_cv``.
        """
        return self.skfold_gender_cv(x=np.array(X), y=np.array(y), y_flag=np.array(y_flg),
                                     shuffle=shuffle, n_folds=n_folds)

    def skfold_gender_cv(self, x, y, y_flag, shuffle=True, n_folds=5):
        """Fit the vectorizer on ``x``, then train/score the classifier once per fold.

        Folds come from ``StratifiedKFold.split(x, y_flag)``, i.e. they are
        balanced with respect to ``y_flag`` while training targets are ``y``.

        Returns:
            list with one ``classifier.score`` value per fold.

        NOTE(review): the vectorizer is fitted on all of ``x`` before splitting, so
        its vocabulary/IDF statistics include the held-out folds; and after the
        loop ``self.classifier`` holds the model trained on the last fold's
        training split only. Behaviour kept as-is.
        """
        stratifiedk_fold = StratifiedKFold(n_splits=n_folds, shuffle=shuffle)
        history_score = []
        self.vec.fit(x)
        for train_index, test_index in stratifiedk_fold.split(x, y_flag):
            X_train, X_test = x[train_index], x[test_index]
            y_train, y_test = y[train_index], y[test_index]
            self.classifier.fit(self.vec.transform(X_train), y_train)
            history_score.append(self.classifier.score(self.vec.transform(X_test), y_test))
        return history_score

    # Predict the class
    def predict(self, x):
        """Predict the label of a single text ``x``; returns the estimator's output for ``[x]``."""
        return self.classifier.predict(self.features([x]))

    # Score on a test set
    def score(self, X, y):
        """Return ``classifier.score`` for texts ``X`` against labels ``y``."""
        return self.classifier.score(self.features(X), y)

    # Persist the model
    def save_model(self, path):
        """Dump the ``(classifier, vectorizer)`` pair to ``path`` with joblib."""
        dump((self.classifier, self.vec), path)

    # Load the model
    def load_model(self, path):
        """Replace classifier and vectorizer with the pair stored at ``path``.

        NOTE: joblib files are pickles; only load files from a trusted source.
        """
        self.classifier, self.vec = load(path)

In [41]:
# Gender task: keep rows whose Gender is not 0. Age is carried along (unfiltered)
# because it is passed to fit() as the stratification label for the folds.
genderdf = traindf.loc[traindf['Gender'] != 0, ['Age', 'Query List', 'Gender']]
X_gender = genderdf['Query List'].tolist()
Y_gender = genderdf['Gender'].tolist()
Y_ageflg = genderdf['Age'].tolist()


In [43]:
# Sanity check: texts, gender labels and age stratification labels must have equal length.
print(len(X_gender))
print(len(Y_gender))
print(len(Y_ageflg))

97845
97845
97845


In [59]:
# Build the gender classifier with the constructor defaults.
gender_cls = Third_Sample_Cls()

In [60]:
# Train on the gender labels while stratifying the folds by the age labels;
# returns the per-fold scores.
gender_history = gender_cls.fit(X_gender,Y_gender,Y_ageflg)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [58]:
# NOTE: the `del gender_cls` that used to live in this cell was removed. Run in
# document order it deleted the fitted gender classifier before the final
# save_model cell, which then failed with NameError. It was presumably an
# interactive clean-up executed before the classifier was re-created.

In [47]:
# Per-fold scores of the education classifier (trained in the education section).
print(educa_history)

[0.5682795106359528, 0.568830596274661, 0.5647285753651143, 0.5664755815235365, 0.5724601730885839]


In [61]:
# Per-fold scores of the gender classifier.
print(gender_history)

[0.8006846515430206, 0.8048132440856369, 0.7955950738412796, 0.8022180201359431, 0.8015946028825514]


In [62]:
# Persist the fitted (classifier, vectorizer) pair for the gender task.
# NOTE(review): assumes the ./Model directory already exists — confirm before running.
gender_cls.save_model('./Model/gender_textcls.model')