代码对下一棵最有可能被领养的树进行了预测

使用的特征为`template_1`, `likeCount`, `longitude`, `template_68`, `intercept`

In [1]:
import pandas as pd
import numpy as np
import pylab as pl
from sklearn import linear_model
from statsmodels.formula.api import ols

## 准备数据

In [91]:
df = pd.read_json("../../data/all_trees.json")

In [92]:
data = df.loc[:,[
  # 需要的数据
  'id','owner', 'place', 'longitude', 'latitude', 'likeCount', 'template', 
  # 不考虑纳入模型的
  'height', 'radius', 'subHeight', 'bottomHeight', 
  # 不需要的
  # 'number', 'remark', 'qrcode','available', 'createdAt', 'updatedAt', 'images', 'user', 
  ]]

In [93]:
for i in range(len(df)):
  #template种类编号，habit习性
  template_id = pd.json_normalize(data.loc[i,['template']])
  data.loc[i,'template'] = template_id.loc[0,'id']
  data.loc[i,'habit'] = template_id.loc[0,'habit']
  #owner是否已被认养
  if(data.loc[i,['owner']].isnull().values.any() == False):
    data.loc[i,['owner']] = 1
  else:
    data.loc[i,['owner']] = 0
  #place校区，中北1，闵行0
  if(data.loc[i,['longitude']][0] >= 121.44 ):
    data.loc[i,['place']] = 0
  else: 
    data.loc[i,['place']] = 1

In [94]:
data = data.fillna(0)
data = data.replace('', 0)

In [95]:
data

Unnamed: 0,id,owner,place,longitude,latitude,likeCount,template,height,radius,subHeight,bottomHeight,habit
0,3023,1,0,121.453962,31.032579,3,68,0,0,0,0,乔木
1,3021,0,0,121.454747,31.032390,0,223,0,0,0,0,草本
2,3020,0,0,121.451890,31.031580,0,222,2,6,1.5,0,乔木
3,3019,0,0,121.451762,31.029240,0,11,20,60,3.5,0,乔木
4,3018,0,0,121.450062,31.033594,2,30,0,0,0,0,乔木
...,...,...,...,...,...,...,...,...,...,...,...,...
2971,5,1,1,121.406714,31.226994,1,1,17.7,115,3.0,0,乔木
2972,4,1,1,121.406724,31.226958,2,1,21,89.8,6.5,0,乔木
2973,3,1,1,121.406559,31.227018,0,1,18,74,4.7,0,乔木
2974,2,1,1,121.406527,31.226913,0,1,26.3,120,5.3,0,乔木


In [101]:
train_cols = ['owner', 'longitude', 'latitude', 'likeCount']
dummy_ranks = pd.get_dummies(data['template'], prefix='template')
train_data = data[train_cols].join(dummy_ranks.loc[:,:])
train_data['intercept'] = 1.0

## 变量筛选

In [102]:
#定义向前逐步回归函数
def forward_select(data,target):
    variate = set(data.columns)  #将字段名转换成字典类型
    variate.remove(target)  #去掉因变量的字段名
    selected = []
    current_score = float('inf') 
    best_new_score = float('inf')  #目前的分数和最好分数初始值都为无穷大（因为AIC越小越好）
    #循环筛选变量
    while variate:
        aic_with_variate = []
        for candidate in variate:  #逐个遍历自变量
            formula = "{}~{}".format(target,"+".join(selected+[candidate]))  #将自变量名连接起来
            aic = ols(formula = formula,data = data).fit().aic  #利用ols训练模型得出aic值
            aic_with_variate.append((aic,candidate))  #将第每一次的aic值放进空列表
        aic_with_variate.sort(reverse = True)  #降序排序aic值
        best_new_score,best_candidate = aic_with_variate.pop()  #最好的aic值等于删除列表的最后一个值，以及最好的自变量等于列表最后一个自变量
        if current_score > best_new_score:  #如果目前的aic值大于最好的aic值
            variate.remove(best_candidate)  #移除加进来的变量名，即第二次循环时，不考虑此自变量了
            selected.append(best_candidate)  #将此自变量作为加进模型中的自变量
            current_score = best_new_score  #最新的分数等于最好的分数
            print("aic is {},continuing!".format(current_score))  #输出最小的aic值
        else:
            print("for selection over!")
            break
    formula = "{}~{}".format(target,"+".join(selected))  #最终的模型式子
    print("final formula is {}".format(formula))
    model = ols(formula = formula,data = data).fit()
    return(model)

# 利用向前逐步回归筛选变量
forward_select(data = train_data,target='owner')


aic is -1240.3587452709498,continuing!
aic is -1510.7409305560604,continuing!
aic is -1554.729806112292,continuing!
aic is -1559.0419881847583,continuing!
aic is -1559.0419881847674,continuing!
for selection over!
final formula is owner~template_1+likeCount+longitude+template_68+intercept


<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2a4c3412140>

In [103]:
train_cols = ['template_1','likeCount','longitude','template_68','intercept']

## 训练模型

In [104]:
log_reg = linear_model.LogisticRegression()
log_reg.fit(train_data[train_cols].astype(float), train_data['owner'])

In [106]:
log_reg.coef_

array([[ 5.18132203,  1.30891004, -0.05450198,  0.81946388,  0.00917449]])

### 模型为
$$log(\frac{p}{1-p}) = 5.18 * template_1 + 1.31 * likecount - 0.05 *longitude + 0.82 * template_{68} + 0.01$$

## 代回原数据进行预测

In [107]:
pred = log_reg.predict_proba(train_data[train_cols].astype(float))
accuracy = log_reg.score(train_data[train_cols].astype(float), train_data['owner'])

print("test data accuracy is :", accuracy)

test data accuracy is : 0.960013440860215


In [83]:
pre_0 = pd.Series(pred[:, 0], name='pre_0')
pre_1 = pd.Series(pred[:, 1], name='pre_1')

In [111]:
final_data = data.join(dummy_ranks.loc[:,:])
final_data.insert(2,'pre_0',pre_0)
final_data.insert(3,'pre_1',pre_1)

In [124]:
outputpath=r'D:\Repos\myprj\Plants-in-ECNU\code\statmodel\pred_tree.csv'
final_data.to_csv(outputpath,sep=',',index=False,header=True)

### 下一个最有可能被领养的植物是 : id=368

In [125]:
temp =  final_data[train_data['owner'] == 0]
max_index = temp['pre_1'].idxmax()
max_record = temp.loc[max_index,:]
max_record

id                   368
owner                  0
pre_0           0.021474
pre_1           0.978526
place                  1
                  ...   
template_218       False
template_220       False
template_221       False
template_222       False
template_223       False
Name: 2636, Length: 208, dtype: object