In [2]:
import random, re, csv
import numpy as np
import math
from simdata import *
# for pre-processing data
from sklearn.preprocessing import StandardScaler
# for kmeans
from scipy import stats
from sklearn.cluster import KMeans
# for svm
from sklearn import metrics,svm
#for evaluation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
# for visualization
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

In [26]:
# Disabled cell: everything between the triple quotes below is one bare string
# literal, so none of this synthetic-data generation code is executed.
# NOTE(review): presumably this is how ../resources/gamedata/syndata.csv was
# produced — confirm before re-enabling.
# NOTE(review): if re-enabled, `syn_prel.reshape(1000,1)` hard-codes the value
# of `sample_num`; it would break as soon as `sample_num` is changed.
"""
#==============================================
# make synthic data
#==============================================
sample_num=1000
d_ratio=0.5
dsample_num=math.ceil(sample_num*d_ratio)
ndsample_num=sample_num-dsample_num

#depressive:  OR:[0.6,0,9], CR:[0.4,1.0], IR:[0.5,0.7], RR:[0.4,0.6]
#no_depessive: OR:[0.3,0.5], CR:[0.6,1.0], IR:[0.1,0.3], RR:[0.1,0.3]
bhf_range={'or':[6,10,3,6],"cr":[4,11,6,11],"ir":[5,8,1,4],"rr":[4,7,1,4]}

syn_bhdata=make_syn_bhdata(sample_num,d_ratio,bhf_range)

syn_bhf=syn_bhdata["syn_bhf"]
syn_prel=syn_bhdata["syn_pre_bhl"]

nbhf_num=10
syn_nbhf=make_syn_nbhf(nbhf_num,sample_num)

synf=np.insert(syn_nbhf,[0],syn_bhf,axis=1)
syndata=np.insert(syn_prel.reshape(1000,1),[0],synf,axis=1)

syndata_filedir="../resources/gamedata/syndata.csv"
create_csvfile(syndata,syndata_filedir)
"""

#=================================
# Load the mixed dataset from a csv file. Two files are declared:
#   1. ipld mixdata
#   2. syn mixdata   (the one actually loaded below)
# Row layout: [interdata, biodata, prelabel]
#=================================
syndata_filedir="../resources/gamedata/syndata.csv"
ipldmixdata_sim_filedir='../resources/gamedata/ipldmixdata_sim.csv'

# To analyse the ipld data instead, pass ipldmixdata_sim_filedir to get_mldata.
mixdata = np.asarray(get_mldata(syndata_filedir), dtype=float)

#==============================================
# Separate features from labels and standardise the features for k-means
#==============================================
# The last column holds the pre-label; every column before it is a feature.
mixdata_prelabel = mixdata[:, -1]
mixdata_feature = mixdata[:, :-1]

# The fitted scaler is kept so later data can be transformed the same way.
mixdata_scaler = StandardScaler()
mixdata_feature_nd = mixdata_scaler.fit_transform(mixdata_feature)



In [27]:
#-----------------------------------------------------
# aim: transform one new sample with a scaler that was fitted on old data
# in: new_data: one sample of n values (np.array or any array-like)
#     olddata_scaler: data scaler exposing transform()
# out: transformed data (whatever olddata_scaler.transform returns for a 1*n input)
#-----------------------------------------------------
def trans_data(new_data,olddata_scaler):
    """Transform a single sample with an already-fitted scaler.

    Parameters
    ----------
    new_data : array-like
        The values of one sample. It is flattened into a single row of
        shape (1, n) before being handed to the scaler.
    olddata_scaler : object
        Any object with a ``transform(X)`` method.

    Returns
    -------
    The result of ``olddata_scaler.transform`` applied to the (1, n) row.
    """
    # np.asarray lets plain lists/tuples through; the original code called
    # .reshape directly on the argument and therefore only accepted ndarrays.
    sample_row = np.asarray(new_data).reshape(1, -1)
    return olddata_scaler.transform(sample_row)




In [32]:
#-----------------------------------------------------
# aim: build km cluster and svm classifier from data
# in: bhdata (clustered by k-means), nbhdata (features for the svm);
#     subset_num (number of clusters)
#     random_state (seed for k-means, default 0), svm_c (SVC penalty C, default 1)
# out: dict={"clu":cluster_km,"clf":classifier_svm}
#-----------------------------------------------------
def build_analyser(bhdata,nbhdata,subset_num,random_state=0,svm_c=1):
    """Fit a k-means clusterer on ``bhdata`` and an SVM that predicts those clusters from ``nbhdata``.

    Parameters
    ----------
    bhdata : array-like
        Data passed to ``KMeans.fit``.
    nbhdata : array-like
        Data passed to ``SVC.fit`` as features; the targets are the k-means
        labels of ``bhdata``, so both inputs must describe the same samples
        in the same order.
    subset_num : int
        Number of k-means clusters.
    random_state : int, optional
        Seed forwarded to ``KMeans`` (default 0, the previously hard-coded value).
    svm_c : float, optional
        ``C`` forwarded to ``svm.SVC`` (default 1, the previously hard-coded value).

    Returns
    -------
    dict
        ``{"clu": fitted KMeans, "clf": fitted SVC}``.
    """
    # Cluster from kmeans
    cluster_km = KMeans(n_clusters=subset_num, random_state=random_state).fit(bhdata)
    # The cluster id of every training row becomes the SVM target.
    cluster_result = cluster_km.labels_
    # Classifier from SVM, trained on the whole of nbhdata (no hold-out split here).
    classifier_svm = svm.SVC(C=svm_c).fit(nbhdata, cluster_result)
    return {"clu": cluster_km, "clf": classifier_svm}

In [33]:
# Hold out 20% of the standardised samples for testing.
# random_state pins the shuffle so the split (and every result derived from it)
# is the same on each Restart & Run All; without it the split changes per run.
(mixtrain_feature,
 mixtest_feature,
 mixtrain_label,
 mixtest_label) = train_test_split(
    mixdata_feature_nd, mixdata_prelabel, test_size=.2, random_state=0)

In [34]:
#==================================
# Build processors from the original (noise-free) training data
#==================================
# Columns 0..3 are clustered; columns 5.. feed the classifier.
# NOTE(review): column 4 is used by neither slice — confirm that skipping it
# is intended and not an off-by-one (`4:` vs `5:`).
processors_ori = build_analyser(
    mixtrain_feature[:, :4],
    mixtrain_feature[:, 5:],
    2,
)
clfer_ori, cluer_ori = processors_ori["clf"], processors_ori['clu']

In [35]:
# Assign every held-out sample to a cluster from its first four feature columns,
# then print the adjusted Rand score between those cluster ids and the pre-labels.
clu_result = cluer_ori.predict(mixtest_feature[:, :4])
print(
    "similarity test: ",
    metrics.adjusted_rand_score(clu_result, mixtest_label),
)

similarity test:  1.0


In [None]:
#==================================
# make noise
#==================================
#depressive:  OR:[0.6,0.9], CR:[0.4,1.0], IR:[0.5,0.7], RR:[0.4,0.6]
#non_depressive: OR:[0.3,0.5], CR:[0.6,1.0], IR:[0.1,0.3], RR:[0.1,0.3]

# make uniform noise(int) for behaviourdata
num_sample = mixdata_feature.shape[0]

# One draw of shape (num_sample, 4): integers in 0..10 (randint's upper bound
# 11 is exclusive). Four columns is the same width as the `[:,0:4]` slice that
# is clustered elsewhere in this notebook. This replaces building the array
# column by column with repeated np.insert calls, each of which copied the
# whole array.
bhf_noise = np.random.randint(0, 11, size=(num_sample, 4))



In [None]:
# Standardise the noise columns, then fit the cluster/classifier pair with the
# noise as the clustered data and columns 5.. of the real features for the SVM.
data_scaler = StandardScaler()
bhf_noise_nd = data_scaler.fit_transform(bhf_noise)
all_processors_noise = build_analyser(
    bhf_noise_nd,
    mixdata_feature_nd[:, 5:],
    2,
)

In [None]:
# Pull the fitted classifier and clusterer out of the noise-trained pair.
clfer_noise, cluer_noise = all_processors_noise["clf"], all_processors_noise['clu']

In [None]:
# Cluster ids of every sample (train and test) from the first four feature columns.
clu_result_ori = cluer_ori.predict(mixdata_feature_nd[:, 0:4])

# Number of samples whose pre-label equals 1.0, counted in one vectorised pass
# instead of a Python-level loop over the array.
one_num = int(np.count_nonzero(mixdata_prelabel == 1.0))

print(one_num)

In [None]:
# make new classifier
# NOTE(review): `noised_bhf` is not assigned in any cell visible in this
# notebook, so this cell raises NameError on a fresh kernel — confirm which
# array was meant (e.g. the noise built above, or features with noise added).
all_processors = build_analyser(
    noised_bhf,
    mixdata_feature_nd[:, 5:],
    2,
)

In [None]:
# Draw the noisy series and the raw series on the same axes.
# NOTE(review): `noisydata` and `rawdata` are not assigned in any cell visible
# in this notebook — confirm where they are supposed to come from.
num_data = np.arange(noisydata.size)
plt.plot(num_data, noisydata, 'ro-', label='Noisy Data')
plt.plot(rawdata, 'bo-', label='Raw Data')
plt.title('Uniform Noise Analysis')  # figure title
plt.xlabel('Offer type')  # label for the x axis
plt.ylabel('Offer Value')  # label for the y axis
plt.legend()  # show the legend
plt.show()

In [None]:
#=========================
# add noise
#=========================
# name:get_uninoisylabel
# fun: overwrite the leading part of a label array with uniform random 0/1 labels
# in: rawlabel(np.array or array-like of labels),
#     noiselevel(percentage, 0..100, of the labels to overwrite)
# out: label_uninoise(np.array, same number of elements as rawlabel)
def get_uninoisylabel(rawlabel,noiselevel):
    """Replace the first ``noiselevel`` percent of the labels with random 0/1 values.

    Parameters
    ----------
    rawlabel : array-like
        Original labels. Only the leading ``int(noiselevel/100 * size)``
        entries are replaced; the remaining entries are kept in order.
    noiselevel : int or float
        Percentage of labels to overwrite, between 0 and 100 inclusive.

    Returns
    -------
    numpy.ndarray
        1-D array: the random labels followed by the untouched tail of
        ``rawlabel``.

    Raises
    ------
    ValueError
        If ``noiselevel`` is outside 0..100. Above 100 the result would be
        longer than the input; below 0 the requested noise size is negative.
    """
    if not 0 <= noiselevel <= 100:
        raise ValueError("noiselevel must be between 0 and 100, got %r" % (noiselevel,))
    labels = np.asarray(rawlabel)
    noisenum = int(noiselevel / 100 * labels.size)
    # randint's upper bound is exclusive: randint(0, 1) can only ever return 0,
    # so the bound has to be 2 to draw uniformly from {0, 1}.
    uninoise = np.random.randint(0, 2, size=noisenum)
    label_uninoise = np.append(uninoise, labels[noisenum:])
    return label_uninoise


In [None]:
# Percentage of training labels handed to get_uninoisylabel for overwriting.
# NOTE(review): train_features/train_labels/test_features/test_labels are not
# assigned in any cell visible in this notebook — confirm their origin.
noiselevel = 40
train_labels_uninoise = get_uninoisylabel(train_labels, noiselevel)

#============================
# classify interdata by SVM
#============================
clf_svm = svm.SVC(C=1).fit(train_features, train_labels_uninoise)
labels_pre_svm = clf_svm.predict(test_features)

#============================
# Evaluation
#============================
# Accuracy of the noisy-label classifier, scored against test_labels.
acc_svm = accuracy_score(test_labels, labels_pre_svm)
print("acc_svm:", acc_svm)


In [None]:
# Two-bin histogram of `s`, normalised to a probability density, with a red
# reference line at height 1.
# `density=True` replaces `normed=True`, which newer matplotlib releases no
# longer accept.
# NOTE(review): `s` is only assigned in a cell further down this notebook, so
# on a fresh kernel that cell has to run before this one.
count, bins, ignored = plt.hist(s, 2, density=True)
plt.plot(bins, np.ones_like(bins), linewidth=2, color='r')
plt.show()
print(s)

In [None]:
# Toy arrays: bhdata is 10x3, nbhdata is 10x2, data_indx is a 10x1 column 1..10.
bhdata = np.concatenate([np.arange(15, dtype=float), np.arange(20, 35)]).reshape(10, 3)
nbhdata = np.concatenate([np.arange(10, 11, 0.1), np.arange(20, 23, 0.3)]).reshape(10, 2)
data_indx = np.arange(1, bhdata.shape[0] + 1).reshape(-1, 1)

# alldata rows are [indx, bhdata (3 cols), nbhdata (2 cols)] -> 10x6
alldata = np.hstack([data_indx, bhdata, nbhdata])

# Show only the bhdata columns of the combined array.
print(alldata[:, 1:4])

In [None]:
# Fit a two-cluster k-means on bhdata, then read back the cluster id of each row.
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(bhdata)
kresult = kmeans.predict(bhdata)

In [None]:
# 1000 samples drawn uniformly from the half-open interval [-1, 0).
s = np.random.uniform(low=-1, high=0, size=1000)