In [86]:
import pandas as pd
import numpy as np
# Read the raw dataset from disk and preview its first rows.
narrative_csv = "Narrativedata.csv"
data = pd.read_csv(narrative_csv)
data.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,No
1,38.0,female,C,Yes
2,26.0,female,S,Yes
3,35.0,female,S,Yes
4,35.0,male,S,No


In [87]:
# Fill missing values with SimpleImputer.
from sklearn.impute import SimpleImputer

# SimpleImputer works on 2-D input, so every column is extracted as an
# (n_samples, 1) array before it is passed to the imputer.

# Age: missing entries are replaced with the median.
Age = data[['Age']].values
imp_median_age = SimpleImputer(strategy='median')
data['Age'] = imp_median_age.fit_transform(Age)

# Embarked: missing entries are replaced with the most frequent value.
Embarked = data[['Embarked']].values
imp_mode = SimpleImputer(strategy='most_frequent')
data['Embarked'] = imp_mode.fit_transform(Embarked)
imp_mode_Embarked = imp_mode  # fit() hands back the estimator itself

# Survived (the label): entries equal to the string 'Unknown' are treated as
# missing and replaced with the most frequent value.
Survived = data[['Survived']].values
imp_s = SimpleImputer(missing_values='Unknown', strategy='most_frequent')
data['Survived'] = imp_s.fit_transform(Survived)
imp_sf = imp_s  # fit() hands back the estimator itself

In [30]:
# Encoding and dummy variables
#
# preprocessing.LabelEncoder is intended for labels: it converts class values
# into integer class codes.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(data['Survived'])

# Encode the label column and write the integer codes back into the frame.
y = le.transform(data['Survived'])
data['Survived'] = y

In [31]:
# preprocessing.OneHotEncoder: one-hot encoding, i.e. creating dummy variables.
from sklearn.preprocessing import OneHotEncoder

categorical_cols = ['Sex', 'Embarked']
data_ = data[categorical_cols]

# Fit the encoder and encode the same columns in one step.
onehot = OneHotEncoder()
onehot_result = onehot.fit_transform(data_)
onehot_f = onehot  # fitted encoder, kept under the name later cells use

In [36]:
# Turn the sparse one-hot matrix into a DataFrame whose columns carry the
# original category values. np.concatenate joins the per-feature category
# arrays for any number of encoded features (np.append(*arrays) only works
# when there are exactly two). Reusing data.index keeps the rows aligned in
# the concat below even if `data` does not have a default RangeIndex.
onehot_df = pd.DataFrame(
    onehot_result.toarray(),
    columns=np.concatenate(onehot_f.categories_),
    index=data.index,
)

# Replace the raw categorical columns with their dummy columns.
data_new = pd.concat([data.drop(['Sex', 'Embarked'], axis=1), onehot_df], axis=1)

# Move the label to the last position. It is selected by name rather than by
# position so the reordering stays correct if the column layout changes.
label = 'Survived'
col_name = [col for col in data_new.columns if col != label]
col_name.append(label)

# Reorder the columns according to col_name.
data_new = data_new.reindex(col_name, axis=1)

In [37]:
# Display the assembled frame (the reindex in the previous cell puts the label last).
data_new

Unnamed: 0,Age,female,male,C,Q,S,Survived
0,22.0,0.0,1.0,0.0,0.0,1.0,0
1,38.0,1.0,0.0,1.0,0.0,0.0,1
2,26.0,1.0,0.0,0.0,0.0,1.0,1
3,35.0,1.0,0.0,0.0,0.0,1.0,1
4,35.0,0.0,1.0,0.0,0.0,1.0,0
...,...,...,...,...,...,...,...
886,27.0,0.0,1.0,0.0,0.0,1.0,0
887,19.0,1.0,0.0,0.0,0.0,1.0,1
888,28.0,1.0,0.0,0.0,0.0,1.0,0
889,26.0,0.0,1.0,1.0,0.0,0.0,0


In [51]:
# Drop one dummy column per categorical feature ('male' for Sex, 'C' for
# Embarked). The remaining 'female' indicator is then renamed to 'Sex', so
# Sex == 1.0 means female. Renaming 'male' here would silently do nothing,
# because that column has just been dropped.
data_new2 = (
    data_new
    .drop(['male', 'C'], axis=1)
    .rename({'female': 'Sex'}, axis=1)
)

In [96]:
# Preview the first five rows of the full dummy-encoded frame.
data_new.head(5)

Unnamed: 0,Age,female,male,C,Q,S,Survived
0,22.0,0.0,1.0,0.0,0.0,1.0,0
1,38.0,1.0,0.0,1.0,0.0,0.0,1
2,26.0,1.0,0.0,0.0,0.0,1.0,1
3,35.0,1.0,0.0,0.0,0.0,1.0,1
4,35.0,0.0,1.0,0.0,0.0,1.0,0


In [52]:
from sklearn.linear_model import LogisticRegression

In [102]:
model1 = LogisticRegression(fit_intercept = True)
model1.fit(data_new.iloc[:, :-1],data_new.iloc[:, -1])
model1.coef_, model1.intercept_

(array([[-0.00943076,  1.11412033, -1.13250849,  0.57131052, -0.278493  ,
         -0.31120568]]), array([-0.01926271]))

In [103]:
# Logistic regression on the reduced encoding (one dummy column dropped per
# categorical feature): last column is the label, the rest are features.
features_reduced = data_new2.iloc[:, :-1]
target_reduced = data_new2.iloc[:, -1]

model2 = LogisticRegression(fit_intercept=True)
model2.fit(features_reduced, target_reduced)
model2.coef_, model2.intercept_

(array([[-0.00935626,  2.21254895, -0.77261837, -0.84125   ]]),
 array([-0.60305483]))

In [69]:
# Training-set accuracy of both models, each scored on the frame it was fit on.
score_full = model1.score(data_new.iloc[:, :-1], data_new.iloc[:, -1])
score_reduced = model2.score(data_new2.iloc[:, :-1], data_new2.iloc[:, -1])
score_full, score_reduced

(0.7732884399551067, 0.7732884399551067)

In [113]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree on the full dummy encoding (all columns but the last are
# features, the last is the label). random_state is fixed because the tree
# permutes features at every split; without a seed the importances and
# scores can change between runs, which makes the model1/model2 comparison
# unreliable.
model1 = DecisionTreeClassifier(random_state=0)
model1.fit(data_new.iloc[:, :-1], data_new.iloc[:, -1])
model1.feature_importances_

array([0.41616026, 0.        , 0.52115121, 0.02269744, 0.01558944,
       0.02440165])

In [114]:
# Decision tree on the reduced encoding (all columns but the last are
# features, the last is the label). random_state is fixed because the tree
# permutes features at every split; without a seed the importances and
# scores can change between runs.
model2 = DecisionTreeClassifier(random_state=0)
model2.fit(data_new2.iloc[:, :-1], data_new2.iloc[:, -1])
model2.feature_importances_

array([0.42820719, 0.52115121, 0.01847009, 0.03217152])

In [115]:
# Training-set accuracy of both models, each scored on the frame it was fit on.
score_full = model1.score(data_new.iloc[:, :-1], data_new.iloc[:, -1])
score_reduced = model2.score(data_new2.iloc[:, :-1], data_new2.iloc[:, -1])
score_full, score_reduced

(0.8305274971941639, 0.8305274971941639)

In [116]:
# SMOTE 算法
# 读数据
import numpy as np
import pandas as pd
# Read the order table (UTF-8 encoded CSV) and preview its first rows.
table_csv = 'table_1.csv'
table = pd.read_csv(table_csv, encoding='utf-8')
table.head()

Unnamed: 0,uid,roomid,orderlabel,star,rank,returnvalue,price_deduct,basic_maxarea,roomservice_1,roomservice_2,...,roomservice_6,roomservice_7,roomservice_8,basic_week_ordernum_ratio,basic_recent3_ordernum_ratio,basic_comment_ratio,basic_30days_ordnumratio,basic_30days_realratio,room_30days_ordnumratio,room_30days_realratio
0,USER_545,ROOM_1818649,0,11,9,410,2054,71.0,2,0,...,0,0,3,0.010893,0.0,0.03088,0.038793,1.246535,0.029095,1.253572
1,USER_545,ROOM_1818645,0,11,5,200,1754,71.0,2,0,...,0,0,1,0.010893,0.0,0.03088,0.038793,1.246535,0.029095,1.253572
2,USER_545,ROOM_1818667,0,11,7,200,4196,71.0,2,0,...,0,0,2,0.010893,0.0,0.03088,0.038793,1.246535,,
3,USER_545,ROOM_18188319,0,11,19,200,1769,77.0,2,0,...,0,0,2,0.77342,0.820388,0.670532,0.693966,1.000773,,
4,USER_545,ROOM_1818515,0,11,5,200,7196,121.0,2,0,...,0,0,2,0.006536,0.0,0.004159,0.002155,1.864469,,


In [117]:
# Fill the missing values (NaN) of the dataset with column means.
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Label column, and features taken positionally from the fourth column onward.
Y = table['orderlabel']
X = table.iloc[:, 3:]

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3, random_state=0)

# Feature names, one per column of X. X is already sliced from the fourth
# column, so slicing again with [3:] would drop three real feature names and
# no longer match the width of the imputed arrays below.
columns = X.columns

# The means are learned on the training split only and then applied to both
# splits. SimpleImputer.transform returns plain arrays, so Xtrain and Xtest
# are no longer DataFrames after this point.
SI = SimpleImputer(missing_values=np.nan, strategy='mean').fit(Xtrain)
Xtrain = SI.transform(Xtrain)
Xtest = SI.transform(Xtest)

In [123]:
# Share of each class among the training labels.
train_label_counts = Ytrain.value_counts()
train_label_counts / train_label_counts.sum()

0    0.972174
1    0.027826
Name: orderlabel, dtype: float64

In [73]:
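# Using the SMOTE algorithm
# SMOTE = Synthetic Minority Over-sampling Technique
# It synthesizes new minority-class samples by interpolation.
# Let T be the number of minority-class samples; SMOTE synthesizes N*T new minority samples. N is a positive integer; if N is less than 1, the algorithm treats the minority class as T = N*T and sets N = 1.
# 1. For a given sample among the T minority samples, find its K nearest neighbours.
# 2. Randomly pick one same-class sample from those K neighbours, then draw a random number between 0 and 1 and use it to generate a value between xi and x_knn.
# 3. The result is a randomly synthesized sample, not one obtained by re-sampling the existing data.
# 4. In other words, KNN is used to select the k sample points nearest to a given sample point, and M sample points are randomly chosen from those k.
# 5. How many sample points M are chosen depends on the desired balance ratio.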

#pip install imblearn -i https://pypi.tuna.tsinghua.edu.cn/simple

In [74]:
# Balance the training data with SMOTE.
from sklearn import model_selection, tree, metrics
from imblearn.over_sampling import SMOTE

over_samples = SMOTE(random_state=1234)

# fit_resample is the supported resampling entry point; the older fit_sample
# alias has been removed from imbalanced-learn.
over_samples_x, over_samples_y = over_samples.fit_resample(Xtrain, Ytrain)

# NOTE(review): the test split is oversampled here as well, which is kept so
# later cells still find over_x_test / over_y_test. Metrics computed on a
# synthetically balanced test set do not reflect the real class distribution;
# consider evaluating on the untouched Xtest / Ytest instead.
over_x_test, over_y_test = over_samples.fit_resample(Xtest, Ytest)

In [124]:
# Class counts after oversampling, for the training and the test labels.
train_balance = over_samples_y.value_counts()
test_balance = over_y_test.value_counts()
train_balance, test_balance

(1    745053
 0    745053
 Name: orderlabel, dtype: int64, 1    319249
 0    319249
 Name: orderlabel, dtype: int64)

In [125]:
# pipeline简单的介绍
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Download the Wisconsin breast-cancer data; the file has no header row.
wdbc_url = ('https://archive.ics.uci.edu/ml/machine-learning-databases/'
            'breast-cancer-wisconsin/wdbc.data')
data = pd.read_csv(wdbc_url, header=None)

In [126]:
# Features are every column from the third onward; the label is the second
# column. The features are selected before converting to an array and cast to
# float: calling .values on the whole frame mixes the label column with the
# numeric ones, which yields an object-dtype feature matrix.
X = data.iloc[:, 2:].values.astype(float)
Y = data.iloc[:, 1].values

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,
                                                Y,
                                                test_size = 0.3,
                                                random_state= 420)

In [135]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA

from sklearn.pipeline import Pipeline
# Chain scaling, variance filtering, a 2-component PCA and the classifier
# into a single estimator, then fit it on the training split.
pipeline_steps = [
    ('preprocess', StandardScaler()),
    ('Var', VarianceThreshold(threshold=0.5)),
    ('pca', PCA(n_components=2)),
    ('clf', LogisticRegression()),
]
pipe = Pipeline(pipeline_steps)
pipe.fit(Xtrain, Ytrain)

Pipeline(steps=[('preprocess', StandardScaler()),
                ('Var', VarianceThreshold(threshold=0.5)),
                ('pca', PCA(n_components=2)), ('clf', LogisticRegression())])

In [136]:
# Accuracy of the fitted pipeline on the training and the test split.
train_accuracy = pipe.score(Xtrain, Ytrain)
test_accuracy = pipe.score(Xtest, Ytest)
train_accuracy, test_accuracy

(0.9597989949748744, 0.9532163742690059)

In [85]:
class frank_transform:
    """Skeleton of a custom transformer using the fit/transform convention.

    The class holds no state and performs no computation yet: ``fit`` learns
    nothing and ``transform`` passes its input through unchanged. It is a
    template to fill in.

    NOTE(review): to place this inside a scikit-learn ``Pipeline`` it will
    probably also need to inherit from ``BaseEstimator`` / ``TransformerMixin``
    so it can be cloned -- confirm before use.
    """

    def __init__(self):
        pass

    def fit(self, X=None, y=None):
        """Accept the training data and return the transformer itself.

        Parameters
        ----------
        X : optional
            Training features; currently ignored.
        y : optional
            Training labels; currently ignored.

        Returns
        -------
        frank_transform
            ``self``, so calls can be chained as ``fit(X).transform(X)``.
        """
        return self

    def transform(self, X=None):
        """Return ``X`` unchanged (identity placeholder).

        Parameters
        ----------
        X : optional
            Data to transform. Defaults to ``None`` so the old zero-argument
            call still works and still yields ``None``.

        Returns
        -------
        The object passed as ``X``.
        """
        return X