In [1]:
# --- Imports -----------------------------------------------------------------
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle

# --- Configuration -----------------------------------------------------------
# Point rpy2 at the local R installation.
# NOTE: this is a machine-specific path; adjust it for your environment.
os.environ['R_HOME'] = '/Users/idsl/anaconda3/Lib/R'

# Fixed seed so the row shuffle below is reproducible across kernel restarts.
SEED = 42

COLUMN_NAMES = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']
CATEGORICAL_COLUMNS = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

# --- Load --------------------------------------------------------------------
train_data = pd.read_csv('adult.data', header=None, names=COLUMN_NAMES)
# The first line of adult.test is skipped (presumably a non-data banner line).
test_data = pd.read_csv('adult.test', header=None, skiprows=1, names=COLUMN_NAMES)

# Keep an untouched copy of the raw test rows; it is used later when the
# predictions are exported next to the original records.
t_d = test_data.copy()

# --- Clean -------------------------------------------------------------------
# fnlwgt is treated as an ID-like column unrelated to the target, and
# education duplicates education-num, so both are dropped.
train_data = train_data.drop(columns=['fnlwgt', 'education'])
test_data = test_data.drop(columns=['fnlwgt', 'education'])

# Strip leading/trailing whitespace from every string cell.
train_data = train_data.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
test_data = test_data.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

# Replace the "?" placeholder with NaN.
# (pandas has no `pd.NaN` attribute; numpy's `np.nan` is the missing-value marker.)
train_data = train_data.replace("?", np.nan)
test_data = test_data.replace("?", np.nan)

# Fill missing values in the categorical columns that contain them with the
# column mode (the most frequent value), computed per data set.
columns_with_missing = ['workclass', 'occupation', 'native-country']
fill_data = {name: train_data[name].mode()[0] for name in columns_with_missing}
fill_data_test = {name: test_data[name].mode()[0] for name in columns_with_missing}

train_data = train_data.fillna(fill_data)
test_data = test_data.fillna(fill_data_test)

# Encode the target: "<=50K" -> 0, everything else -> 1.
# Labels in adult.test carry a trailing period.
train_data['income'] = train_data['income'].apply(lambda x: 0 if x == "<=50K" else 1)
test_data['income'] = test_data['income'].apply(lambda x: 0 if x == '<=50K.' else 1)

# --- Feature engineering -----------------------------------------------------
numerical_columns = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

# Min-max normalisation: fit on the training set only, then apply the same
# transform to the test set.
scaler = MinMaxScaler()

train_data_scaled = pd.DataFrame(
    scaler.fit_transform(train_data[numerical_columns]),
    columns=numerical_columns,
    index=train_data.index,  # explicit index so the assignment aligns row by row
)
train_data[numerical_columns] = train_data_scaled
test_data_scaled = pd.DataFrame(
    scaler.transform(test_data[numerical_columns]),
    columns=numerical_columns,
    index=test_data.index,
)
test_data[numerical_columns] = test_data_scaled

# One-hot encode the categorical columns.
train_data = pd.get_dummies(train_data, columns=CATEGORICAL_COLUMNS, dtype=int)
test_data = pd.get_dummies(test_data, columns=CATEGORICAL_COLUMNS, dtype=int)

# Give the test set exactly the training columns: categories that never occur
# in the test data (e.g. native-country_Holand-Netherlands) become all-zero
# columns instead of being hard-coded one by one.
test_data = test_data.reindex(columns=train_data.columns, fill_value=0)

# Drop duplicate rows. The original index labels are kept on purpose so the
# surviving test rows can still be matched back to `t_d`.
train_data = train_data.drop_duplicates()
test_data = test_data.drop_duplicates()

# Shuffle the training rows (seeded for reproducibility).
train_data = shuffle(train_data, random_state=SEED)

# Rename the dummy columns whose names contain "&" or parentheses.
feature_mapping = {
    'native-country_Trinadad&Tobago': 'native-country_Trinadad_and_Tobago',
    'native-country_Outlying-US(Guam-USVI-etc)': 'native-country_Outlying-US-Guam-USVI-etc'
}

train_data = train_data.rename(columns=feature_mapping)
test_data = test_data.rename(columns=feature_mapping)

# Put the columns of both frames in the same (alphabetical) order.
train_data = train_data.sort_index(axis=1)
test_data = test_data.sort_index(axis=1)
train_data
# Optional: export the final column names.
# columns = train_data.columns
# with open("columns.csv", "w") as f:
#     f.write("\n".join(columns))



Unnamed: 0,age,capital-gain,capital-loss,education-num,hours-per-week,income,marital-status_Divorced,marital-status_Married-AF-spouse,marital-status_Married-civ-spouse,marital-status_Married-spouse-absent,...,sex_Female,sex_Male,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay
7096,0.397260,0.0,0.365014,0.800000,0.397959,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0
24332,0.328767,0.0,0.000000,0.600000,0.204082,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0
8196,0.095890,0.0,0.000000,0.600000,0.489796,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
14924,0.356164,0.0,0.000000,0.600000,0.142857,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
13561,0.232877,0.0,0.000000,0.800000,0.479592,1,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19123,0.041096,0.0,0.000000,0.533333,0.448980,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
5410,0.054795,0.0,0.362489,0.666667,0.397959,0,0,0,1,0,...,1,0,0,0,0,1,0,0,0,0
32252,0.273973,0.0,0.000000,0.800000,0.500000,0,0,0,1,0,...,1,0,0,0,0,0,0,1,0,0
21388,0.095890,0.0,0.000000,0.000000,0.295918,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0


In [2]:
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr
## If a package cannot be installed the usual way, download the required
## build directly from the official CRAN site.
## The install is skipped when stringr is already available, so re-running the
## notebook does not download and reinstall the package every time.
robjects.r('''
if (!requireNamespace("stringr", quietly = TRUE)) {
    install.packages('https://cran.ma.imperial.ac.uk/bin/windows/contrib/3.6/stringr_1.4.0.zip', repos=NULL, type='source')
}
''')

R[write to console]: trying URL 'https://cran.ma.imperial.ac.uk/bin/windows/contrib/3.6/stringr_1.4.0.zip'

R[write to console]: Content type 'application/zip'
R[write to console]:  length 216784 bytes (211 KB)

R[write to console]: downloaded 211 KB




In [3]:
# Load the R packages used by the following cells (same load order as before).
utils, C50, partykit = [importr(pkg_name) for pkg_name in ("utils", "C50", "partykit")]

# Switch the R session to the "C" locale (original note: ignore error
# messages), then attach C50 inside the R session itself.
for r_statement in ('Sys.setlocale("LC_ALL", "C")', 'library(C50)'):
    robjects.r(r_statement)

# Turn on the pandas <-> R conversion.
pandas2ri.activate()

# Convert both pandas frames into R data frames ...
r_train_data = pandas2ri.py2rpy(train_data)
r_test_data = pandas2ri.py2rpy(test_data)

# ... and bind them to variables of the same name in the R global environment.
for r_name, r_frame in (("r_train_data", r_train_data), ("r_test_data", r_test_data)):
    robjects.r.assign(r_name, r_frame)

# Keep the true test labels in the R variable `col` for the later evaluation.
robjects.r('col<-r_test_data$"income"')


In [4]:

# Rename the target column to "class" and turn it into a factor so that C5.0
# fits a classification model.
robjects.r('colnames(r_train_data)[colnames(r_train_data) == "income"] <- "class"')
robjects.r('r_train_data$class <- as.factor(r_train_data$class)')

# Predictor frame: every column except the target.
robjects.r('subset_data <- r_train_data[, !colnames(r_train_data) %in% "class"]')

# Boosted C5.0 model with 8 trials.
# CF is written as the number 0; the previous `CF = FALSE` is the same value
# in R, but the parameter is numeric, not a flag.
# `label` names the outcome in the printed model output.
robjects.r('model <- C5.0(x = subset_data, y = r_train_data$class, trials = 8, control = C5.0Control(CF = 0, label = "income"))')

# Python-side handle on the fitted R model.
model = robjects.globalenv['model']


In [5]:
# Test predictors: every column except the target.
robjects.r('subset_data_test <- r_test_data[, !colnames(r_test_data) %in% "income"]')
r_predictions = robjects.r('''
    # Apply the model to adult.test
    predictions <- predict(model, newdata = subset_data_test)
    # Confusion table (predicted vs. true labels), kept for inspection
    freq1 <- table(predictions, col)

    # Accuracy as the share of matching labels. Comparing element-wise does
    # not depend on the confusion table being square with aligned rows and
    # columns, which diag(freq1) silently assumes.
    accuracy <- mean(as.character(predictions) == as.character(col))
    print(accuracy)
''')

[1] 0.8647565


In [6]:
from openpyxl import Workbook

# Fetch the R predictions and convert them to a pandas object.
predictions = robjects.globalenv['predictions']

with (robjects.default_converter + pandas2ri.converter).context():
    pd_from_r_df = robjects.conversion.get_conversion().rpy2py(predictions)

# The model was scored on the de-duplicated `test_data`, whereas `t_d` still
# holds every raw test row. `drop_duplicates` keeps the original index labels,
# so select the surviving raw rows by label. Pairing prediction i with
# `t_d.iloc[i]` would attach predictions to the wrong records as soon as one
# duplicate row had been dropped.
if len(pd_from_r_df) != len(test_data):
    raise ValueError(
        f"{len(pd_from_r_df)} predictions but {len(test_data)} test rows; "
        "re-run the preprocessing and prediction cells"
    )
original_rows = t_d.loc[test_data.index]

wb = Workbook()
ws = wb.active
ws.append(['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income','Predict result'])

# One sheet row per scored record: the raw values followed by the predicted label.
for raw_values, predicted in zip(original_rows.itertuples(index=False, name=None), pd_from_r_df):
    result = '<=50K.' if predicted == '0' else '>50K.'
    ws.append(list(raw_values) + [result])
wb.save('Adult_dt_C50.xlsx')

print(pd_from_r_df)

['0', '0', '0', '1', '0', ..., '0', '0', '1', '0', '1']
Length: 15217
Categories (2, object): ['0', '1']
