In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import matplotlib

# 学習用データの読み込み
train = pd.read_table("train.tsv" ) 
# 評価用データの読み込み
test = pd.read_table("test.tsv")
# 応募用データの読み込み
sub=pd.read_csv('./sample_submit.csv')

# 学習用,評価用のidの切り出し。
df_id_train = train["id"]
df_id_test = test["id"]
print(df_id_train )
print(df_id_test)

# idの削除　
df_train = train.drop("id", axis=1)
df_test = test.drop("id", axis=1)
print(df_train )
print(df_test)

0          0
1          2
2          3
3          5
4          7
        ... 
4057    8110
4058    8113
4059    8117
4060    8118
4061    8121
Name: id, Length: 4062, dtype: int64
0          1
1          4
2          6
3          8
4          9
        ... 
4057    8116
4058    8119
4059    8120
4060    8122
4061    8123
Name: id, Length: 4062, dtype: int64
      Y cap-shape cap-surface cap-color bruises odor gill-attachment  \
0     p         f           y         n       f    f               f   
1     p         f           y         y       f    f               f   
2     e         b           y         w       t    l               f   
3     p         x           s         b       t    f               f   
4     p         x           s         w       t    f               f   
...  ..       ...         ...       ...     ...  ...             ...   
4057  p         f           y         n       f    s               f   
4058  p         x           f         y       f    f            

In [2]:
# データに欠損値が無いか確認
# 欠損値が見つかった場合、平均や中央値などで補完したり、また、欠損値があまりに多い特徴は丸ごと削除するなど、対応が必要。
train.isnull().sum()

id                          0
Y                           0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [3]:
#順序尺度以外をダミー変数化
# ダミー変数を使用する場合は、変数同士の相関が高いと多重共線性という問題が発生するため、配列から列の1つを削除して使用しなければならない。
# get_dummieのdrop_firstパラメータにTrueを渡すと最初の列を削除できる。
dummie_data_train = pd.get_dummies(train, drop_first=True, columns=["cap-shape","cap-surface","cap-color","bruises","odor","gill-attachment","gill-color","stalk-shape","stalk-root","stalk-surface-above-ring","stalk-surface-below-ring","stalk-color-above-ring","stalk-color-below-ring","veil-type","veil-color","ring-type","spore-print-color","habitat"])

In [4]:
# 順序尺度をカテゴリー変数化
from sklearn import preprocessing
for column in ["Y","gill-spacing","gill-size","ring-number","population"]:
    selected_column = dummie_data_train[column]
#  LabelEncoder() は，文字列や数値で表されたラベルを， 0~(ラベル種類数-1) までの数値に変換
    le = preprocessing.LabelEncoder()
# le.fit()で変換したいデータを選択。
    le.fit(selected_column)
# le.transform()で数値へ変換。
    column_le = le.transform(selected_column)
    dummie_data_train [column] = pd.Series(column_le).astype('category')

In [5]:
#p=1(毒キノコ),e=(食用キノコ)
dummie_data_train.head()

Unnamed: 0,id,Y,gill-spacing,gill-size,ring-number,population,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,...,spore-print-color_r,spore-print-color_u,spore-print-color_w,spore-print-color_y,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,0,1,0,1,1,4,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
1,2,1,0,0,1,5,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2,3,0,0,0,1,2,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,5,1,0,0,1,4,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,7,1,0,0,1,4,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [6]:
dummie_data_test = pd.get_dummies(test, drop_first=True, columns=["cap-shape","cap-surface","cap-color","bruises","odor","gill-attachment","gill-color","stalk-shape","stalk-root","stalk-surface-above-ring","stalk-surface-below-ring","stalk-color-above-ring","stalk-color-below-ring","veil-type","veil-color","ring-type","spore-print-color","habitat"])

In [7]:
from sklearn import preprocessing
for column in ["gill-spacing","gill-size","ring-number","population"]:
    selected_column = dummie_data_test[column]
#  LabelEncoder() は，文字列や数値で表されたラベルを， 0~(ラベル種類数-1) までの数値に変換
    le = preprocessing.LabelEncoder()
# le.fit()で変換したいデータを選択。
    le.fit(selected_column)
# le.transform()で数値へ変換。
    column_le = le.transform(selected_column)
    dummie_data_test [column] = pd.Series(column_le).astype('category')

In [8]:
dummie_data_test.head()

Unnamed: 0,id,gill-spacing,gill-size,ring-number,population,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,...,spore-print-color_r,spore-print-color_u,spore-print-color_w,spore-print-color_y,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,1,0,0,1,4,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
1,4,0,0,1,5,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,0,0,1,5,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,8,0,0,1,4,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,9,0,1,1,4,0,0,0,0,1,...,0,0,1,0,0,1,0,0,0,0


In [9]:
# 目的変数の切り出し
df_train = dummie_data_train.drop("id", axis=1)
print(df_train)
y = df_train["Y"]
print(y)

      Y gill-spacing gill-size ring-number population  cap-shape_c  \
0     1            0         1           1          4            0   
1     1            0         0           1          5            0   
2     0            0         0           1          2            0   
3     1            0         0           1          4            0   
4     1            0         0           1          4            0   
...  ..          ...       ...         ...        ...          ...   
4057  1            0         1           1          4            0   
4058  1            0         0           1          4            0   
4059  1            0         1           1          4            0   
4060  1            0         1           1          4            0   
4061  1            0         1           1          4            0   

      cap-shape_f  cap-shape_k  cap-shape_s  cap-shape_x  ...  \
0               1            0            0            0  ...   
1               1            

In [10]:
#ランダムフォレストを実行
from sklearn.ensemble import RandomForestClassifier
# モデルの構築
model1 = RandomForestClassifier()
#  モデルの学習
model1.fit(df_train,y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [11]:
# df_test(idなしのテストデータ)に対するモデルの分類予測結果
y_pred = model1.predict(dummie_data_test)
print(y_pred)

[1 0 0 ... 0 1 1]


In [12]:
# # 予想データをdfに入れる
# df = pd.Series(y_pred)
# # testデータで切り出したidの部分をdfの1行目の部分をindexにする。
# df.index = df_id_test
# # csvファイルに保存,格納する。
# df.to_csv("original.csv", header=False)