### National Chengchi University
### Department of Computer Science
### Introduction to Machine Learning
#### Term Project 2023
#### Simple Example
####
#### prepared by Chao-Lin Liu
#### Date: 22 November 2023

In [6]:
# 參考網址：https://scikit-learn.org/stable/modules/tree.html
# 參考網址：https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [5]:
# !pip install scikit-learn

In [4]:
# 升級 imbalanced-learn
#!pip install --upgrade imbalanced-learn


In [5]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel

In [6]:
# Input CSV locations, relative to the notebook's working directory.
trainfile, testfile = 'training_data.csv', 'testing_data.csv'
# Optional file with ground-truth labels for the test set (currently unused):
# testanswers = 'test_nov28_task1_features_answers.csv'

In [7]:
# Load the training data and preview its first five rows.
df = pd.read_csv(trainfile)
df.head(5)

Unnamed: 0.1,Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,label
0,0,7.809,-4.121,-2.166,-4.045,,-2.472,0.007,-2.081,,-1.128,-0.521,-0.569,-0.985,-0.048,-8.003247,-4.806843,4.148356,A
1,1,2.134,-1.906,-0.834,,-2.064,-1.704,1.233,0.42,0.204,-2.506,-0.914,-1.456,2.29,-2.858,17.200422,5.29226,-14.406137,A
2,2,7.471,-0.613,-1.064,7.556,-1.755,-2.192,3.216,1.288,-3.782,-2.145,-1.701,1.904,,-2.389,19.90057,6.041156,,A
3,3,-4.281,-0.603,1.108,0.021,-1.659,-0.71,0.458,,-1.01,2.847,0.317,0.121,-1.123,-1.144,-0.012258,-0.583383,0.149031,B
4,4,,4.037,-6.119,-0.283,-1.032,2.285,-0.816,-0.32,2.272,-1.083,1.182,-1.877,0.308,-0.802,-2.59207,,4.365393,G


In [8]:
# Pick out the feature columns x0 .. x16 and preview the first five rows.
features_list = [f"x{i}" for i in range(17)]
features = df.loc[:, features_list]
features.head(5)

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16
0,7.809,-4.121,-2.166,-4.045,,-2.472,0.007,-2.081,,-1.128,-0.521,-0.569,-0.985,-0.048,-8.003247,-4.806843,4.148356
1,2.134,-1.906,-0.834,,-2.064,-1.704,1.233,0.42,0.204,-2.506,-0.914,-1.456,2.29,-2.858,17.200422,5.29226,-14.406137
2,7.471,-0.613,-1.064,7.556,-1.755,-2.192,3.216,1.288,-3.782,-2.145,-1.701,1.904,,-2.389,19.90057,6.041156,
3,-4.281,-0.603,1.108,0.021,-1.659,-0.71,0.458,,-1.01,2.847,0.317,0.121,-1.123,-1.144,-0.012258,-0.583383,0.149031
4,,4.037,-6.119,-0.283,-1.032,2.285,-0.816,-0.32,2.272,-1.083,1.182,-1.877,0.308,-0.802,-2.59207,,4.365393


In [9]:
# Handle missing values: learn each column's mean, then fill the gaps with it.
imp = SimpleImputer(strategy="mean").fit(features)
features = imp.transform(features)

In [10]:
# Extract the answer (class) column; `targets` is a list so `labels` stays a DataFrame.
targets = ['label']
labels = df.loc[:, targets]

In [7]:
# Safety net: if `features` still contains NaN or infinite values, impute them
# with the per-column mean.
if not np.all(np.isfinite(features)):
    imp = SimpleImputer(strategy="mean")
    features = imp.fit_transform(features)

# Drop samples whose label is missing.
# The rows must be removed from `features` and `df` as well as from `labels`:
# dropping them from `labels` alone leaves X and y with different lengths, and the
# later feature-selection / SMOTE / training steps would fail (or pair rows with
# the wrong labels).
labeled_rows = labels.notnull().all(axis=1).to_numpy()
if not labeled_rows.all():
    features = features[labeled_rows]
    labels = labels[labeled_rows]
    # Later cells re-select columns from `df`, so keep it aligned too.
    df = df[labeled_rows]

In [11]:
# Feature selection: keep the features a random forest ranks as important
# (SelectFromModel's default importance threshold).
# The forest is seeded so that the selected columns -- and therefore everything
# downstream -- are identical on every Restart & Run All. The seed matches the
# one used for SMOTE in this notebook.
sel = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))
sel.fit(features, labels.values.ravel())
selected_feat = [f for (f, s) in zip(features_list, sel.get_support()) if s]
# Re-select the chosen columns from the raw frame; the following cell checks
# these columns for invalid values and imputes them.
features = df[selected_feat]

In [12]:
# Check whether the selected features contain NaN or infinite values.
if not np.all(np.isfinite(features)):
    print("Invalid values detected in features:")
    print(np.where(~np.isfinite(features)))

    # Fill them using the per-column mean.
    imp = SimpleImputer(strategy="mean")
    features = imp.fit_transform(features)

    # Verify the result of the imputation and report it.
    if np.all(np.isfinite(features)):
        print("All invalid values in features have been successfully imputed.")
    else:
        print("Invalid values still present in features after imputation:")
        print(np.where(~np.isfinite(features)))

Invalid values detected in features:
(array([   0,    4,    8, ..., 5271, 5271, 5272], dtype=int64), array([3, 0, 2, ..., 2, 3, 1], dtype=int64))
All invalid values in features have been successfully imputed.


In [13]:
# Balance the classes by oversampling with SMOTE (fixed seed).
# After this cell `labels` is the flat array returned by SMOTE, no longer a DataFrame.
sm = SMOTE(random_state=42)
features, labels = sm.fit_resample(features, labels.to_numpy().ravel())

In [14]:
# Use a random forest classifier.
# random_state is fixed (same seed as SMOTE above) so the trained model and the
# submitted predictions are reproducible between runs.
clf = RandomForestClassifier(n_estimators=100, random_state=42)


In [15]:
# Train the classifier on the resampled training set.
clf = clf.fit(X=features, y=labels)

In [16]:
# Load the test data and preview its first five rows.
df2 = pd.read_csv(testfile)
df2.head(5)

Unnamed: 0.1,Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16
0,3371,-1.266,1.071,3.441,-0.864,,-4.152,,,-1.641,,,-3.637,2.839,-1.98,-2.310983,-4.246461,2.438406
1,1742,8.488,-5.545,-1.512,-2.425,0.505,,0.684,-2.239,1.167,0.471,-0.694,-0.975,,0.167,-4.665127,-2.264956,2.153424
2,6949,2.961,9.638,-3.275,3.086,2.766,-1.866,1.001,-0.171,0.294,2.402,-3.198,-0.845,-1.404,0.801,10.888652,3.100296,-12.917832
3,1377,1.757,,0.838,2.641,-0.813,-3.531,-0.879,-0.066,-0.886,1.155,-1.761,0.344,4.473,1.534,,,-10.324148
4,3535,-1.579,0.026,-1.562,-0.467,-3.195,2.306,0.774,0.508,-2.336,0.891,,-2.642,2.034,-1.193,-1.838944,1.058983,2.195671


In [17]:
# Take the test-set features.
# Only the columns chosen by feature selection are used: `clf` was trained on
# `selected_feat`, so passing all of `features_list` would make clf.predict()
# fail with a feature-count mismatch.
features2 = df2[selected_feat]
# If the test file contained the true answers, they could be read like this:
# target2 = df2[targets]

In [17]:

# Impute missing test values with statistics learned from the TRAINING data.
# Fitting a fresh imputer on the test set would fill test rows with test-set
# means, so train and test would be preprocessed differently and test-set
# statistics would leak into the pipeline.
# The imputer is fitted on a DataFrame so that `imp.feature_names_in_` is available
# to later cells.
imp = SimpleImputer(strategy="mean").fit(df[selected_feat])
# Select the model's input columns directly from the raw test frame so this cell
# yields the right shape regardless of how `features2` was built earlier.
features2 = imp.transform(df2[selected_feat])

# features2 is now a NaN-free array with one column per selected feature.


In [18]:
# NOTE(review): this cell assumes `imp` is an already-fitted SimpleImputer that was
# fitted on a DataFrame (so `feature_names_in_` exists) whose columns match those of
# `features2` -- confirm against the cells above.

# Re-wrap features2 in a DataFrame carrying the imputer's feature names, so that
# transform() sees named columns and does not warn about missing feature names,
# then apply the imputer.
features2 = imp.transform(pd.DataFrame(features2, columns=imp.feature_names_in_))


In [19]:
# Predict the class of every test sample.
# The output can be tidied up and uploaded to Kaggle.
pred = clf.predict(features2)
# Use a dedicated name: rebinding `df` here would clobber the training DataFrame
# that earlier cells read, breaking any re-run of those cells.
submission = pd.DataFrame(pred, columns=['label'])
# Write the predictions to CSV; the row index is written as the 'id' column.
submission.to_csv('sample_submission.csv', index=True, index_label='id')

In [None]:
# If the test file contains the true answers, compute the classification accuracy.
# `target2` was never defined (its assignment is commented out above), so the bare
# call raised NameError. Read the answers from df2 only when the label column(s)
# are actually present; otherwise the result is None and nothing is displayed.
test_accuracy = (
    clf.score(features2, df2[targets].values.ravel())
    if set(targets) <= set(df2.columns)
    else None
)
test_accuracy