### National Chengchi University
### Department of Computer Science
### Introduction to Machine Learning
#### Term Project 2023
#### Simple Example
####
#### prepared by Chao-Lin Liu
#### Date: 22 November 2023

In [6]:
# 參考網址：https://scikit-learn.org/stable/modules/tree.html
# 參考網址：https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [5]:
# !pip install scikit-learn

In [4]:
# 升級 imbalanced-learn
#!pip install --upgrade imbalanced-learn


In [37]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel

In [38]:
trainfile = 'training_data.csv'
testfile = 'testing_data.csv'
# testanswers = 'test_nov28_task1_features_answers.csv'

In [39]:
# 讀入訓練資料檔案
df = pd.read_csv(trainfile)
df = df.drop('x1', axis=1)
df[0:5]

Unnamed: 0,id,x0,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,label
0,0,20.425,-1.652,0.316,-0.314,-0.369,1.042,-0.129,1.388,-0.262,0.103,-0.453,1.317,0.204,0.127,G
1,1,-13.343,-8.962,-0.472,0.729,-1.48,-2.592,,2.0,1.0,1.04,0.242,0.949,-2.607,0.119,F
2,2,-7.947,16.805,,0.509,,0.918,,5.0,4.0,1.011,-0.145,0.114,0.149,-0.219,A
3,3,-2.933,-2.21,,-0.112,0.368,-0.788,-0.016,-1.628,-1.035,,-0.521,1.492,-0.404,0.032,L
4,4,,-13.014,-1.427,-0.349,,-2.207,1.593,3.0,3.0,-0.135,0.006,,0.222,0.1,B


In [40]:
# 取用 features
features_list = ['x0','x2','x3','x4','x5','x6','x7','x8','x9',"x10","x11","x12","x13",'x14']
features = df[features_list]
features[0:5]

Unnamed: 0,x0,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14
0,20.425,-1.652,0.316,-0.314,-0.369,1.042,-0.129,1.388,-0.262,0.103,-0.453,1.317,0.204,0.127
1,-13.343,-8.962,-0.472,0.729,-1.48,-2.592,,2.0,1.0,1.04,0.242,0.949,-2.607,0.119
2,-7.947,16.805,,0.509,,0.918,,5.0,4.0,1.011,-0.145,0.114,0.149,-0.219
3,-2.933,-2.21,,-0.112,0.368,-0.788,-0.016,-1.628,-1.035,,-0.521,1.492,-0.404,0.032
4,,-13.014,-1.427,-0.349,,-2.207,1.593,3.0,3.0,-0.135,0.006,,0.222,0.1


In [41]:
# 處理缺失值
imp = SimpleImputer(strategy="mean")
features = imp.fit_transform(features)

In [42]:
# 取用 答案類別
targets = ['label']
labels = df[targets]

In [43]:
# 檢查 features 中是否有 NaN 或無窮大的數值
if not np.all(np.isfinite(features)):
    # 如果有，使用 SimpleImputer 來填充這些值
    imp = SimpleImputer(strategy="mean")
    features = imp.fit_transform(features)

# 使用 pandas 的 isnull() 函數來檢查 DataFrame 中的 NaN 值
if labels.isnull().values.any():
    # 移除含有 NaN 的行
    labels = labels.dropna()

In [44]:
# 特徵選擇
sel = SelectFromModel(RandomForestClassifier(n_estimators = 100))
sel.fit(features, labels.values.ravel())
selected_feat= [f for (f, s) in zip(features_list, sel.get_support()) if s]
features = df[selected_feat]

In [45]:
# 檢查 features 中是否有 NaN 或無窮大的數值
if not np.all(np.isfinite(features)):
    print("Invalid values detected in features:")
    print(np.where(~np.isfinite(features)))

    # 如果有，使用 SimpleImputer 來填充這些值
    imp = SimpleImputer(strategy="mean")
    features = imp.fit_transform(features)

    # 再次檢查 features 中是否有 NaN 或無窮大的數值
    if not np.all(np.isfinite(features)):
        print("Invalid values still present in features after imputation:")
        print(np.where(~np.isfinite(features)))
    else:
        print("All invalid values in features have been successfully imputed.")

Invalid values detected in features:
(array([   2,    2,    3, ..., 8932, 8935, 8936], dtype=int64), array([2, 4, 2, ..., 2, 0, 4], dtype=int64))
All invalid values in features have been successfully imputed.


In [46]:
# 處理不平衡的資料集
sm = SMOTE(random_state=42)
features, labels = sm.fit_resample(features, labels.values.ravel())


In [47]:
# 使用隨機森林分類器
clf = RandomForestClassifier(n_estimators=100)


In [48]:
param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}


In [49]:
# 初始化 GridSearchCV
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3, n_jobs=-1)

In [None]:
# 進行訓練
grid_search.fit(features, labels)


In [32]:
# 讀入測試資料檔案
df2 = pd.read_csv(testfile)
df2 = df2.drop('x1', axis=1)
df2[0:5]

Unnamed: 0,id,x0,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14
0,6976,,,-0.085,1.515,,-3.207,-0.492,,,-0.663,,-0.29,,0.746
1,11281,20.933,-3.532,,-0.577,-0.574,,0.374,,,-0.481,0.073,-0.942,0.185,0.502
2,8387,-4.319,19.723,-1.067,,,,-0.351,0.648,0.158,0.415,-0.53,-0.213,0.084,0.329
3,1320,-3.867,19.948,,0.176,0.67,-0.459,,0.395,1.0,,,-0.562,,0.35
4,9334,21.244,-4.065,,-0.823,-0.991,-0.168,-0.125,-0.096,-0.343,,0.314,-0.024,-0.321,


In [33]:
# 取用測試檔案的 features
features2 = df2[features_list]
# 假設測試檔案包含正確答案
# target2 = df2[targets]

In [34]:

# 假設 features2 是你的測試數據
# 檢查 features2 中是否有 NaN 或無窮大的數值
if not np.all(np.isfinite(features2)):
    # 如果有，使用 SimpleImputer 來填充這些值
    imp = SimpleImputer(strategy="mean")
    features2 = imp.fit_transform(features2)

# 現在你可以安全地使用 features2 進行後續操作


In [35]:
# 假設 imp 是已經用訓練資料集 fit 過的 SimpleImputer 實例
# features2 是你要轉換的新資料集，它應該是一個 DataFrame 而不是 numpy array

# 確保 features2 是一個 DataFrame 並且包含特徵名稱
features2 = pd.DataFrame(features2, columns=imp.feature_names_in_)

# 現在你可以使用 imp 來轉換 features2 而不會收到警告訊息
features2 = imp.transform(features2)


In [36]:
# 印出測試資料的預測類別
# 可以整理這裡的輸出上傳到 kaggle
# 使用 SelectFromModel 來選擇測試資料的特徵
features2 = sel.transform(features2)
pred = grid_search.predict(features2)
df = pd.DataFrame(pred, columns=['label'])
# 將 DataFrame 寫入 CSV 檔案
df.to_csv('sample_submission.csv', index=True, index_label='id')

In [None]:
# 如果測試檔案內涵正確答案，直接計算分類結果的 accuracy
clf.score(features2, target2)