<a href="https://colab.research.google.com/github/Kmatsu-tokudai/YSEC/blob/master/3_15_3_20.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Upload the Kaggle API credentials file (kaggle.json) through the Colab
# upload widget.
from google.colab import files
files.upload() # upload kaggle.json
# Create ~/.kaggle, move the uploaded file into it and restrict the file to
# owner read/write.
# NOTE(review): `mv` targets ~/.kaggle while `chmod` targets /root/.kaggle;
# these are the same directory only when the notebook runs as root -- confirm
# before running outside Colab.
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


# 3.15～3.20

### プログラム3.15 SelectKBestとSelectPercentileによる特徴選択
#### フィルター法としてカイ二乗検定、ANOVAを使って特徴選択

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import \
RandomForestClassifier as RandomForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# 上位k個の特徴量を選択するクラス、
# 上位percentileパーセントの特徴量を選択するクラス
from sklearn.feature_selection import SelectKBest, \
 SelectPercentile
# カイ二乗検定、分散分析のモジュールをインポート
from sklearn.feature_selection import chi2, f_classif
# ピアソンの積率相関係数のモジュールをインポート
from scipy.stats import pearsonr
import warnings
warnings.filterwarnings('ignore')

# Pearson product-moment correlation coefficient
def Pearson_corr_coeff(X, y):
    """Score every column of ``X`` by its Pearson correlation with ``y``.

    The return value has the ``(scores, pvalues)`` shape that
    ``SelectKBest`` / ``SelectPercentile`` accept from a ``score_func``.

    Args:
        X: 2-D array-like, one column per feature.
        y: 1-D array-like target with one entry per row of ``X``.

    Returns:
        Tuple ``(scores, pvalues)`` of 1-D float arrays with one entry per
        column of ``X``. Scores are the signed correlation coefficients, so
        negatively correlated features rank lowest. Whatever ``pearsonr``
        yields for a column (including NaN) is passed through unchanged.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    scores = []
    pvalues = []
    for col in range(X.shape[1]):
        # pearsonr returns (statistic, pvalue); unpacking works for both the
        # plain-tuple and the result-object flavours of the return value.
        statistic, pvalue = pearsonr(y, X[:, col])
        scores.append(statistic)
        pvalues.append(pvalue)
    return np.array(scores, dtype=float), np.array(pvalues, dtype=float)

# Data preparation
def prepare():
    """Download the mushroom-classification dataset and unzip it.

    Both steps are IPython shell escapes (``!``), so this function only
    works inside a notebook / IPython session with the kaggle CLI available.
    """
    !kaggle datasets download -d \
    uciml/mushroom-classification
    !unzip mushroom-classification.zip
    
# Convert a categorical column to integer codes
def replace_to_digit(dff):
    """Encode the values of a column as 1-based integer codes.

    Codes follow the sorted order of the distinct values, so the smallest
    distinct value maps to 1.

    Args:
        dff: pandas Series (anything exposing ``.values.tolist()``) whose
            values are hashable and mutually sortable.

    Returns:
        List of ints, one per element of ``dff``, in the original order.
    """
    raw = dff.values.tolist()
    # Build the value -> code table once; a list.index() call per element
    # would rescan the category list for every row.
    code_of = {cat: code
               for code, cat in enumerate(sorted(set(raw)), start=1)}
    return [code_of[v] for v in raw]

# Preprocessing (drop rows with missing values, encode categorical columns)
def preprocess():
    """Load ``mushrooms.csv`` and turn it into numeric training arrays.

    Cells equal to ``'?'`` are read as missing and every row containing one
    is dropped. The ``class`` column becomes the target (``'p'`` -> 1,
    ``'e'`` -> 0); every other column is integer-encoded with
    ``replace_to_digit``.

    Returns:
        Tuple ``(X_train, y_train, features)``: the 2-D feature array, the
        1-D target array, and the list of feature column names in column
        order.
    """
    # Read '?' as a real NaN. Replacing it with the string 'NaN' (as the
    # previous code did) leaves nothing for dropna() to remove.
    df = pd.read_csv('mushrooms.csv', na_values='?')
    df = df.dropna(how='any')
    df['class'] = df['class'].map({'p': 1, 'e': 0}).astype(int)
    features = [c for c in df.columns.values if c != 'class']
    for c in features:
        df[c] = replace_to_digit(df[c])
    X_train = df.loc[:, features].values
    y_train = df.loc[:, ['class']].values.ravel()
    return X_train, y_train, features

# Feature selection with SelectKBest
def select_feature(selector_type,
                   n_features, X_train, y_train, features):
    """Select the ``n_features`` best features and report their scores.

    Args:
        selector_type: score function handed to ``SelectKBest``.
        n_features: value passed as ``k`` to ``SelectKBest``.
        X_train: 2-D feature array.
        y_train: 1-D target array.
        features: feature names, aligned with the columns of ``X_train``.

    Returns:
        Names of the features that the selector kept and whose score is
        strictly positive, in their original column order. The list is empty
        when nothing qualifies.
    """
    selector = SelectKBest(score_func=selector_type, k=n_features)
    selector.fit(X_train, y_train)

    # Use the selector's own mask rather than re-deriving the top-k by
    # sorting the scores: the two can disagree on ties, and NaN scores make
    # a plain sort unreliable. `score > 0` is False for NaN, so undefined
    # scores are filtered out here as well.
    kept = [
        (score, name)
        for score, name, chosen in zip(selector.scores_, features,
                                       selector.get_support())
        if chosen and score > 0
    ]

    # Report in descending score order.
    for rank, (score, name) in enumerate(sorted(kept, reverse=True),
                                         start=1):
        print('[%d]: %s\t%.2lf' % (rank, name, score))

    # `kept` was built in column order, so this preserves the original
    # feature ordering for the caller's column lookup.
    return [name for _, name in kept]

# Feature selection with SelectPercentile
def select_feature_percentile(selector_type,
                              percentile,
                              X_train, y_train,
                              features):
    """Select the top ``percentile`` percent of features and report scores.

    Args:
        selector_type: score function handed to ``SelectPercentile``.
        percentile: value passed as ``percentile`` to ``SelectPercentile``.
        X_train: 2-D feature array.
        y_train: 1-D target array.
        features: feature names, aligned with the columns of ``X_train``.

    Returns:
        Names of the features that the selector kept and whose score is
        strictly positive, in their original column order. The list is empty
        when nothing qualifies.
    """
    selector = SelectPercentile(score_func=selector_type,
                                percentile=percentile)
    selector.fit(X_train, y_train)

    # Trust the selector's mask instead of counting the selected features
    # and slicing a separately sorted score list: ties and NaN scores can
    # make those two views disagree. `score > 0` is False for NaN.
    kept = [
        (score, name)
        for score, name, chosen in zip(selector.scores_, features,
                                       selector.get_support())
        if chosen and score > 0
    ]

    # Report in descending score order.
    for rank, (score, name) in enumerate(sorted(kept, reverse=True),
                                         start=1):
        print('[%d]: %s\t%.2lf' % (rank, name, score))

    # `kept` is already in column order.
    return [name for _, name in kept]

def main():
    """Compare filter-style feature selectors on the mushroom data.

    For each score function (chi-squared, ANOVA F-value, Pearson
    correlation) features are selected first with ``SelectKBest`` and then
    with ``SelectPercentile``. Each time a random forest is trained on the
    selected columns of the training split and a classification report is
    printed for the test split.
    """
    # prepare()  # uncomment to download and unzip the dataset first
    X_train, y_train, features = preprocess()
    target_names = ['edible', 'poisonous']

    print('[Original Features]\n%s' % '\n'.join(features))
    # Score functions to compare
    selectors = {'chi2': chi2,
                 'ANOVA': f_classif,
                 'Pearson': Pearson_corr_coeff}

    X_tr, X_te, y_tr, y_te = train_test_split(
        X_train, y_train, train_size=0.7, random_state=0)
    df_sel = pd.DataFrame(X_tr, columns=features)
    df_sel_te = pd.DataFrame(X_te, columns=features)

    def evaluate(s_f):
        # Train and evaluate a random forest on the selected columns only.
        if not s_f:
            # A forest cannot be fitted on zero columns.
            print('No feature selected; skipping evaluation.')
            return
        X_tr_sel = df_sel.loc[:, s_f].values
        X_te_sel = df_sel_te.loc[:, s_f].values
        rf = RandomForest(n_estimators=100,
                          max_depth=4, random_state=0)
        rf.fit(X_tr_sel, y_tr)
        y_pred = rf.predict(X_te_sel)
        print(classification_report(y_te, y_pred,
                                    target_names=target_names,
                                    zero_division=1))

    # SelectKBest: keep the n_features best features
    n_features = 5
    for selN, selector_type in selectors.items():
        print('\n-*-*-*- Select by %s -*-*-*-' % selN)
        evaluate(select_feature(selector_type,
                                n_features, X_tr, y_tr, features))

    # SelectPercentile: keep the top `percentile` percent of features.
    # Selection is fitted on the training split only (as in the SelectKBest
    # loop) so the test rows do not influence which features are chosen, and
    # the forest is trained on the selected columns rather than on all of
    # them.
    percentile = 5
    for selN, selector_type in selectors.items():
        print('\n-*-*-*- Select by %s -*-*-*-' % selN)
        evaluate(select_feature_percentile(selector_type,
                                           percentile, X_tr, y_tr,
                                           features))

if __name__ == '__main__':
    main()


[Original Features]
cap-shape
cap-surface
cap-color
bruises
odor
gill-attachment
gill-spacing
gill-size
gill-color
stalk-shape
stalk-root
stalk-surface-above-ring
stalk-surface-below-ring
stalk-color-above-ring
stalk-color-below-ring
veil-type
veil-color
ring-number
ring-type
spore-print-color
population
habitat

-*-*-*- Select by chi2 -*-*-*-
[1]: gill-color	3504.00
[2]: ring-type	897.99
[3]: stalk-root	430.39
[4]: habitat	295.17
[5]: gill-size	276.53
              precision    recall  f1-score   support

      edible       0.96      0.96      0.96      1272
   poisonous       0.96      0.96      0.96      1166

    accuracy                           0.96      2438
   macro avg       0.96      0.96      0.96      2438
weighted avg       0.96      0.96      0.96      2438


-*-*-*- Select by ANOVA -*-*-*-
[1]: gill-size	2399.59
[2]: gill-color	2264.93
[3]: bruises	1871.94
[4]: stalk-root	930.28
[5]: gill-spacing	806.29
              precision    recall  f1-score   support

      edible

### プログラム3.16 相互情報量(MI)を使った特徴選択

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
# 相互情報量を計算するメソッドをインポート
from sklearn.feature_selection import mutual_info_classif

# Data preparation
def prepare():
    """Download the kpmi-mbti-mod-test dataset and unzip it.

    Both steps are IPython shell escapes (``!``), so this function only
    works inside a notebook / IPython session with the kaggle CLI available.
    """
    !kaggle datasets download -d pmenshih/kpmi-mbti-mod-test 
    !unzip kpmi-mbti-mod-test.zip

# Preprocessing (restrict to the columns used, split into train/test)
def preprocess():
    """Load the MBTI survey data and split it into train and test sets.

    Reads ``kpmi_data.csv`` (semicolon separated), uses the ``satisfied``
    column as the target and the eight ``scale_*`` columns as features,
    prints the feature frame, and performs a 90/10 split with
    ``random_state=0``. No scaling is applied.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test, scales)`` where
        ``scales`` is the list of feature column names.
    """
    raw = pd.read_csv('kpmi_data.csv', sep=';')
    # Target: job satisfaction (the original note says yes: 1, no: 0)
    y = raw['satisfied'].values
    # Features: the MBTI scale scores
    scales = ['scale_e', 'scale_i', 'scale_s', 'scale_n',
              'scale_t', 'scale_f', 'scale_j', 'scale_p']
    df = raw[scales]
    print(df)
    X = df.values
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, random_state=0, train_size=0.9)
    return x_train, x_test, y_train, y_test, scales

# Train a random forest and report its classification quality
def predict_satisfaction(x_train, x_test, y_train, y_test):
    """Fit a random forest and print accuracy plus a per-class report.

    Args:
        x_train: training features.
        x_test: test features.
        y_train: training targets.
        y_test: test targets, reported under the class names 'no' / 'yes'.
    """
    model = RandomForestClassifier(max_depth=4, random_state=2)
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    accuracy = accuracy_score(y_test, predictions)
    print('Accuracy = {:.3f}'.format(accuracy))
    print(classification_report(y_test, predictions,
                                target_names=['no', 'yes']))
# Feature selection by mutual information (MI)
def select_feature_by_MI(x_train, x_test,
                         y_train, y_test,
                         scales, n_features):
    """Keep the ``n_features`` features with the highest mutual information.

    The selector is fitted on the training data only and then applied to
    both splits. The name of every selected feature is printed.

    Args:
        x_train: training features, columns aligned with ``scales``.
        x_test: test features, same column layout as ``x_train``.
        y_train: training targets.
        y_test: unused; kept so the signature stays unchanged.
        scales: feature names, one per column.
        n_features: number of features to keep.

    Returns:
        Tuple ``(x_train, x_test)`` reduced to the selected columns, in
        their original column order.
    """
    from functools import partial

    # mutual_info_classif draws random numbers internally; fixing its
    # random_state makes the selected feature set reproducible across runs.
    score_func = partial(mutual_info_classif, random_state=0)
    selector = SelectKBest(score_func, k=n_features).fit(x_train, y_train)

    for name, chosen in zip(scales, selector.get_support()):
        if chosen:
            print('Selected Feature - {}'.format(name))

    # The fitted selector picks the same columns from both splits.
    return selector.transform(x_train), selector.transform(x_test)

def main():
    """Compare a random forest with and without MI-based feature selection.

    Downloads and loads the data, evaluates the forest on all features,
    then on the three features chosen by ``select_feature_by_MI``.
    """
    prepare()
    x_train, x_test, y_train, y_test, scales = preprocess()

    # Baseline: every feature
    print('- 特徴選択無し [%d個の特徴量] -' % len(scales))
    predict_satisfaction(x_train, x_test, y_train, y_test)

    # Reduced feature set chosen by mutual information
    n_features = 3
    print('- MIによる特徴選択 [%d個の特徴量] -' % n_features)
    x_train_sel, x_test_sel = select_feature_by_MI(
        x_train, x_test, y_train, y_test, scales, n_features)
    predict_satisfaction(x_train_sel, x_test_sel, y_train, y_test)

if __name__ == '__main__':
    main()


Downloading kpmi-mbti-mod-test.zip to /content
  0% 0.00/985k [00:00<?, ?B/s]
100% 985k/985k [00:00<00:00, 64.2MB/s]
Archive:  kpmi-mbti-mod-test.zip
  inflating: kpmi_data.csv           
  inflating: kpmi_key.json           
  inflating: questionnaire_schema.json  


  exec(code_obj, self.user_global_ns, self.user_ns)


       scale_e  scale_i  scale_s  scale_n  scale_t  scale_f  scale_j  scale_p
0            5       26       25       12       24        8       27       10
1           16       22       17       16       23       11       22       15
2           28        6       14       18       20       16       20       15
3           28       10       22       10       16       14       22       12
4           24       10       16       17       13       23       31        3
...        ...      ...      ...      ...      ...      ...      ...      ...
21841       25       15       17       20       13       13       19       15
21842       26        6       20       18       11       21       22       11
21843       16       21       22       11       28        5       25        9
21844       21       13       12       17        8       30       30        5
21845       22       16       22       15       29        6       29        6

[21846 rows x 8 columns]
- 特徴選択無し [8個の特徴量] -
Accuracy = 0.695
 

### プログラム3.17 SVM-RFEを用いた特徴選択

In [None]:
import pandas as pd 
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
# 正規化のモジュールをインポート
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report as clf_report
from sklearn.impute import SimpleImputer

# データの準備
def prepare():
    !kaggle datasets download -d \
dileep070/heart-disease-prediction-using-logistic-regression
    !unzip \
    heart-disease-prediction-using-logistic-regression.zip
    
    df = pd.read_csv('framingham.csv')
    # 心臓病データセットのデータを使用
    # 10年後の冠状動脈性心臓病CHDの
    # 発症リスクの有無(0:無, 1:有)
    features = []
    # 分類に使用する特徴量（最後の列以外すべて）
    for i, f in enumerate(df.columns.values ):
      if i != len(df.columns)-1:
        features.append(f)
  
    X_train = df.loc[:,features].values
    y_train = df['TenYearCHD'].values
    return X_train, y_train, features

# Preprocessing (mean imputation of missing values, min-max scaling)
def preprocess(X_train):
    """Impute NaNs with the column mean, then apply ``MinMaxScaler``.

    Both transformers are fitted on the array passed in.

    Args:
        X_train: 2-D numeric array, possibly containing NaN.

    Returns:
        The imputed and scaled array.
    """
    imputed = SimpleImputer(missing_values=np.nan,
                            strategy='mean').fit_transform(X_train)
    return MinMaxScaler().fit_transform(imputed)

# Feature selection and training with RFE
def select_by_rfe(n_features,features,
    X_train,y_train,X_test,y_test):
    """Run RFE around a linear SVC, then report predictions and features.

    Fits the RFE wrapper on the training data, prints the number of
    selected features and a classification report for the test data, and
    finally lists the first ``n_features`` entries of the features sorted by
    their RFE rank.

    Args:
        n_features: number of features RFE should keep.
        features: feature names, aligned with the columns of ``X_train``.
        X_train, y_train: training data.
        X_test, y_test: evaluation data.
    """
    estimator = SVC(kernel='linear', gamma=1/2,
                    C=1.0, class_weight='balanced', random_state=0)
    rfe = RFE(estimator, n_features_to_select=n_features,
              step=10, verbose=1)
    rfe.fit(X_train, y_train)
    predicted = rfe.predict(X_test)
    print("RFE + SVC", rfe.n_features_)
    print(clf_report(y_test, predicted, digits=3))

    # Ascending (rank, name) order puts the lowest rank values first.
    ranked = sorted(zip(rfe.ranking_, features))
    for position, (rank, name) in enumerate(ranked[:n_features], start=1):
        print('[%d]: %s\t%.2lf' % (position, name, rank))

def main():
    """Compare a linear SVC on all features with SVM-RFE feature selection.

    NOTE(review): ``preprocess`` is applied to the whole data set before the
    train/test split, so the imputer and scaler also see the test rows.
    """
    # Number of features RFE should keep
    n_features = 8
    X_all, y_all, features = prepare()
    X_all = preprocess(X_all)
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all, random_state=0, train_size=0.8)

    # Baseline: SVC without feature selection
    print('- training SVM with default parameters -')
    baseline = SVC(kernel='linear', gamma=1/2,
                   C=1.0, class_weight='balanced', random_state=0)
    pred = baseline.fit(X_train, y_train).predict(X_test)
    print(clf_report(y_test, pred))

    # SVM-RFE: select n_features features, then train and classify
    print('-*-*-*- Select by SVM-RFE -*-*-*-')
    select_by_rfe(n_features, features, X_train, y_train,
                  X_test, y_test)

if __name__ == '__main__':
    main()



Downloading heart-disease-prediction-using-logistic-regression.zip to /content
  0% 0.00/58.4k [00:00<?, ?B/s]
100% 58.4k/58.4k [00:00<00:00, 21.7MB/s]
Archive:  heart-disease-prediction-using-logistic-regression.zip
  inflating: framingham.csv          
- training SVM with default parameters -
              precision    recall  f1-score   support

           0       0.92      0.66      0.77       710
           1       0.28      0.70      0.40       138

    accuracy                           0.67       848
   macro avg       0.60      0.68      0.59       848
weighted avg       0.81      0.67      0.71       848

-*-*-*- Select by SVM-RFE -*-*-*-
Fitting estimator with 15 features.
RFE + SVC 8
              precision    recall  f1-score   support

           0      0.922     0.662     0.770       710
           1      0.290     0.710     0.412       138

    accuracy                          0.670       848
   macro avg      0.606     0.686     0.591       848
weighted avg      0.8

### プログラム3.18 Borutaを用いた特徴選択

In [None]:
# BorutaPyのインストール
!pip install Boruta
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report as clf_report
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

# データの準備
def prepare():
    !kaggle datasets download -d dipam7/student-grade-prediction
    !unzip student-grade-prediction.zip
    # ポルトガル語学校の学生の成績の予測データセット
    df = pd.read_csv('student-mat.csv')
    # 欠損値を除去
    df = df.dropna()
    # 性別を数値に変換
    df['sex'] = df['sex'].map({"F": 0, "M": 1}).astype(int)
    # 使用する特徴量
    features = ['sex', 'age','Medu', 'Fedu', 'traveltime', 
                'studytime', 'failures', 'famrel', 
                 'freetime', 'goout', 'Dalc',
                 'Walc', 'health', 'absences']
    X = df.loc[:,features].values
    #　成績ラベルG2（0 to 20)
    y = df['G2'].ravel()
    # ビニングにより成績を2クラスに変換
    bins = [-1, 10, 20]
    labels = ['bad', 'good']
    y_cut = pd.cut(y, bins=bins, labels=labels)
    print(y_cut)
    y = [c for c in y_cut.codes]

    return X, y, features, labels

# Feature selection with Boruta
def feature_select_by_Boruta(rfc, X, y, features):
    """Fit a ``BorutaPy`` selector and print its verdict per feature.

    Args:
        rfc: estimator handed to ``BorutaPy``.
        X: 2-D feature array.
        y: targets.
        features: feature names, aligned with the columns of ``X``.

    Returns:
        Tuple ``(X_filtered, feat_selector)``: ``X`` passed through the
        fitted selector's ``transform``, and the fitted selector itself.
    """
    feat_selector = BorutaPy(rfc, n_estimators='auto',
                             verbose=0, random_state=1)
    feat_selector.fit(X, y)

    # Names of the features flagged in support_
    print('=====Selected Features=====')
    for name, keep in zip(features, feat_selector.support_):
        if keep:
            print('%s' % name)

    # All features ordered by ranking_ (stable: ties keep column order)
    print('======Feature Ranking======')
    by_rank = sorted(enumerate(feat_selector.ranking_),
                     key=lambda pair: pair[1])
    for idx, rank in by_rank:
        print('[%d]\t%s' % (rank, features[idx]))

    # Reduce the data to what the selector's transform keeps
    return feat_selector.transform(X), feat_selector
def main():
    """Compare a random forest on all features vs. Boruta-selected ones.

    Splits the data 80/20, runs Boruta on the training split, then trains
    and evaluates one forest on all features and one on the selected
    features, printing a classification report for each.
    """
    X, y, features, target_names = prepare()
    X, X_test, y, y_test = train_test_split(
        X, y, random_state=0, train_size=0.8)

    def new_forest():
        # One identically configured, untouched forest per use.
        return RandomForestClassifier(n_jobs=-1,
                                      class_weight='balanced',
                                      max_depth=5)

    # Feature selection with Boruta. BorutaPy reconfigures the estimator it
    # is given (it sets n_estimators / random_state on it), so it gets its
    # own instance and both evaluation forests below start from the same
    # settings.
    X_filtered, feat_selector = feature_select_by_Boruta(
        new_forest(), X, y, features)

    # Train and predict without feature selection
    print('Result: all features')
    rfc = new_forest()
    rfc.fit(X, y)
    y_pred = rfc.predict(X_test)
    print(clf_report(y_test, y_pred,
                     target_names=target_names))

    # Train and predict on the selected features only
    print('Result: selected features')
    rfc_boruta = new_forest()
    rfc_boruta.fit(X_filtered, y)
    X_test_filtered = feat_selector.transform(X_test)
    y_pred_boruta = rfc_boruta.predict(X_test_filtered)
    print(clf_report(y_test, y_pred_boruta,
                     target_names=target_names))

if __name__ == '__main__':
    main()

Collecting Boruta
[?25l  Downloading https://files.pythonhosted.org/packages/b2/11/583f4eac99d802c79af9217e1eff56027742a69e6c866b295cce6a5a8fc2/Boruta-0.3-py3-none-any.whl (56kB)
[K     |█████▉                          | 10kB 8.0MB/s eta 0:00:01[K     |███████████▋                    | 20kB 4.4MB/s eta 0:00:01[K     |█████████████████▍              | 30kB 6.1MB/s eta 0:00:01[K     |███████████████████████▏        | 40kB 4.7MB/s eta 0:00:01[K     |█████████████████████████████   | 51kB 5.3MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 2.8MB/s 
Installing collected packages: Boruta
Successfully installed Boruta-0.3
Downloading student-grade-prediction.zip to /content
  0% 0.00/7.16k [00:00<?, ?B/s]
100% 7.16k/7.16k [00:00<00:00, 12.2MB/s]
Archive:  student-grade-prediction.zip
  inflating: student-mat.csv         
['bad', 'bad', 'bad', 'good', 'bad', ..., 'bad', 'good', 'bad', 'good', 'bad']
Length: 395
Categories (2, object): ['bad' < 'good']
=====Selected F

### プログラム3.19 リッジ回帰、Lasso回帰、Elastic Net による特徴選択および予測モデルの学習と評価

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer

# 回帰モデルを基に特徴選択
from sklearn.feature_selection import SelectFromModel
# リッジ回帰, Lasso回帰, Elastic Net
from sklearn.linear_model import Ridge, Lasso, ElasticNet


# Convert a categorical column to integer codes
def replace_to_digit(dff):
    """Encode the values of a column as 0-based integer codes.

    Codes follow the sorted order of the distinct values, so the smallest
    distinct value maps to 0.

    Args:
        dff: pandas Series (anything exposing ``.values``) whose values are
            hashable and mutually sortable.

    Returns:
        List of ints, one per element of ``dff``, in the original order.
    """
    raw = list(dff.values)
    # Build the value -> code table once; a list.index() call per element
    # would rescan the category list for every row.
    code_of = {cat: code for code, cat in enumerate(sorted(set(raw)))}
    return [code_of[v] for v in raw]

# データの準備
def prepare():
    !kaggle datasets download -d \
    mylesoneill/world-university-rankings
    !unzip world-university-rankings.zip
    # 世界大学ランキングのデータを使用
    # 分類に使用する特徴量
    features = ['country', 'national_rank', 
                'quality_of_education',
                'alumni_employment', 'quality_of_faculty',
                'publications', 'influence',
                'citations', 'broad_impact',
                'patents', 'score']

    df_train = pd.read_csv('cwurData.csv')
    df_train['country'] = \
    replace_to_digit(df_train['country'])
    X_train = df_train.loc[:,features].values
    y_train = df_train.loc[:,['world_rank']].values.ravel()
    # ビニングによりランキングを4分割
    bins = [0, 250, 500, 750, 1000]
    labels = [0, 1, 2, 3]
    classNames = ['(0,250]', '(250,500]', 
  '(500,750]', '(750,1000]']
    y_cut = pd.cut(
  df_train.loc[:,['world_rank']].values.ravel(),   bins=bins, labels=labels)
    y_train = [c for c in y_cut.codes]
    return X_train, y_train, features, classNames
  
# Preprocessing (mean imputation of missing values, min-max scaling)
def preprocess(X_train):
    """Fill NaNs with the column mean and apply ``MinMaxScaler``.

    Both transformers are fitted on the array passed in.

    Args:
        X_train: 2-D numeric array, possibly containing NaN.

    Returns:
        The imputed and scaled array.
    """
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    scaler = MinMaxScaler()
    # Impute first so the scaler never sees NaN.
    return scaler.fit_transform(imputer.fit_transform(X_train))

def main():
    """Compare logistic regression under four feature-selection settings.

    The settings are: no selection, and ``SelectFromModel`` driven by
    Ridge, Lasso and ElasticNet. For each one the selected features (if
    any), the test accuracy and a classification report are printed.
    """
    X_train, y_train, features, classNames = prepare()
    print('Original Features ', features)
    X_train = preprocess(X_train)
    X_train, X_test, y_train, y_test = train_test_split(
        X_train, y_train,
        random_state=1,
        train_size=0.8)
    # Four settings, including "no feature selection" (None).
    # Ridge's regularization strength must be non-negative; the previous
    # value -0.6 is rejected by scikit-learn's parameter validation.
    selectors = [None, Ridge(alpha=0.6),
                 Lasso(alpha=0.02),
                 ElasticNet(alpha=0.0001, l1_ratio=0.7)]
    for sel in selectors:
        if sel is None:
            # Single-line literals: a backslash continuation inside the
            # string embedded the next line's indentation in the output.
            print('Logistic Regression without Feature selection')
            X_train_sel = X_train
            X_test_sel = X_test
        else:
            sel_name = type(sel).__name__
            s_f = SelectFromModel(sel)
            s_f.fit(X_train, y_train)
            print('-- Selected Features by {} --'.format(sel_name))
            for i, f in enumerate(s_f.get_support()):
                if f:
                    print('%d: %s' % (f, features[i]))
            X_train_sel = s_f.transform(X_train)
            X_test_sel = s_f.transform(X_test)
            print('Logistic Regression by {} Feature selection'.format(
                sel_name))
        lr = LogisticRegression(max_iter=150)
        lr.fit(X_train_sel, y_train)
        print('\tTest set Accuracy: %.3lf\n' % lr.score(
            X_test_sel, y_test))
        y_pred = lr.predict(X_test_sel)
        print(classification_report(y_test, y_pred,
                                    target_names=classNames))

if __name__ == '__main__':
    main()


Downloading world-university-rankings.zip to /content
  0% 0.00/1.41M [00:00<?, ?B/s]
100% 1.41M/1.41M [00:00<00:00, 91.4MB/s]
Archive:  world-university-rankings.zip
  inflating: cwurData.csv            
  inflating: education_expenditure_supplementary_data.csv  
  inflating: educational_attainment_supplementary_data.csv  
  inflating: school_and_country_table.csv  
  inflating: shanghaiData.csv        
  inflating: timesData.csv           
Original Features  ['country', 'national_rank', 'quality_of_education', 'alumni_employment', 'quality_of_faculty', 'publications', 'influence', 'citations', 'broad_impact', 'patents', 'score']
Logistic Regression                  without Feature selection
	Test set Accuracy: 0.850

              precision    recall  f1-score   support

     (0,250]       0.92      0.96      0.94       140
   (250,500]       0.80      0.79      0.80       106
   (500,750]       0.77      0.73      0.75        94
  (750,1000]       0.88      0.87      0.87       10

### プログラム3.20 RidgeCV, LassoCV, ElasticNetCV を用いて、パラメータ選択し、予測モデルの学習と評価

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer

# 回帰モデルを基に特徴選択
from sklearn.feature_selection import SelectFromModel
# リッジ回帰, Lasso回帰, RidgeCV, LassoCVをインポート
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LassoCV
# ElasticNet, ElasticNetCV をインポート
from sklearn.linear_model import ElasticNet, ElasticNetCV

# Convert a categorical column to integer codes
def replace_to_digit(dff):
    """Encode the values of a column as 0-based integer codes.

    Codes follow the sorted order of the distinct values, so the smallest
    distinct value maps to 0.

    Args:
        dff: pandas Series (anything exposing ``.values``) whose values are
            hashable and mutually sortable.

    Returns:
        List of ints, one per element of ``dff``, in the original order.
    """
    values = list(dff.values)
    # A dict lookup per row replaces a linear list.index() scan per row.
    codes = {}
    for position, category in enumerate(sorted(set(values))):
        codes[category] = position
    return [codes[v] for v in values]

# Data preparation
def prepare():
    """Load the university-ranking data and build a 4-class problem.

    The download commands are commented out, so ``cwurData.csv`` must
    already be present in the working directory. ``country`` is
    integer-encoded with ``replace_to_digit`` and ``world_rank`` is binned
    into four classes that serve as the target.

    Returns:
        Tuple ``(X_train, y_train, features, classNames)``: the feature
        array, the list of class codes, the feature column names, and the
        display names of the four rank bins.
    """
    #!kaggle datasets download -d \
    #mylesoneill/world-university-rankings
    #!unzip world-university-rankings.zip
    # Features used for classification (world university ranking data)
    features = ['country', 'national_rank',
                'quality_of_education',
                'alumni_employment', 'quality_of_faculty',
                'publications', 'influence',
                'citations', 'broad_impact',
                'patents', 'score']

    df_train = pd.read_csv('cwurData.csv')
    df_train['country'] = replace_to_digit(df_train['country'])
    X_train = df_train.loc[:, features].values

    # Bin the ranking into four classes. The rank column is extracted once;
    # the earlier raw-rank y_train assignment was overwritten before use.
    bins = [0, 250, 500, 750, 1000]
    labels = [0, 1, 2, 3]
    classNames = ['(0,250]', '(250,500]',
                  '(500,750]', '(750,1000]']
    world_rank = df_train['world_rank'].values
    y_cut = pd.cut(world_rank, bins=bins, labels=labels)
    y_train = list(y_cut.codes)
    return X_train, y_train, features, classNames
  
# Preprocessing (mean imputation of missing values, min-max scaling)
def preprocess(X_train):
    """Replace NaNs by the column mean, then apply ``MinMaxScaler``.

    Both steps are fitted on the array passed in.

    Args:
        X_train: 2-D numeric array, possibly containing NaN.

    Returns:
        The imputed and scaled array.
    """
    steps = (
        SimpleImputer(missing_values=np.nan, strategy='mean'),  # imputation
        MinMaxScaler(),                                         # scaling
    )
    transformed = X_train
    for step in steps:
        transformed = step.fit_transform(transformed)
    return transformed

# Entry point
def main():
    """Tune the regularization strength by CV, then select and classify.

    For each of Ridge, Lasso and ElasticNet: the matching ``*CV`` estimator
    picks ``alpha`` from a fixed grid, the plain estimator with that
    ``alpha`` drives ``SelectFromModel``, and a logistic regression is
    trained on the selected features. The chosen ``alpha``, the selected
    features, the test accuracy and a classification report are printed.
    """
    X_train, y_train, features, classNames = prepare()
    print('Original Features ', features)
    X_train = preprocess(X_train)
    X_train, X_test, y_train, y_test = train_test_split(
        X_train, y_train,
        random_state=0,
        train_size=0.9)
    alphas = (0.01, 0.5, 1.0)
    cvs = [RidgeCV(alphas=alphas),
           LassoCV(alphas=alphas),
           ElasticNetCV(alphas=alphas)]
    selectors = [Ridge(), Lasso(), ElasticNet()]
    for cv, sel in zip(cvs, selectors):
        # Single-line literals: a backslash continuation inside the string
        # embedded the next line's indentation in the printed banner.
        print('Logistic Regression with {} Feature Selection'.format(
            type(cv).__name__))
        # Choose the regularization strength by cross-validation ...
        cv.fit(X_train, y_train)
        print(cv.alpha_)
        # ... and hand it to the estimator that drives the selection.
        sel.set_params(alpha=cv.alpha_)
        s_f = SelectFromModel(sel)
        s_f.fit(X_train, y_train)
        print('- Selected Features by {} -'.format(type(sel).__name__))
        for i, f in enumerate(s_f.get_support()):
            if f:
                print('%d: %s' % (f, features[i]))

        # Logistic regression on the selected features
        X_train_sel = s_f.transform(X_train)
        X_test_sel = s_f.transform(X_test)
        lr = LogisticRegression()
        lr.fit(X_train_sel, y_train)
        y_pred = lr.predict(X_test_sel)
        print('\tTest set Accuracy: %.3lf\n' % lr.score(
            X_test_sel, y_test))
        print(classification_report(y_test, y_pred,
                                    target_names=classNames))



if __name__ == '__main__':
    main()


Original Features  ['country', 'national_rank', 'quality_of_education', 'alumni_employment', 'quality_of_faculty', 'publications', 'influence', 'citations', 'broad_impact', 'patents', 'score']
Logistic Regression with RidgeCV                Feature Selection
0.5
- Selected Features                 by Ridge -
1: alumni_employment
1: publications
1: broad_impact
1: score
	Test set Accuracy: 0.855

              precision    recall  f1-score   support

     (0,250]       0.91      0.91      0.91        68
   (250,500]       0.80      0.75      0.78        60
   (500,750]       0.74      0.80      0.77        44
  (750,1000]       0.94      0.96      0.95        48

    accuracy                           0.85       220
   macro avg       0.85      0.85      0.85       220
weighted avg       0.85      0.85      0.85       220

Logistic Regression with LassoCV                Feature Selection
0.01
- Selected Features                 by Lasso -
1: quality_of_education
1: alumni_employment
1: 