In [99]:
from scipy.io import arff
import pandas as pd
import numpy as np 

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.svm import SVC

from sklearn.preprocessing import LabelEncoder


In [97]:
data, meta = arff.loadarff("Lung.arff")

df = pd.DataFrame(data)

print(df)


     AFFX-MurIL2_at  AFFX-MurIL10_at  AFFX-MurIL4_at  AFFX-MurFAS_at  \
0           -18.600            10.54           0.010          19.440   
1             9.120             9.12          10.180          29.290   
2            -2.175            -2.21          -0.060           6.320   
3            -1.540            21.75           5.835          23.815   
4            -9.070             3.08          -1.980          17.260   
..              ...              ...             ...             ...   
198          35.140           106.16          52.280          65.340   
199         -21.150           -31.20         -11.820           8.280   
200          26.900            10.44          18.230          33.830   
201          23.800            29.14          31.800          65.610   
202         -18.370            -1.03          -8.260          27.150   

     AFFX-BioB-5_at  AFFX-BioB-M_at  AFFX-BioB-3_at  AFFX-BioC-5_at  \
0           -16.980          -27.50          -1.600           38

In [100]:
df.duplicated().sum()

0

In [101]:
df.isnull().sum().sum()

0

In [102]:
dicti = {}
for i in df[:]['type']:
    if i not in dicti.keys():
        dicti[i] = 1
    else:
        dicti[i] += 1
print(dicti)

{b'1': 139, b'2': 17, b'3': 6, b'4': 21, b'5': 20}


In [103]:
X = df.drop('type', axis=1)
y = df['type']

In [104]:

label_encoder = LabelEncoder()
y_binary = label_encoder.fit_transform(y)
y_binary

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4])

In [141]:

scores=[]

svc=SVC()

no_of_features=[x*10 for x in range(1,1200)]

for i in no_of_features:
    
    selector = SelectKBest(f_classif, k=i)  
    X_selected = selector.fit_transform(X, y_binary)

    svc.fit(X_selected,y_binary)
    scores.append(svc.score(X_selected,y_binary))
    

In [143]:
socre_per_features=[i for i in zip(no_of_features,scores)]
socre_per_features=np.array(socre_per_features)
socre_per_features

array([[1.00000000e+01, 7.83251232e-01],
       [2.00000000e+01, 7.83251232e-01],
       [3.00000000e+01, 7.98029557e-01],
       ...,
       [1.19700000e+04, 9.35960591e-01],
       [1.19800000e+04, 9.35960591e-01],
       [1.19900000e+04, 9.35960591e-01]])

In [176]:

best_no_of_features_values = max(socre_per_features, key=lambda x: x[1])

best_no_of_features=best_no_of_features_values[0]
highest_accuracy=best_no_of_features_values[1]
print(f"{int(best_no_of_features)} is the best number of feature based on f_classif feature selection technique technique with {highest_accuracy: .3} accuracy using SVM model")


750 is the best number of feature based on f_classif feature selection technique technique with  0.961 accuracy using SVM model


In [177]:
selector = SelectKBest(f_classif, k=750)  # Select top 5 features (adjust as needed)
X_selected = selector.fit_transform(X, y)

selected_indices = selector.get_support(indices=True)

#selected genes
selected_features = X.columns[selected_indices]
print("Selected Features:", selected_features)


Selected Features: Index(['31403_at', '31444_s_at', '31525_s_at', '31638_at', '31687_f_at',
       '31918_at', '32425_at', '32444_at', '32996_g_at', '33051_at',
       ...
       '268_at', '260_at', '227_g_at', '236_at', '202_at', '185_at', '197_at',
       '198_g_at', '162_at', '113_i_at'],
      dtype='object', length=750)
