## 1. Introduction GNS data to microbe classification

 We have a big dataset so we want to reduce the dimensions in it....

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
path_train = 'classification_vg/Bacterial_identification_DNA_oligomers/data/train.csv'
df = pd.read_csv(path_train, sep  = ',', index_col = 0)

In [4]:
Y = df['target']
X = df.drop(columns = ['target'])

from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(df['target'])

In [15]:
print(f"The dimensions of my train DataFrame are:{X.shape}")
print(f"The dimension of my target vector is:{Y.shape}")

The dimensions of my train DataFrame are:(200000, 286)
The dimension of my target vector is:(200000,)


In [16]:
print(X.isnull().sum()[X.isnull().sum() != 0])
print(X.dtypes[X.dtypes != 'float64'])

Series([], dtype: int64)
Series([], dtype: object)


How to reduce the dimensions of the data? The first idea is to use a model of decomposition, or a information criterion to exclude some variables while doing the analysis

We have a lot of methods to reduce dimentionality, the principal ideas are ''RandomForestClassifier'', Principal Component Analysis (PCA), Autoencoders, pearson correlation test, Recursive Feature Elimination (RFE), akaike criterion

### Akaike criterion

In [5]:
import statsmodels.api as sm
aic_values = []

# Iterating throught the columns 
for col in X.columns:
    X_subset = sm.add_constant(X[[col]])  # Adiciona uma constante ao modelo
    model = sm.OLS(Y, X_subset).fit()  # Ajusta o modelo de regressão linear
    aic_values.append((col, model.aic))  # Armazena a coluna e o AIC

aic_values.sort(key=lambda x: x[1])
aic_values

[('A3T3G2C2', 986948.9136313926),
 ('A7T2G1C0', 987377.3171091204),
 ('A1T7G0C2', 987777.4714120368),
 ('A8T1G1C0', 987827.4677093099),
 ('A3T7G0C0', 987943.270321532),
 ('A1T8G0C1', 988225.0661281015),
 ('A1T7G1C1', 988277.9392651439),
 ('A4T6G0C0', 988468.4409273467),
 ('A8T2G0C0', 988518.8313555499),
 ('A7T2G0C1', 988555.6817607414),
 ('A7T1G1C1', 988595.5807153953),
 ('A8T1G0C1', 988602.7476067932),
 ('A2T4G2C2', 988705.9283344318),
 ('A2T7G0C1', 988784.9097027966),
 ('A6T2G2C0', 988792.5999506812),
 ('A3T3G3C1', 988802.70088092),
 ('A7T3G0C0', 988835.8880297549),
 ('A2T8G0C0', 988846.9708095687),
 ('A8T0G1C1', 988861.8904805073),
 ('A2T7G1C0', 988876.784064744),
 ('A4T2G2C2', 988893.2194305409),
 ('A3T3G1C3', 988952.50188503),
 ('A6T4G0C0', 988956.93431616),
 ('A7T1G2C0', 989007.7869416047),
 ('A3T6G0C1', 989031.9708612452),
 ('A0T8G1C1', 989050.7941420947),
 ('A3T4G2C1', 989061.7182082995),
 ('A5T1G1C3', 989179.4471450197),
 ('A3T4G1C2', 989183.8062128406),
 ('A6T1G2C1', 989240.7

### RandomForestClassifier

In [5]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, Y)

feature_importances = model.feature_importances_

importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

cumulative_importance = importance_df['Importance'].cumsum()
selected_features = importance_df[cumulative_importance <= 0.5]['Feature'].values
#top_features = importance_df.head(10)['Feature'].values
X_reduced = X[selected_features]

print(f"Top características selecionadas: {select}")

NameError: name 'top_features' is not defined

In [7]:
X_reduced.shape

(200000, 76)

In [23]:
n = 50
print(importance_df.head(n))
print(importance_df.head(n).iloc[-1])

      Feature  Importance
80   A1T1G4C4    0.014371
134  A2T1G4C3    0.012675
88   A1T2G3C4    0.010872
90   A1T2G5C2    0.010430
229  A4T6G0C0    0.010028
70   A1T0G4C5    0.009914
228  A4T5G1C0    0.009652
133  A2T1G3C4    0.009364
87   A1T2G2C5    0.008750
71   A1T0G5C4    0.008621
189  A3T3G2C2    0.008567
15   A0T1G4C5    0.008548
79   A1T1G3C5    0.008545
190  A3T3G3C1    0.008296
16   A0T1G5C4    0.008218
249  A5T4G1C0    0.008170
89   A1T2G4C3    0.008158
197  A3T5G1C1    0.008118
132  A2T1G2C5    0.008070
153  A2T4G2C2    0.008040
227  A4T5G0C1    0.007679
5    A0T0G5C5    0.007435
40   A0T4G2C4    0.007398
217  A4T2G2C2    0.007057
206  A4T0G4C2    0.006676
141  A2T2G3C3    0.006655
154  A2T4G3C1    0.006651
188  A3T3G1C3    0.006417
193  A3T4G1C2    0.006154
126  A2T0G5C3    0.006093
63   A0T9G0C1    0.006024
265  A6T4G0C0    0.006007
81   A1T1G5C3    0.005982
100  A1T4G0C5    0.005978
216  A4T2G1C3    0.005970
104  A1T4G4C1    0.005950
135  A2T1G5C2    0.005889
39   A0T4G1C

In [24]:
sum = 0

while feature, importance in importance_df:
    sum += importance
    if importance > 0.5:
        
        break

Feature       A1T1G4C4A2T1G4C3A1T2G3C4A1T2G5C2A4T6G0C0A1T0G4...
Importance                                                  1.0
dtype: object

### Pearson correlation test