# Лабораторная №6. Выбор признаков

In [6]:
import pandas as pd
from tqdm import tqdm
import numpy as np

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

df = pd.read_table('SMS.tsv')
df = df.sample(n=1000, random_state=0)

vectorizer_full = TfidfVectorizer()
X_full = vectorizer_full.fit_transform(df['text'])

vectorizer_500 = TfidfVectorizer(max_features=500)
X_500 = vectorizer_500.fit_transform(df['text'])

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['class'])

## Реализуйте 3 метода выбора признаков

### LASSO

In [19]:
def coordinate_descent_lasso(X, y, alpha, max_iter=1000, tol=0.05):
    n_samples, n_features = X.shape

    w = np.zeros(n_features)
    w_prev = np.zeros(n_features)
    residual = y - np.dot(X, w)

    L = np.linalg.norm(X, axis=0) ** 2

    for _ in tqdm(range(max_iter)):
        for j in range(n_features):
            X_j = X[:, j]
            old_w_j = w[j]

            X_j_dot_residual = np.dot(X_j, residual)
            w[j] = soft_threshold(X_j_dot_residual, alpha) / L[j]

            residual += X_j * (old_w_j - w[j])

        if np.linalg.norm(w - w_prev) < tol:
            break

        w_prev = np.copy(w)

    return w

def soft_threshold(x, alpha):
    if x > alpha:
        return x - alpha
    elif x < -alpha:
        return x + alpha
    else:
        return 0

alpha = 0.5
coefs = coordinate_descent_lasso(X_full.toarray(), y, alpha)

feature_coefs = zip(vectorizer_full.get_feature_names_out(), coefs)

sorted_features_embedded = sorted(feature_coefs, key=lambda x: abs(x[1]), reverse=True)

print()
for feature, coef in sorted_features_embedded[:30]:
    print(f"Feature: {feature:20} Coefficient: {coef:.5f}")

 47%|████▋     | 473/1000 [00:24<00:26, 19.68it/s]


Feature: txt                  Coefficient: 1.33139
Feature: 50                   Coefficient: 1.09355
Feature: win                  Coefficient: 0.85165
Feature: customer             Coefficient: 0.84825
Feature: 150p                 Coefficient: 0.84153
Feature: claim                Coefficient: 0.80802
Feature: rate                 Coefficient: 0.79353
Feature: www                  Coefficient: 0.78064
Feature: 18                   Coefficient: 0.77729
Feature: mobile               Coefficient: 0.69722
Feature: service              Coefficient: 0.64043
Feature: stop                 Coefficient: 0.58831
Feature: awarded              Coefficient: 0.53003
Feature: 500                  Coefficient: 0.51348
Feature: uk                   Coefficient: 0.51209
Feature: com                  Coefficient: 0.50524
Feature: won                  Coefficient: 0.49435
Feature: free                 Coefficient: 0.49251
Feature: cs                   Coefficient: 0.48079
Feature: text                 




### Forward Selection

In [121]:
import statsmodels.api as sm

def forward_selection(data, target, significance_level=0.05, num_best_features=30):
    initial_features = data.columns.tolist()
    best_features = []
    feature_scores = {}
    while len(best_features) < num_best_features and len(initial_features) > 0:
        remaining_features = list(set(initial_features) - set(best_features))
        new_pval = pd.Series(index=remaining_features, dtype=np.float64)
        for new_column in tqdm(remaining_features):
            model = sm.OLS(target, sm.add_constant(data[best_features + [new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]
        min_p_value = new_pval.min()
        if min_p_value < significance_level:
            best_feature = new_pval.idxmin()
            best_features.append(best_feature)
            feature_scores[best_feature] = -np.log10(min_p_value)
        else:
            break
    return feature_scores

In [123]:
df_vectorized = pd.DataFrame(X_500.todense(), columns=vectorizer_500.get_feature_names_out())

feature_scores = forward_selection(df_vectorized, y)

100%|██████████| 500/500 [00:01<00:00, 266.78it/s]
100%|██████████| 499/499 [00:02<00:00, 203.07it/s]
100%|██████████| 498/498 [00:02<00:00, 229.92it/s]
100%|██████████| 497/497 [00:02<00:00, 243.99it/s]
100%|██████████| 496/496 [00:02<00:00, 223.09it/s]
100%|██████████| 495/495 [00:02<00:00, 204.65it/s]
100%|██████████| 494/494 [00:02<00:00, 178.98it/s]
100%|██████████| 493/493 [00:03<00:00, 133.39it/s]
100%|██████████| 492/492 [00:04<00:00, 103.08it/s]
100%|██████████| 491/491 [00:04<00:00, 100.59it/s]
100%|██████████| 490/490 [00:07<00:00, 69.42it/s] 
100%|██████████| 489/489 [00:04<00:00, 106.30it/s]
100%|██████████| 488/488 [00:06<00:00, 73.90it/s]
100%|██████████| 487/487 [00:05<00:00, 81.37it/s]
100%|██████████| 486/486 [00:05<00:00, 90.79it/s]
100%|██████████| 485/485 [00:08<00:00, 59.24it/s]
100%|██████████| 484/484 [00:05<00:00, 83.11it/s]
100%|██████████| 483/483 [00:08<00:00, 57.30it/s]
100%|██████████| 482/482 [00:06<00:00, 80.09it/s]
100%|██████████| 481/481 [00:09<00:00,

In [124]:
sorted_scores = sorted(feature_scores.items(), key=lambda x: x[1], reverse=True)
for feature, score in sorted_scores:
    print(f'Feature: {feature:15} Score: {score:.3f}')

Feature: txt             Score: 49.387
Feature: call            Score: 47.062
Feature: mobile          Score: 24.918
Feature: 18              Score: 19.468
Feature: won             Score: 17.213
Feature: text            Score: 16.623
Feature: customer        Score: 14.200
Feature: rate            Score: 13.651
Feature: reply           Score: 13.019
Feature: free            Score: 12.697
Feature: 150p            Score: 12.430
Feature: 50              Score: 11.894
Feature: landline        Score: 11.892
Feature: www             Score: 11.714
Feature: 800             Score: 11.662
Feature: line            Score: 11.509
Feature: win             Score: 11.204
Feature: latest          Score: 10.540
Feature: 100             Score: 10.433
Feature: ringtone        Score: 9.262
Feature: live            Score: 8.356
Feature: co              Score: 7.957
Feature: video           Score: 7.443
Feature: message         Score: 7.306
Feature: who             Score: 7.251
Feature: chat            Score:

### Mutual Information

In [55]:
def mutual_information(feature, target):
    feature_values = np.unique(feature)
    target_values = np.unique(target)
    mi = 0.0
    for f_val in feature_values:
        for t_val in target_values:
            p_ft = np.logical_and(feature == f_val, target == t_val).mean()
            p_f = (feature == f_val).mean()
            p_t = (target == t_val).mean()
            if p_ft > 0.0 and p_f > 0.0 and p_t > 0.0:
                mi += p_ft * np.log2(p_ft / (p_f * p_t))
    return mi

mi_values = []

feature_names = vectorizer_full.get_feature_names_out()
for feature_idx in tqdm(range(X_full.shape[1])):
    feature = X_full[:, feature_idx].toarray().flatten()
    mi = mutual_information(feature, y)
    mi_values.append(mi)

100%|██████████| 3287/3287 [00:03<00:00, 995.98it/s]


In [56]:
features_df = pd.DataFrame({'Feature': feature_names, 'Mutual Information': mi_values})

features_df = features_df.sort_values(by='Mutual Information', ascending=False)

print(features_df.head(30))

     Feature  Mutual Information
2909      to            0.293247
653     call            0.226399
3270     you            0.176752
2043     now            0.127425
2853     the            0.123571
1211     for            0.118526
3273    your            0.115142
2103      or            0.112580
2985     txt            0.106989
1228    free            0.102279
1559      is            0.088887
1243    from            0.084309
1386    have            0.081443
2838    text            0.081094
2087      on            0.081084
376      and            0.078179
1514      in            0.072279
1278     get            0.070882
1916  mobile            0.070121
3193    with            0.069046
3026      ur            0.067228
2067      of            0.065098
2871    this            0.063445
2717    stop            0.058513
2024      no            0.056005
193       50            0.054950
2092    only            0.053682
2013     new            0.053577
418      are            0.052055
1848      

## Библиотечные методы (chi2, f_classif, RFE + RandomForest)

In [80]:
from sklearn.feature_selection import chi2, f_classif, SelectKBest, RFE
from sklearn.ensemble import RandomForestClassifier

chi2_scores, _ = chi2(X_500, y)
f_scores, _ = f_classif(X_500, y)

k = 30

chi2_selector = SelectKBest(chi2, k=k)
f_selector = SelectKBest(f_classif, k=k)

X_chi2 = chi2_selector.fit_transform(X_500, y)
X_f = f_selector.fit_transform(X_500, y)

rf_selector = RFE(RandomForestClassifier(), n_features_to_select=k, verbose=1)
X_rf = rf_selector.fit_transform(X_500, y)

Fitting estimator with 500 features.
Fitting estimator with 499 features.
Fitting estimator with 498 features.
Fitting estimator with 497 features.
Fitting estimator with 496 features.
Fitting estimator with 495 features.
Fitting estimator with 494 features.
Fitting estimator with 493 features.
Fitting estimator with 492 features.
Fitting estimator with 491 features.
Fitting estimator with 490 features.
Fitting estimator with 489 features.
Fitting estimator with 488 features.
Fitting estimator with 487 features.
Fitting estimator with 486 features.
Fitting estimator with 485 features.
Fitting estimator with 484 features.
Fitting estimator with 483 features.
Fitting estimator with 482 features.
Fitting estimator with 481 features.
Fitting estimator with 480 features.
Fitting estimator with 479 features.
Fitting estimator with 478 features.
Fitting estimator with 477 features.
Fitting estimator with 476 features.
Fitting estimator with 475 features.
Fitting estimator with 474 features.
F

In [81]:
feature_names = vectorizer_500.get_feature_names_out()

chi2_indices = chi2_selector.get_support(indices=True)
f_indices = f_selector.get_support(indices=True)
rf_indices = rf_selector.get_support(indices=True)

chi2_features = [(feature_names[i], chi2_scores[i]) for i in chi2_indices]
f_features = [(feature_names[i], f_scores[i]) for i in f_indices]
rf_features = [(feature_names[i], 1) for i in rf_indices]

chi2_features.sort(key=lambda x: x[1], reverse=True)
f_features.sort(key=lambda x: x[1], reverse=True)
rf_features.sort(key=lambda x: x[1], reverse=True)

In [82]:
print("Top 30 features selected by chi2:")
for feature, score in chi2_features:
    print(f"Feature: {feature:20} Score: {score}")

Top 30 features selected by chi2:
Feature: txt                  Score: 46.85710894738313
Feature: free                 Score: 39.80093591256124
Feature: call                 Score: 36.60371416208688
Feature: mobile               Score: 31.245609033363756
Feature: 50                   Score: 27.619281820787094
Feature: claim                Score: 26.033778007651218
Feature: stop                 Score: 24.443381896185656
Feature: www                  Score: 23.325711684830146
Feature: nokia                Score: 22.221807954102687
Feature: 150p                 Score: 21.77438105782459
Feature: win                  Score: 20.06898330418751
Feature: 18                   Score: 19.67672181480045
Feature: text                 Score: 19.499686232691662
Feature: uk                   Score: 19.21245936075709
Feature: 100                  Score: 18.93489415811446
Feature: service              Score: 18.8044511086952
Feature: customer             Score: 18.646455014367373
Feature: tone           

In [83]:
print("Top 30 features selected by f_classif:")
for feature, score in f_features:
    print(f"Feature: {feature:20} Score: {score}")

Top 30 features selected by f_classif:
Feature: txt                  Score: 248.23209001957727
Feature: call                 Score: 161.7198330485248
Feature: free                 Score: 139.4411507380109
Feature: mobile               Score: 131.63868188545246
Feature: claim                Score: 109.77787299872361
Feature: 50                   Score: 108.76952786757332
Feature: www                  Score: 95.66615767599221
Feature: stop                 Score: 95.20145166552939
Feature: 150p                 Score: 82.71258305357567
Feature: 18                   Score: 75.90378514492522
Feature: win                  Score: 75.06210710585952
Feature: text                 Score: 73.98346696500431
Feature: prize                Score: 69.65438767196164
Feature: cs                   Score: 68.31374584208697
Feature: uk                   Score: 65.07178260933297
Feature: nokia                Score: 63.35145655860539
Feature: customer             Score: 60.63792048080779
Feature: reply        

In [84]:
print("Top 30 features selected by RFE (Random Forest):")
for feature, _ in rf_features:
    print(f"Feature: {feature:20}")

Top 30 features selected by RFE (Random Forest):
Feature: 150p                
Feature: 18                  
Feature: 50                  
Feature: call                
Feature: claim               
Feature: com                 
Feature: customer            
Feature: for                 
Feature: free                
Feature: from                
Feature: have                
Feature: in                  
Feature: me                  
Feature: mobile              
Feature: no                  
Feature: now                 
Feature: on                  
Feature: or                  
Feature: rate                
Feature: reply               
Feature: service             
Feature: stop                
Feature: text                
Feature: to                  
Feature: txt                 
Feature: uk                  
Feature: win                 
Feature: www                 
Feature: you                 
Feature: your                


In [88]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_500, y, test_size=0.2, random_state=0)

In [89]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

classifiers = [
    ('Logistic Regression', LogisticRegression()),
    ('Support Vector Machine', SVC()),
    ('K-Nearest Neighbors', KNeighborsClassifier())
]

for name, classifier in classifiers:
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Classifier: {name:30} Accuracy: {accuracy}")

Classifier: Logistic Regression            Accuracy: 0.925
Classifier: Support Vector Machine         Accuracy: 0.965
Classifier: K-Nearest Neighbors            Accuracy: 0.86


In [90]:
X_train_chi2 = chi2_selector.transform(X_train)
X_test_chi2 = chi2_selector.transform(X_test)

for name, classifier in classifiers:
    classifier.fit(X_train_chi2, y_train)
    y_pred = classifier.predict(X_test_chi2)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Classifier: {name:30} Accuracy (after chi2): {accuracy}")

Classifier: Logistic Regression            Accuracy (after chi2): 0.88
Classifier: Support Vector Machine         Accuracy (after chi2): 0.965
Classifier: K-Nearest Neighbors            Accuracy (after chi2): 0.925


In [91]:
X_train_f = f_selector.transform(X_train)
X_test_f = f_selector.transform(X_test)

for name, classifier in classifiers:
    classifier.fit(X_train_f, y_train)
    y_pred = classifier.predict(X_test_f)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Classifier: {name:30} Accuracy (after f_classif): {accuracy}")

Classifier: Logistic Regression            Accuracy (after f_classif): 0.87
Classifier: Support Vector Machine         Accuracy (after f_classif): 0.965
Classifier: K-Nearest Neighbors            Accuracy (after f_classif): 0.94


In [92]:
X_train_rf = rf_selector.transform(X_train)
X_test_rf = rf_selector.transform(X_test)

for name, classifier in classifiers:
    classifier.fit(X_train_rf, y_train)
    y_pred = classifier.predict(X_test_rf)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Classifier: {name:30} Accuracy (after RFE): {accuracy}")

Classifier: Logistic Regression            Accuracy (after RFE): 0.89
Classifier: Support Vector Machine         Accuracy (after RFE): 0.965
Classifier: K-Nearest Neighbors            Accuracy (after RFE): 0.925


- Для Logistic Regression скор ухудшился
- Для SVM разницы нет
- Для KNN стало лучше