In [4]:
import typing as tp

import pandas as pd

In [5]:
df = pd.read_csv('SMS.tsv', delimiter='\t')
df

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
df.describe()

Unnamed: 0,class,text
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [26]:
y = df['class'].map({'ham': 0, 'spam': 1})

In [27]:
y, y.shape

(0       0
 1       0
 2       1
 3       0
 4       0
        ..
 5567    1
 5568    0
 5569    0
 5570    0
 5571    0
 Name: class, Length: 5572, dtype: int64,
 (5572,))

In [9]:
df["text"][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [10]:
import nltk
from nltk.tokenize import word_tokenize

In [None]:
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download()

In [None]:
nltk.download('punkt_tab')

In [11]:
import numpy as np

In [12]:
def calc_frequency_matrix(df: pd.DataFrame) -> [np.ndarray, list[str]]:
    unique_words = set()
    tokenized_texts = df['text'].apply(lambda x: word_tokenize(x.lower()))
    for text in tokenized_texts:
        unique_words.update(text)

    unique_words = list(unique_words)
    word_index = {word: i for i, word in enumerate(unique_words)}
    frequency_matrix = []
    for text in tokenized_texts:
        word_count = [0] * len(unique_words)
        for word in text:
            if word in word_index:
                word_count[word_index[word]] += 1
        frequency_matrix.append(word_count)
    return np.array(frequency_matrix), unique_words

In [13]:
frequency_matrix, unique_words = calc_frequency_matrix(df)

In [14]:
print(unique_words, len(unique_words))



In [15]:
frequency_matrix.shape

(5572, 9439)

In [16]:
def get_used_words(vector: np.ndarray, unique_words: list[str]) -> list[str]:
    non_zero_indices = np.where(vector > 0)[0]
    words = [unique_words[i] for i in non_zero_indices]
    return words

In [17]:
vector = frequency_matrix[0]
used_words = get_used_words(vector, unique_words)

In [18]:
used_words

['n',
 'cine',
 'got',
 'great',
 'la',
 'buffet',
 'wat',
 'amore',
 'point',
 'until',
 ',',
 'available',
 'in',
 'world',
 'there',
 'e',
 'jurong',
 'bugis',
 '..',
 'go',
 'crazy',
 'only',
 '...']

In [19]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [20]:
def print_accuracy(X: np.ndarray, y: np.ndarray, type: str) -> None:
    classifiers: dict[
        str,
        tp.Union[DecisionTreeClassifier, SVC, KNeighborsClassifier]
    ] = {
        'Decision Tree': DecisionTreeClassifier(),
        'SVM': SVC(),
        'KNN': KNeighborsClassifier()
    }
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    for name, classifier in classifiers.items():
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        print(
            f'{type}:> '
            f'Classifier: {name}. '
            f'Accuracy: {accuracy_score(y_test, y_pred)}. '
        )


def print_diff_classifiers(X, y, selected_features, select_method: str) -> None:
    print_accuracy(X=X, y=y, type=f"Before {select_method}")
    X_selected = X[:, selected_features]
    print_accuracy(X=X_selected, y=y, type=f"After {select_method}")

Filter method:>

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

In [None]:
select_kbest = SelectKBest(chi2, k=30)
X_new = select_kbest.fit_transform(frequency_matrix, y)
selected_features = np.array(unique_words)[select_kbest.get_support()]

print("Best features by filter KBest:", selected_features.tolist())

In [None]:
print_diff_classifiers(frequency_matrix, y, select_kbest.get_support(), select_method="SelectKBest")

Wrapper method:>

In [31]:
frequency_matrix = frequency_matrix[:500, :]
y = y[:500]

In [None]:
from sklearn.feature_selection import RFE

In [33]:
model = DecisionTreeClassifier()
rfe = RFE(estimator=model, n_features_to_select=10)
rfe = rfe.fit(frequency_matrix, y)
rfe.ranking_

array([9430, 9429, 9428, ..., 4258, 4260, 5721])

In [35]:
important_features = np.array(unique_words)[rfe.support_]
print("Best features by wrapper RFE:", important_features.tolist())

Best features by wrapper RFE: ['prize', 'link', 'to', 'reply', 'tron', 'plane', 'special', 'hear', '09064019788', 'i']


In [36]:
print_diff_classifiers(frequency_matrix, y, rfe.support_, select_method="RFE")

Before RFE:> Classifier: Decision Tree. Accuracy: 0.9. 
Before RFE:> Classifier: SVM. Accuracy: 1.0. 
Before RFE:> Classifier: KNN. Accuracy: 0.95. 
After RFE:> Classifier: Decision Tree. Accuracy: 0.95. 
After RFE:> Classifier: SVM. Accuracy: 0.9. 
After RFE:> Classifier: KNN. Accuracy: 0.95. 


Embedded method:>

In [None]:
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(frequency_matrix, y)
features = decision_tree.feature_importances_
tuple_values = [(i, unique_words[i], importance) for i, importance in enumerate(features)]
tuple_values = sorted(tuple_values, key=lambda x: x[2], reverse=True)
indices, names, _ = zip(*tuple_values)
# Печать наиболее важных признаков
print("Best features by embedded DecisionTreeClassifier:", names[:30])

In [None]:
print_diff_classifiers(frequency_matrix, y, indices[:30], select_method="DecisionTreeClassifier")

In [None]:
list1 = ['stop', 'horo', '#', 'have', 'your', 'text', 'mobile', 'pix', 'or', 'ask', 'a', 'aight', 'call', 'not', 'i', 'me', '!', 'u', 'with', 'extra', 'its', 'later', 'to', 'free', 'for', 'prize', 'quite', 'txt', 'you', 'while']
list2 = ['call', 'txt', 'i', 'text', 'me', 'reply', 'http', 'won', 'to', '150p/msg', 'claim', 'you', 'tones', 'reveal', 'ringtone', 'service', '18', 'stop', 'my', 'for', 'now', '..', '150p', 'selection', 'ill', 'chat', 'ask', 'not', ';', 'games']
set1 = set(list1)
set2 = set(list2)
common_elements = set1.intersection(set2)
unique_in_list1 = set1.difference(set2)
unique_in_list2 = set2.difference(set1)
common_elements = list(common_elements)
unique_in_list1 = list(unique_in_list1)
unique_in_list2 = list(unique_in_list2)
print("Common elements:", common_elements)
print("Unique to list1:", unique_in_list1)
print("Unique to list2:", unique_in_list2)