In [2]:
import typing as tp

import pandas as pd

In [3]:
df = pd.read_csv('SMS.tsv', delimiter='\t')
df

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
df.describe()

Unnamed: 0,class,text
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [5]:
y = df['class'].map({'ham': 0, 'spam': 1})

In [6]:
y, y.shape

(0       0
 1       0
 2       1
 3       0
 4       0
        ..
 5567    1
 5568    0
 5569    0
 5570    0
 5571    0
 Name: class, Length: 5572, dtype: int64,
 (5572,))

In [7]:
df["text"][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [8]:
import nltk
from nltk.tokenize import word_tokenize

In [12]:
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [13]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/hedwig_industries/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [9]:
import numpy as np

In [10]:
def calc_frequency_matrix(df: pd.DataFrame) -> [np.ndarray, list[str]]:
    unique_words = set()

    tokenized_texts = df['text'].apply(lambda x: word_tokenize(x.lower()))
    for text in tokenized_texts:
        unique_words.update(text)

    unique_words = list(unique_words)
    word_index = {word: i for i, word in enumerate(unique_words)}
    frequency_matrix = []
    for text in tokenized_texts:
        word_count = [0] * len(unique_words)
        for word in text:
            if word in word_index:
                word_count[word_index[word]] += 1
        frequency_matrix.append(word_count)
    return np.array(frequency_matrix), unique_words

In [11]:
frequency_matrix, unique_words = calc_frequency_matrix(df)

In [12]:
print(unique_words, len(unique_words))



In [13]:
frequency_matrix.shape

(5572, 9439)

In [14]:
def get_used_words(vector: np.ndarray, unique_words: list[str]) -> list[str]:
    non_zero_indices = np.where(vector > 0)[0]
    words = [unique_words[i] for i in non_zero_indices]
    return words

In [15]:
vector = frequency_matrix[0]
used_words = get_used_words(vector, unique_words)

In [16]:
used_words

['wat',
 'e',
 'buffet',
 'amore',
 'n',
 'until',
 'bugis',
 'got',
 'world',
 'crazy',
 'in',
 'jurong',
 'available',
 ',',
 'go',
 '...',
 '..',
 'only',
 'cine',
 'point',
 'great',
 'la',
 'there']

In [17]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [18]:
def print_accuracy(X: np.ndarray, y: np.ndarray, type: str) -> None:
    classifiers: dict[
        str,
        tp.Union[DecisionTreeClassifier, SVC, KNeighborsClassifier]
    ] = {
        'Decision Tree': DecisionTreeClassifier(),
        'SVM': SVC(),
        'KNN': KNeighborsClassifier()
    }
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    for name, classifier in classifiers.items():
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        print(
            f'{type}:> '
            f'Classifier: {name}. '
            f'Accuracy: {accuracy_score(y_test, y_pred)}. '
        )


def print_diff_classifiers(X, y, selected_features, select_method: str) -> None:
    print_accuracy(X=X, y=y, type=f"Before {select_method}")
    X_selected = X[:, selected_features]
    print_accuracy(X=X_selected, y=y, type=f"After {select_method}")

Filter method: >

In [19]:
from scipy.stats import pearsonr
import numpy as np


def calculate_pearson_correlation(X: np.ndarray, y: np.ndarray) -> np.ndarray:
    n_features = X.shape[1]
    feature_correlations = np.zeros(n_features)
    for feature_idx in range(n_features):
        feature_values = X[:, feature_idx]
        corr, _ = pearsonr(feature_values, y)
        feature_correlations[feature_idx] = corr
    return feature_correlations


def filter_top_features_by_pearson(X, y, top_n=30) -> np.ndarray:
    feature_correlations = calculate_pearson_correlation(X, y)
    top_features_idx = np.argsort(np.abs(feature_correlations))[-top_n:]
    return top_features_idx

In [20]:
selected_features_idx = filter_top_features_by_pearson(frequency_matrix, y)
selected_feature_pearson = np.array(unique_words)[selected_features_idx]
print("Best features by filter Pearson correlation:", selected_feature_pearson.tolist())

Best features by filter Pearson correlation: ['our', 'i', 'tone', 'awarded', 'customer', 'from', 'cs', 'nokia', 'win', 'contact', 't', 'your', 'now', 'cash', 'service', 'guaranteed', 'or', 'text', 'reply', 'urgent', 'stop', '!', 'to', 'won', 'prize', 'mobile', 'claim', 'free', 'txt', 'call']


In [93]:
print_diff_classifiers(frequency_matrix, y, selected_features_idx, select_method="FilterPearsonCorrelation")

Before FilterPearsonCorrelation:> Classifier: Decision Tree. Accuracy: 0.9721973094170404. 
Before FilterPearsonCorrelation:> Classifier: SVM. Accuracy: 0.9820627802690582. 
Before FilterPearsonCorrelation:> Classifier: KNN. Accuracy: 0.9282511210762332. 
After FilterPearsonCorrelation:> Classifier: Decision Tree. Accuracy: 0.9632286995515695. 
After FilterPearsonCorrelation:> Classifier: SVM. Accuracy: 0.968609865470852. 
After FilterPearsonCorrelation:> Classifier: KNN. Accuracy: 0.9623318385650225. 


Wrapper method:>

In [24]:
# check
class CustomRFE:
    def __init__(self, n_features_to_select: int = 30, step: int = 1) -> None:
        self.max_features: int = n_features_to_select
        self.step: int = step
        self.n_features_: tp.Optional[int] = None
        self.support_: np.ndarray = self._init_support_()
        self.rank_: np.ndarray = self._init_rank_()
        self.estimator_: DecisionTreeClassifier = DecisionTreeClassifier()

    def _init_rank_(self) -> np.ndarray:
        return np.ones(self.n_features_, dtype=int)

    def _init_support_(self) -> np.ndarray:
        return np.ones(self.n_features_, dtype=bool)

    def fit(self, X, y) -> np.ndarray:
        self.n_features_ = X.shape[1]
        self.support_ = self._init_support_()
        self.rank_ = self._init_rank_()

        while np.sum(self.support_) > self.max_features:
            self.estimator_.fit(X[:, self.support_], y)
            least_important = np.argsort(self.estimator_.feature_importances_)[:self.step]
            self._upd_features(least_important)

        self.rank_[self.support_] = 1
        return np.where(self.support_)[0]

    def _upd_features(self, least_important: np.ndarray) -> None:
        for idx in least_important:
            if idx < len(self.support_[np.where(self.support_)[0]]):
                self._upd_support_(idx)
                self._upd_rank_(idx)

    def _upd_rank_(self, idx: int) -> None:
        self.rank_[np.where(self.rank_)[0][idx]] = np.sum(self.rank_) + 1

    def _upd_support_(self, idx: int) -> None:
        self.support_[np.where(self.support_)[0][idx]] = False


In [25]:
frequency_matrix = frequency_matrix[:100, :]
y = y[:100]

In [28]:
rfe = CustomRFE(n_features_to_select=30)
rfe.fit(frequency_matrix, y)
rfe.rank_

array([2005494892,          1,          1, ...,          1,          1,
                1])

In [29]:
important_features = np.array(unique_words)[rfe.support_]
print("Best features by wrapper CustomRFE:", important_features.tolist())

Best features by wrapper CustomRFE: ['prize', 'praying.will', 'calls£1/minmobsmorelkpobox177hp51fl', '09058095201', 'goods', 'scallies', '£100', 'www.asjesus.com', 'lib', 'chaps', 'mad1', ')', 'comes', 'reply', 'to', 'i', 'inever', 'ltdhelpdesk', '44', '80086', 'mayb', 'spoons', 'leanne.what', 'none', 'a-', 'oclock', 'hannaford', 'etc', 'favourite', 'wap']


Embedded method:>

In [30]:
print_diff_classifiers(frequency_matrix, y, rfe.support_, select_method="CustomRFE")

Before CustomRFE:> Classifier: Decision Tree. Accuracy: 0.9. 
Before CustomRFE:> Classifier: SVM. Accuracy: 1.0. 
Before CustomRFE:> Classifier: KNN. Accuracy: 0.95. 
After CustomRFE:> Classifier: Decision Tree. Accuracy: 0.95. 
After CustomRFE:> Classifier: SVM. Accuracy: 0.9. 
After CustomRFE:> Classifier: KNN. Accuracy: 1.0. 


In [29]:
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None) -> None:
        self.feature: tp.Optional[int] = feature
        self.threshold: tp.Optional[float] = threshold
        self.left: tp.Optional[Node] = left
        self.right: tp.Optional[Node] = right
        self.value: tp.Optional[np.signedinteger] = value

    def is_leaf(self) -> bool:
        return self.value is not None


class CustomDecisionTree:
    def __init__(
            self,
            max_depth: tp.Optional[int] = None,
            min_samples_split: int = 2,
            min_samples_leaf: int = 1,
            max_features: tp.Optional[int] = None
    ) -> None:
        self.max_depth: tp.Optional[int] = max_depth
        self.min_samples_to_split: int = min_samples_split
        self.min_samples_leaf: int = min_samples_leaf
        self.max_features: tp.Optional[int] = max_features
        self.tree: tp.Optional[Node] = None
        self.feature_indices: tp.Optional[np.ndarray] = None
        self.feature_gains: tp.Optional[np.ndarray] = None

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        n_features = X.shape[1]
        y = np.asarray(y)
        self._init_features(n_features)
        self.tree = self._build_tree(X, y)
        self._update_features(n_features)

    def _init_features(self, n_features: int) -> None:
        self.feature_gains = np.zeros(n_features)

    def _build_tree(self, X: np.ndarray, y: np.ndarray, depth: int = 0) -> Node:
        n_samples, n_features = X.shape
        self.n_classes = len(np.unique(y))
        if self.max_depth and depth >= self.max_depth \
                or self.n_classes == 1 \
                or n_samples < self.min_samples_to_split:
            leaf_value: np.signedinteger = self._most_common_label(y)
            return Node(value=leaf_value)

        rand_features = np.random.choice(n_features, n_features, replace=False)
        best_feature, best_threshold = self._best_criteria(X, y, rand_features)
        left_idxs, right_idxs = self._split(X[:, best_feature], best_threshold)

        if (len(left_idxs) < self.min_samples_leaf
                or len(right_idxs) < self.min_samples_leaf):
            leaf_value: np.signedinteger = self._most_common_label(y)
            return Node(value=leaf_value)

        left = self._build_tree(X[left_idxs, :], y[left_idxs], depth + 1)
        right = self._build_tree(X[right_idxs, :], y[right_idxs], depth + 1)
        return Node(feature=best_feature, threshold=best_threshold, left=left, right=right)

    def _update_features(self, n_features: int) -> None:
        if self.max_features is not None:
            top_feature_indices = np.argsort(self.feature_gains)[-self.max_features:]
            self.feature_indices = np.sort(top_feature_indices)
        else:
            self.feature_indices = np.arange(n_features)

    def _best_criteria(self, X: np.ndarray, y: np.ndarray, features: np.ndarray) -> tuple[int, float]:
        best_gain = -1
        best_feature, best_threshold = None, None
        for feature in features:
            X_column_by_feature = X[:, feature]
            thresholds = np.unique(X_column_by_feature)
            for threshold in thresholds:
                gain = self._information_gain(y, X_column_by_feature=X_column_by_feature, split_thresh=threshold)
                if gain > best_gain:
                    self.feature_gains[feature] += gain
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold
        return best_feature, best_threshold

    def _information_gain(self, y, X_column_by_feature: np.ndarray, split_thresh: float) -> float:
        parent_entropy = self._calc_entropy(y)
        left_idxs, right_idxs = self._split(X_column_by_feature, split_thresh)
        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return -1
        n_total, n_left, n_right = len(y), len(left_idxs), len(right_idxs)
        entropy_left, entropy_right = self._calc_entropy(y[left_idxs]), self._calc_entropy(y[right_idxs])
        k_left, k_right = n_left / n_total, n_right / n_total
        child_entropy = k_left * entropy_left + k_right * entropy_right
        ig = parent_entropy - child_entropy
        return ig

    @staticmethod
    def _split(X_column_by_feature: np.ndarray, split_thresh: float) -> tuple[np.ndarray, np.ndarray]:
        left_idxs = np.argwhere(X_column_by_feature <= split_thresh).flatten()
        right_idxs = np.argwhere(X_column_by_feature > split_thresh).flatten()
        return left_idxs, right_idxs

    @staticmethod
    def _calc_entropy(y: np.ndarray) -> float:
        p_lst = np.bincount(y) / len(y)
        return -np.sum([p * np.log2(p) for p in p_lst if p > 0])

    @staticmethod
    def _most_common_label(y: np.ndarray) -> np.signedinteger:
        return np.bincount(y).argmax()

    def predict(self, X: np.ndarray) -> np.ndarray:
        return np.array([self._traverse_tree(x, self.tree) for x in X])

    def _traverse_tree(self, x, node) -> np.signedinteger:
        if node.is_leaf():
            return node.value
        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)


In [30]:
dt = CustomDecisionTree(
    max_depth=5,
    min_samples_split=4,
    min_samples_leaf=3,
    max_features=30
)
dt.fit(frequency_matrix, y)
selected_feature_indices = dt.feature_indices
selected_features = np.array(unique_words)[selected_feature_indices]
print("Best features by embedded CustomDecisionTreeClassifier:", selected_features.tolist())

Best features by embedded CustomDecisionTreeClassifier: ['stop', 'horo', '#', 'have', 'your', 'text', 'mobile', 'pix', 'or', 'ask', 'a', 'aight', 'call', 'not', 'i', 'me', '!', 'u', 'with', 'extra', 'its', 'later', 'to', 'free', 'for', 'prize', 'quite', 'txt', 'you', 'while']


In [31]:
print_diff_classifiers(frequency_matrix, y, selected_feature_indices, select_method="CustomDecisionTreeClassifier")

Before CustomDecisionTreeClassifier:> Classifier: Decision Tree. Accuracy: 0.9721973094170404. 
Before CustomDecisionTreeClassifier:> Classifier: SVM. Accuracy: 0.9820627802690582. 
Before CustomDecisionTreeClassifier:> Classifier: KNN. Accuracy: 0.9282511210762332. 
After CustomDecisionTreeClassifier:> Classifier: Decision Tree. Accuracy: 0.957847533632287. 
After CustomDecisionTreeClassifier:> Classifier: SVM. Accuracy: 0.9641255605381166. 
After CustomDecisionTreeClassifier:> Classifier: KNN. Accuracy: 0.9623318385650225. 
