In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier

# Training corpus: each entry pairs a document's text with its class label.
documents = [
    ("free free free buy discount combo pleasure", "S"),
    ("free free free discount pleasure smile smile smile", "S"),
    ("cat mouse", "N"),
    ("cat cat dog dog dog dog", "N"),
    ("mouse", "N"),
]

# Unlabelled documents whose class is predicted further down.
test_documents = ["dog cat mouse cat", "Free free smile"]

# Compute mutual-information (MI) scores for the words in the labelled documents
def calculate_mi(documents):
    """Rank words by a mutual-information style score against the class labels.

    Each document is split on whitespace and a word is counted at most once
    per document (document presence, not term frequency).  For every
    (word, label) pair that co-occurs in at least one document the term
    ``P(w, c) * log2(P(w, c) / (P(w) * P(c)))`` is added to the word's score.
    Only these word-present terms are summed; pairs that never co-occur and
    the word-absent cases contribute nothing.

    Args:
        documents: Sequence of ``(text, label)`` pairs.

    Returns:
        A list of ``(word, score)`` tuples sorted by descending score.  Words
        with equal scores are ordered alphabetically, so the ranking does not
        depend on set iteration order (which varies with string-hash
        randomisation between interpreter runs).  An empty input yields an
        empty list.
    """
    total_docs = len(documents)

    # Tally, in a single pass, how many documents carry each label and in how
    # many documents of each label every word appears.
    label_doc_counts = {}
    word_counts = {}
    for doc, label in documents:
        label_doc_counts[label] = label_doc_counts.get(label, 0) + 1
        for word in set(doc.split()):
            per_label = word_counts.setdefault(word, {})
            per_label[label] = per_label.get(label, 0) + 1

    mi_scores = {}
    for word, counts in word_counts.items():
        p_w = sum(counts.values()) / total_docs
        for label, count in counts.items():
            p_w_c = count / total_docs
            p_c = label_doc_counts[label] / total_docs
            mi = p_w_c * np.log2(p_w_c / (p_w * p_c))
            mi_scores[word] = mi_scores.get(word, 0) + mi

    # Highest score first; the word itself is the tie-breaker so that equal
    # scores always come out in the same order.
    return sorted(mi_scores.items(), key=lambda item: (-item[1], item[0]))

# Rank the training-corpus words by MI score and keep the two best as features.
mi_scores = calculate_mi(documents)
top_features = [entry[0] for entry in mi_scores[:2]]
print("Top Features:", top_features)

# Split the labelled corpus into its texts and its labels.
corpus = [text for text, _ in documents]
train_labels = [tag for _, tag in documents]

# Fit TF-IDF on the training texts, restricted to the selected feature words.
tfidf_vectorizer = TfidfVectorizer(vocabulary=top_features)
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Dense training matrix: one row per training document.
train_data_matrix = tfidf_matrix.toarray()
print("Training Data Matrix:\n", train_data_matrix)

# Vectorize the test documents with the vectorizer fitted above.
test_tfidf_matrix = tfidf_vectorizer.transform(test_documents).toarray()
print("Test Data TF*IDF Vectors:\n", test_tfidf_matrix)

# Train a 1-nearest-neighbour classifier on the TF-IDF rows.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(train_data_matrix, train_labels)

# Predict a class label for every test document.
predictions = knn.predict(test_tfidf_matrix)
print("Predicted Class Labels for Test Data:", predictions)


Top Features: ['discount', 'free']
Training Data Matrix:
 [[0.31622777 0.9486833 ]
 [0.31622777 0.9486833 ]
 [0.         0.        ]
 [0.         0.        ]
 [0.         0.        ]]
Test Data TF*IDF Vectors:
 [[0. 0.]
 [0. 1.]]
Predicted Class Labels for Test Data: ['N' 'S']
