In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import nltk
from nltk.tokenize import word_tokenize
import stopwordsiso as stopwords
import numpy as np

In [2]:
# Stop words for Malayalam (ISO code "ml") from stopwordsiso, materialised as a
# plain list so the same object can be handed to APIs that expect a list.
stop_words = [*stopwords.stopwords("ml")]

In [3]:
# Load and preprocess data
# The dataset directory is defined once; edit this single line to point the
# notebook at another copy of the data.
DATA_DIR = "D:/Dataset/Fake_News_Dataset_Malayalam"

# Label strings are matched case-sensitively, exactly as spelled here.
LABEL_MAP = {'original': 1, 'Fake': 0}


def _encode_labels(labels, source):
    """Convert label strings to integers, failing fast on unknown labels.

    Parameters
    ----------
    labels : pandas.Series
        Raw label column read from a CSV file.
    source : str
        Path of the file the labels came from; used only in the error message.

    Returns
    -------
    pandas.Series
        ``labels`` mapped through ``LABEL_MAP``.

    Raises
    ------
    ValueError
        If any label (including a missing one) is not a key of ``LABEL_MAP``.
        ``Series.map`` would otherwise turn such rows into NaN without warning.
    """
    encoded = labels.map(LABEL_MAP)
    unmapped = labels[encoded.isna()]
    if not unmapped.empty:
        raise ValueError(
            f"{len(unmapped)} label(s) in {source} are not in {sorted(LABEL_MAP)}: "
            f"{unmapped.unique().tolist()}"
        )
    return encoded


all_news = f"{DATA_DIR}/mal_fake_train.csv"

train_set = pd.read_csv(all_news)
X_train = train_set["text"]
Y_train = _encode_labels(train_set["label"], all_news)

test_news = f"{DATA_DIR}/mal_fake_test_with_labels.csv"

test_set = pd.read_csv(test_news)
X_test = test_set["text"]
Y_test = _encode_labels(test_set["label"], test_news)

In [4]:
# Frozen set view of `stop_words`: membership tests become hash lookups instead
# of a scan over the whole list for every token. It is a snapshot, so re-run
# this cell if `stop_words` is rebuilt.
_STOP_WORD_SET = frozenset(stop_words)


def preprocess(text):
    """Tokenise one document and drop stop words.

    Parameters
    ----------
    text : str
        Raw document text. Any non-string value (for example the NaN pandas
        produces for an empty CSV cell) is treated as an empty document.

    Returns
    -------
    list of str
        Tokens produced by ``word_tokenize``, in their original order, with
        every token that appears in ``stop_words`` removed.
    """
    if not isinstance(text, str):
        # word_tokenize only accepts strings; a missing text yields no tokens.
        return []
    return [word for word in word_tokenize(text) if word not in _STOP_WORD_SET]

# Token lists (stop words removed) for every training and test document.
X_train_tokens, X_test_tokens = (
    texts.apply(preprocess) for texts in (X_train, X_test)
)


In [6]:
# KNN, BoW
# Disabled experiment: everything below is wrapped in a triple-quoted string,
# so running this cell only evaluates (and echoes) that string. Nothing is
# fitted and `y_pred` is NOT assigned. Remove the surrounding quotes to run it.
# When enabled, the code fits a CountVectorizer (using `stop_words`) on the raw
# training text `X_train`, transforms `X_test` with the same vocabulary, trains
# a 3-nearest-neighbour classifier on `Y_train` and stores the test predictions
# in `y_pred`.
"""
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words=stop_words)
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_bow, Y_train)
y_pred = knn.predict(X_test_bow)
"""

'\nfrom sklearn.feature_extraction.text import CountVectorizer\nvectorizer = CountVectorizer(stop_words=stop_words)\nX_train_bow = vectorizer.fit_transform(X_train)\nX_test_bow = vectorizer.transform(X_test)\nknn = KNeighborsClassifier(n_neighbors=3)\nknn.fit(X_train_bow, Y_train)\ny_pred = knn.predict(X_test_bow)\n'

In [7]:
# KNN, TF-IDF
# Disabled experiment: everything below is wrapped in a triple-quoted string,
# so running this cell only evaluates (and echoes) that string. Nothing is
# fitted and `y_pred` is NOT assigned. Remove the surrounding quotes to run it.
# When enabled, the code fits a TfidfVectorizer (using `stop_words`) on the raw
# training text `X_train`, transforms `X_test` with the same vocabulary, trains
# a 3-nearest-neighbour classifier on `Y_train` and stores the test predictions
# in `y_pred`.
"""
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words=stop_words)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_tfidf, Y_train)
y_pred = knn.predict(X_test_tfidf)
"""

'\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nvectorizer = TfidfVectorizer(stop_words=stop_words)\nX_train_tfidf = vectorizer.fit_transform(X_train)\nX_test_tfidf = vectorizer.transform(X_test)\nknn = KNeighborsClassifier(n_neighbors=3)\nknn.fit(X_train_tfidf, Y_train)\ny_pred = knn.predict(X_test_tfidf)\n'

In [8]:
# KNN, Word2Vec
# Disabled experiment: everything below is wrapped in a triple-quoted string,
# so running this cell only evaluates (and echoes) that string. Nothing is
# trained and `y_pred` is NOT assigned. Remove the surrounding quotes to run it.
# When enabled, the code trains a 100-dimensional Word2Vec model on the
# tokenised training documents (`X_train_tokens`), represents each document as
# the mean of the vectors of its in-vocabulary tokens (a zero vector when none
# are known), trains a 3-nearest-neighbour classifier and stores the test
# predictions in `y_pred`.
# NOTE(review): no seed is passed to Word2Vec and workers=4 is used, so results
# may differ between runs — confirm whether reproducibility matters here.
"""
from gensim.models import Word2Vec
word2vec_model = Word2Vec(sentences=X_train_tokens, vector_size=100, window=5, min_count=1, workers=4)
def document_vector(doc):
    doc = [word for word in doc if word in word2vec_model.wv.key_to_index]
    if len(doc) == 0:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(word2vec_model.wv[doc], axis=0)
X_train_w2v = np.array([document_vector(doc) for doc in X_train_tokens])
X_test_w2v = np.array([document_vector(doc) for doc in X_test_tokens])
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_w2v, Y_train)
y_pred = knn.predict(X_test_w2v)
"""

'\nfrom gensim.models import Word2Vec\nword2vec_model = Word2Vec(sentences=X_train_tokens, vector_size=100, window=5, min_count=1, workers=4)\ndef document_vector(doc):\n    doc = [word for word in doc if word in word2vec_model.wv.key_to_index]\n    if len(doc) == 0:\n        return np.zeros(word2vec_model.vector_size)\n    return np.mean(word2vec_model.wv[doc], axis=0)\nX_train_w2v = np.array([document_vector(doc) for doc in X_train_tokens])\nX_test_w2v = np.array([document_vector(doc) for doc in X_test_tokens])\nknn = KNeighborsClassifier(n_neighbors=3)\nknn.fit(X_train_w2v, Y_train)\ny_pred = knn.predict(X_test_w2v)\n'

In [5]:
# KNN, FastText (this cell was labelled "Glove"; the variables keep the
# *_glove suffix, but the vectors loaded are fastText's cc.ml.300.vec).
# Disabled experiment: everything below is wrapped in a triple-quoted string,
# so running this cell only evaluates that string. No embeddings are loaded
# and `y_pred` is NOT assigned. Remove the surrounding quotes to run it.
# When enabled, the code reads the text-format embedding file into a
# word -> float32 vector dict, represents each tokenised document as the mean
# of the vectors of its known tokens (a 300-dimensional zero vector when none
# are known), trains a 3-nearest-neighbour classifier and stores the test
# predictions in `y_pred`.
# NOTE(review): the loader parses every line as "word v1 v2 ...". fastText
# .vec files normally start with a "<count> <dim>" header line, which would be
# stored as a word with a 1-element vector (the recorded progress output shows
# 2000001 lines read) — confirm and skip the header if so.
"""
from tqdm import tqdm
# Load pre-trained FastText embeddings
def load_fasttext_embeddings(fasttext_file):
    embeddings_index = {}
    with open(fasttext_file, encoding='utf-8') as f:
        for line in tqdm(f, "Reading FastText"):
            values = line.rstrip().split(' ')
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

fasttext_file = 'D:/cc.ml.300.vec'
# https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ml.300.vec.gz for malayalam glove file
embeddings_index = load_fasttext_embeddings(fasttext_file)

# Function to get average FastText embeddings for a document
def document_vector(doc):
    doc = [word for word in doc if word in embeddings_index]
    if len(doc) == 0:
        return np.zeros(300)  # Ensure the return vector is of size 300
    return np.mean([embeddings_index[word] for word in doc], axis=0)

X_train_glove = np.array([document_vector(doc) for doc in X_train_tokens])
X_test_glove = np.array([document_vector(doc) for doc in X_test_tokens])

# Train KNN classifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_glove, Y_train)

# Predict and evaluate
y_pred = knn.predict(X_test_glove)
"""

Reading FastText: 2000001it [03:37, 9201.02it/s] 


In [7]:
# Evaluate the classifier
# Scores the predictions in `y_pred` against the integer test labels `Y_test`.
# `y_pred` is only assigned inside the KNN experiment cells, so one of them
# must have been executed (un-quoted) before this cell is run.
accuracy = accuracy_score(Y_test, y_pred)
conf_matrix = confusion_matrix(Y_test, y_pred)
report = classification_report(Y_test, y_pred)

# One print call; each item lands on its own line, matching separate prints.
print(
    f'Accuracy: {accuracy}',
    'Classification Report:',
    report,
    'Confusion Matrix:',
    conf_matrix,
    sep='\n',
)

Accuracy: 0.6732090284592738
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.76      0.70       507
           1       0.71      0.59      0.64       512

    accuracy                           0.67      1019
   macro avg       0.68      0.67      0.67      1019
weighted avg       0.68      0.67      0.67      1019

Confusion Matrix:
[[385 122]
 [211 301]]
