In [None]:
! pip install gdown
! gdown --no-check-certificate --folder 'https://drive.google.com/drive/folders/1OhNOXveVDasgQcjurKxJ2-Q8hhH9JxY8?usp=sharing'

Retrieving folder list
Processing file 1KXCLKv2W4Es_f0ULarQjpdUNG51WEVhe shuffled_news.csv
Retrieving folder list completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1KXCLKv2W4Es_f0ULarQjpdUNG51WEVhe
To: /content/data/shuffled_news.csv
100% 15.0M/15.0M [00:00<00:00, 116MB/s] 
Download completed


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re

In [None]:
!pip install hazm

Collecting hazm
  Downloading hazm-0.9.4-py3-none-any.whl (371 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m371.7/371.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fasttext-wheel<0.10.0,>=0.9.2 (from hazm)
  Downloading fasttext_wheel-0.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting flashtext<3.0,>=2.7 (from hazm)
  Downloading flashtext-2.7.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting numpy==1.24.3 (from hazm)
  Downloading numpy-1.24.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m50.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-crfsuite<0.10.0,>=0.9.9 (from hazm)
  Downloading python_crfsuite-0.9.9-cp310-cp310-manylinux_2_17_x86_64.m

In [None]:
from hazm import Normalizer, WordTokenizer, stopwords_list

In [None]:
class DataPreprocessor:
    """Load the news CSV and clean its text for a bag-of-words classifier.

    Cleaning relies on the hazm ``Normalizer`` and ``WordTokenizer`` plus the
    hazm stop-word list imported at the top of the notebook.
    """

    # Latin letters, ASCII and Persian digits, and common punctuation that
    # clean_text() removes. Compiled once instead of on every call.
    _NOISE_PATTERN = re.compile(r'[a-zA-Z0-9۰-۹;:.,،()«»]+')

    def __init__(self):
        # DataFrame filled in by read_data(); None until then.
        self.data = None
        self.normalizer = Normalizer()
        self.tokenizer = WordTokenizer()
        # Kept as a set so the per-token membership test in clean_text() is
        # a constant-time lookup rather than a scan of the whole list.
        self.stop_words = set(stopwords_list())

    def read_data(self, file_path):
        """Read the CSV at ``file_path`` into ``self.data``."""
        self.data = pd.read_csv(file_path)

    def split_data(self, test_size=0.2, random_state=42):
        """Split ``self.data`` into train and test parts.

        Args:
            test_size: Fraction of rows placed in the test split.
            random_state: Seed forwarded to ``train_test_split`` so the split
                is reproducible.

        Returns:
            ``(X_train, X_test, y_train, y_test)`` where X comes from the
            ``'Text'`` column and y from the ``'Topic'`` column.
        """
        X = self.data['Text']
        y = self.data['Topic']
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
        return X_train, X_test, y_train, y_test

    def clean_text(self, text):
        """Return ``text`` stripped of noise characters and stop words.

        Args:
            text: Raw text. Any non-string value (for example the NaN pandas
                produces for an empty CSV cell, or None) is treated as
                missing.

        Returns:
            The remaining tokens joined by single spaces; ``''`` when the
            input is missing or nothing survives cleaning.
        """
        # Missing cells arrive as float NaN / None; re.sub would raise on them.
        if not isinstance(text, str):
            return ''

        text = self._NOISE_PATTERN.sub('', text)
        text = self.normalizer.normalize(text)
        tokens = self.tokenizer.tokenize(text)

        stop_words = self.stop_words
        cleaned_text = ' '.join(token for token in tokens if token not in stop_words)
        return cleaned_text.strip()


In [None]:
# Smoke test: clean one Persian sentence and print what is left after
# noise and stop-word removal.
preprocessor = DataPreprocessor()
sample_sentence = ' این یک تست برای کلمات ایست در زبان فارسی است'
cleaned_text_example = preprocessor.clean_text(sample_sentence)
print(cleaned_text_example)

تست کلمات ایست زبان فارسی


In [None]:
from collections import defaultdict, Counter
from math import log
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [None]:
class NaiveBayesClassifier:
    """Multinomial Naive Bayes over whitespace-separated tokens.

    Intended call order: ``count_word_per_class`` -> ``calculate_log_prior``
    -> ``calculate_word_log_prob_per_class`` -> ``predict`` / ``evaluate``.
    """

    def __init__(self):
        # class -> Counter(word -> occurrences). A Counter returns 0 for an
        # unseen word without inserting it, so lookups never grow the table.
        self.word_counts = defaultdict(Counter)
        # class -> number of training documents.
        self.class_counts = defaultdict(int)
        # class -> total number of word occurrences.
        self.total_words = defaultdict(int)
        # Every distinct word seen during counting.
        self.vocab = set()
        # class -> log prior probability.
        self.class_log_prior = defaultdict(float)
        # class -> word -> smoothed log likelihood.
        self.word_log_prob = defaultdict(lambda: defaultdict(float))

    def count_word_per_class(self, data, top_n=200, verbose=True):
        """Accumulate document and word counts from a training frame.

        Args:
            data: DataFrame with a ``'Text'`` column (space-separated tokens)
                and a ``'Topic'`` column (class label).
            top_n: How many of the most frequent words to print per class.
            verbose: When True, print per-class totals and the top words.
        """
        # Column iteration avoids building a Series per row as iterrows does.
        for cls, text in zip(data['Topic'], data['Text']):
            words = text.split()
            self.class_counts[cls] += 1
            self.word_counts[cls].update(words)
            self.vocab.update(words)

        # Iterate over class_counts so classes whose documents are all empty
        # still get a (zero) total instead of being silently skipped.
        for cls in self.class_counts:
            counts = self.word_counts[cls]
            self.total_words[cls] = sum(counts.values())
            if not verbose:
                continue
            unique_words_count = len(counts)
            print(f"Class '{cls}' - Total Words: {self.total_words[cls]}, Unique Words: {unique_words_count}")
            print(f"Top {top_n} words in class {cls}:")
            for word, count in counts.most_common(top_n):
                print(f'{word}: {count}')
            print("\n")

    def calculate_word_log_prob_per_class(self, alpha=1):
        """Compute smoothed log likelihoods for every (class, vocab word) pair.

        Uses ``log((count + alpha) / (total_words_in_class + alpha * |vocab|))``.

        Args:
            alpha: Additive smoothing constant; must be positive so that
                unseen words do not produce ``log(0)``.
        """
        vocab_size = len(self.vocab)
        # Every counted class gets probabilities, including one with no
        # words; otherwise predict() would read the defaultdict's 0.0
        # (probability 1) for that class and it would always win.
        for cls in self.class_counts:
            # .get() does not trigger the defaultdict factory.
            counts = self.word_counts.get(cls, {})
            denominator = self.total_words[cls] + alpha * vocab_size
            class_log_prob = self.word_log_prob[cls]
            for word in self.vocab:
                # .get() keeps zero-count words out of word_counts.
                class_log_prob[word] = log((counts.get(word, 0) + alpha) / denominator)

    def calculate_log_prior(self):
        """Set each class's log prior to log(documents in class / all documents)."""
        total_samples = sum(self.class_counts.values())
        for cls, n_docs in self.class_counts.items():
            self.class_log_prior[cls] = log(n_docs / total_samples)

    def predict(self, text, verbose=True):
        """Return the class with the highest log posterior score for ``text``.

        Words outside the training vocabulary are ignored.

        Args:
            text: Space-separated tokens.
            verbose: When True, print the per-class scores and the winner.

        Raises:
            ValueError: If no classes have been counted yet.
        """
        if not self.class_counts:
            raise ValueError("predict() called before any training data was counted")

        known_words = [word for word in text.split() if word in self.vocab]
        scores = defaultdict(float)
        for cls in self.class_counts:
            class_log_prob = self.word_log_prob[cls]
            score = self.class_log_prior[cls]
            for word in known_words:
                score += class_log_prob[word]
            scores[cls] = score

        max_class = max(scores, key=scores.get)
        if verbose:
            print("Scores per class:", scores)
            print("Predicted class:", max_class)
        return max_class

    def evaluate(self, true_labels, predicted_labels):
        """Print the confusion matrix, accuracy and classification report."""
        cm = confusion_matrix(true_labels, predicted_labels)
        print("Confusion Matrix:\n", cm)

        accuracy = accuracy_score(true_labels, predicted_labels)
        print("Accuracy:", accuracy)

        report = classification_report(true_labels, predicted_labels)
        print(report)

In [None]:
# Load the downloaded news CSV; the frame is stored on preprocessor.data.
news_csv_path = '/content/data/shuffled_news.csv'
preprocessor.read_data(file_path=news_csv_path)

In [None]:
# Hold out 25% of the rows for testing. Going through
# DataPreprocessor.split_data (which passes random_state=42) makes the split,
# and therefore every metric below, reproducible across runs.
X_train, X_test, y_train, y_test = preprocessor.split_data(test_size=0.25)

In [None]:
# Run the same cleaning step over the training texts, then the test texts.
X_train_clean, X_test_clean = (
    split.apply(preprocessor.clean_text) for split in (X_train, X_test)
)

In [None]:
# Pair each cleaned training text with its label (aligned on the shared index).
training_data = X_train_clean.to_frame(name='Text').assign(Topic=y_train)

# Fit: count words per class, then derive the priors and word likelihoods.
classifier = NaiveBayesClassifier()
classifier.count_word_per_class(training_data)
classifier.calculate_log_prior()
classifier.calculate_word_log_prob_per_class()

Class 'Tech' - Total Words: 231265, Unique Words: 5940
Top 200 words in class Tech:
فناوری: 2946
شرکت: 2714
محققان: 2005
کشور: 1940
تولید: 1841
استفاده: 1822
ارتباطات: 1593
خبر: 1499
ایران: 1456
فضایی: 1431
اطلاعات: 1414
سال: 1297
علمی: 1176
توسعه: 1140
سازمان: 1114
دانش: 1075
حوزه: 1043
رئیس: 990
دانشگاه: 982
معاون: 950
وزیر: 924
موفق: 882
بنیان: 849
کرده‌اند: 843
ملی: 821
قرار: 814
اینترنت: 805
علوم: 802
موبایل: 746
برگزار: 732
اعلام: 694
مرکز: 662
معاونت: 662
ستاد: 655
زمین: 627
آمریکا: 625
گوگل: 609
بازار: 603
نشان: 596
نوآوری: 589
شرکت‌های: 583
دبیر: 562
ابداع: 557
ساخت: 555
ارائه: 548
هدف: 542
بررسی: 536
فناوری‌های: 513
ایرانی: 504
هزار: 486
کاربران: 484
شبکه: 478
طراحی: 477
منتشر: 476
توسط: 474
انجام: 474
کار: 473
میلیون: 471
مصنوعی: 465
همکاری: 461
ایجاد: 460
جمهوری: 459
تهران: 456
کمک: 451
نام: 451
مختلف: 449
وزارت: 439
جهان: 438
پژوهشگاه: 431
صنعت: 431
همراه: 429
شناسایی: 411
حمایت: 407
هوشمند: 404
آب: 400
ماهواره: 399
بین‌المللی: 399
روز: 398
ماه: 397
هفته: 396
پژوهش: 388
قص

In [None]:
# Classify every cleaned test document; predict() also prints its scores.
predicted_labels = X_test_clean.map(classifier.predict)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Scores per class: defaultdict(<class 'float'>, {'Tech': -141.939949145824, 'Sport': -79.1091349513306, 'Economy': -145.3396484396404})
Predicted class: Sport
Scores per class: defaultdict(<class 'float'>, {'Tech': -83.94527791105122, 'Sport': -118.84034432558228, 'Economy': -98.54121491661962})
Predicted class: Tech
Scores per class: defaultdict(<class 'float'>, {'Tech': -192.84041944614447, 'Sport': -224.24089393082423, 'Economy': -172.19651616006985})
Predicted class: Economy
Scores per class: defaultdict(<class 'float'>, {'Tech': -148.28005714593542, 'Sport': -119.86432150379417, 'Economy': -154.33250440464934})
Predicted class: Sport
Scores per class: defaultdict(<class 'float'>, {'Tech': -128.141410333565, 'Sport': -90.44361317632561, 'Economy': -120.09006472812088})
Predicted class: Sport
Scores per class: defaultdict(<class 'float'>, {'Tech': -208.15522346237853, 'Sport': -184.6144866584607, 'Economy': -199.4647653

In [None]:
# Print the confusion matrix, accuracy and per-class report for the test split.
classifier.evaluate(true_labels=y_test, predicted_labels=predicted_labels)

Confusion Matrix:
 [[4978    0   16]
 [   3 5036    1]
 [  68    4 4902]]
Accuracy: 0.9938699360341151
              precision    recall  f1-score   support

     Economy       0.99      1.00      0.99      4994
       Sport       1.00      1.00      1.00      5040
        Tech       1.00      0.99      0.99      4974

    accuracy                           0.99     15008
   macro avg       0.99      0.99      0.99     15008
weighted avg       0.99      0.99      0.99     15008



In [None]:
# One row per test document: true label, predicted label and cleaned text.
results_df = pd.DataFrame({
    'Actual': y_test,
    'Predicted': predicted_labels,
    'Headline': X_test_clean
}).reset_index(drop=True)

misclassified = results_df[results_df['Actual'] != results_df['Predicted']]

# Take the class names from the data instead of hard-coding them: the
# previous literal list used 'Sports' while the dataset label is 'Sport', so
# that class always produced an empty frame.
classes = sorted(results_df['Actual'].unique())
misclassified_analysis = {}

for cls in classes:
    misclassified_cls = misclassified[misclassified['Actual'] == cls]
    # Up to five examples per class; fixed seed so the sample is repeatable.
    samples = misclassified_cls.sample(n=min(5, len(misclassified_cls)), random_state=42)
    misclassified_analysis[cls] = samples

for cls, samples in misclassified_analysis.items():
    print(f"Misclassified samples for class '{cls}':")
    print(samples)
    print("\n")


Misclassified samples for class 'Sports':
Empty DataFrame
Columns: [Actual, Predicted, Headline]
Index: []


Misclassified samples for class 'Tech':
      Actual Predicted                                           Headline
423     Tech   Economy  دولت لایحه بودجه سال درآمد حاصل نامبرینگ شماره...
12606   Tech   Economy  هزار کارت بانکی هفته فرآیند ثبت‌نام الکترونیکی...
3998    Tech   Economy  وزیر علوم تکیه فناوری می‌توانیم بازار جهانی بد...
62      Tech   Economy  رئیس کارگروه فناوری‌های پیشران اتاق بازرگانی ا...
5739    Tech     Sport  رئیس کمیته ملی ربوکاپ ایران برگزاری دومین دوره...


Misclassified samples for class 'Economy':
        Actual Predicted                                           Headline
296    Economy      Tech  مدیر توسعه کارآفرینی بهره‌وری نیروی کار وزارت ...
2731   Economy      Tech  مدیر توسعه کارآفرینی بهره‌وری نیروی کار وزارت ...
4563   Economy      Tech  معاون علمی فناوری رییس‌جمهور دنبال طراحی خودرو...
13347  Economy      Tech  رئیس موسسه تحقیقات آب ساخت رسوب 