##   Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

## Reading Data

In [2]:
train = pd.read_csv("train.csv")
val = pd.read_csv("val.csv")
test = pd.read_csv("test.csv")

x_train = train["content"]
y_train = train["label"]

x_val = val["content"]
y_val = val["label"]

x_test = test["content"]
y_test = test["label"]

In [11]:
train.head()

Unnamed: 0,content,label
0,به گزارش خبرنگار حوزه بهداشت و درمان گروه علمی...,7
1,به گزارش خبرنگار فوتبال و فوتسال گروه ورزشی با...,6
2,بهروز اکرمی، در گفتگو با خبرنگار اجتماعی باشگا...,0
3,به گزارش خبرنگار حوزه شهری گروه اجتماعی باشگاه...,0
4,به گزارش باشگاه خبرنگاران و به نقل از روابط عم...,5


In [3]:
from hazm import Normalizer, WordTokenizer, stopwords_list, Lemmatizer
import regex as re
import string

normalizer1 = Normalizer()

# To-Do
def preprocessing(input):
    punctoation = [".", ",", "(", "[", "/", "{", "|", "@", "?", ",", "!", ".", ";", "#"]
    output = input.translate(str.maketrans("", "", string.punctuation))
    output = " ".join(output.split())
    output = re.sub(r"\d+", "", output)
    output = output.translate({ord(x): "" for x in punctoation})
    output = normalizer1.normalize(output)
    return output

In [4]:
from hazm import *

stopwords = stopwords_list()
word_tokenizer = WordTokenizer()
lemmatizer = Lemmatizer()
# persian_stopwords = stopwords_output("Persian", "nar")


def tokenizer(text):
    filtered = []
    token_first = word_tokenizer.tokenize(text)
    for word in token_first:
        if word not in stopwords:
            filtered.append(lemmatizer.lemmatize(word))
    return filtered

### TF IDF


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

#tfidf_train = TfidfVectorizer(tokenizer=tokenizer, preprocessor=preprocessing)

tfidf_train = TfidfVectorizer()


x_train_tfidf = tfidf_train.fit_transform(x_train)
x_val_tfidf = tfidf_train.transform(x_val)
x_test_tfidf = tfidf_train.transform(x_test)

### BOW

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

bow_train = CountVectorizer()

x_train_bow = bow_train.fit_transform(x_train)
x_val_bow = bow_train.transform(x_val)
x_test_bow = bow_train.transform(x_test)

## Random Forest

### TF IDF


In [8]:
from sklearn.ensemble import RandomForestClassifier

rf_tfidf = RandomForestClassifier(random_state=0)
rf_tfidf.fit(x_train_tfidf, y_train)

y_val_predict = rf_tfidf.predict(x_val_tfidf)
y_test_predict = rf_tfidf.predict(x_test_tfidf)

In [9]:
from sklearn.metrics import f1_score

print("Score Validation : ",np.round(f1_score(y_val_predict, y_val, average="weighted"),2))
print("Score Test : ",np.round(f1_score(y_test_predict, y_test, average="weighted"),2))


Score Validation :  0.9
Score Test :  0.91


In [10]:
from sklearn.metrics import classification_report
print(classification_report(y_val_predict, y_val))

              precision    recall  f1-score   support

           0       0.87      0.91      0.89       187
           1       0.75      0.94      0.83       113
           2       0.95      0.93      0.94       181
           3       0.87      0.80      0.83       220
           4       0.95      0.90      0.92       230
           5       0.93      0.94      0.93       228
           6       1.00      0.97      0.98       128
           7       0.89      0.87      0.88       193

    accuracy                           0.90      1480
   macro avg       0.90      0.91      0.90      1480
weighted avg       0.91      0.90      0.90      1480



### BOW


In [11]:
rf_bow = RandomForestClassifier(random_state=0)
rf_bow.fit(x_train_bow, y_train)
y_val_predict_bow = rf_bow.predict(x_val_bow)
y_test_predict_bow  = rf_bow.predict(x_test_bow)

In [12]:
from sklearn.metrics import f1_score

print("Score Validation : ",np.round(f1_score(y_val_predict_bow, y_val, average="weighted"),2))
print("Score Test : ",np.round(f1_score(y_test_predict_bow, y_test, average="weighted"),2))


Score Validation :  0.91
Score Test :  0.91


In [13]:
from sklearn.metrics import classification_report
print(classification_report(y_val_predict_bow, y_val))

              precision    recall  f1-score   support

           0       0.83      0.92      0.87       178
           1       0.80      0.95      0.87       119
           2       0.95      0.93      0.94       181
           3       0.87      0.81      0.84       219
           4       0.95      0.91      0.93       227
           5       0.95      0.94      0.94       233
           6       1.00      0.98      0.99       127
           7       0.91      0.87      0.89       196

    accuracy                           0.91      1480
   macro avg       0.91      0.91      0.91      1480
weighted avg       0.91      0.91      0.91      1480



## Decision Tree

### TF IDF

In [14]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=0)

dt.fit(x_train_tfidf, y_train)

y_val_predict_dt = dt.predict(x_val_tfidf)
y_test_predict_dt = dt.predict(x_test_tfidf)


In [15]:
from sklearn.metrics import f1_score

print("Score Validation : ",np.round(f1_score(y_val_predict_dt, y_val, average="weighted"),2))
print("Score Test : ",np.round(f1_score(y_test_predict_dt, y_test, average="weighted"),2))

Score Validation :  0.83
Score Test :  0.84


In [16]:
from sklearn.metrics import classification_report

print(classification_report(y_val_predict_dt, y_val))

              precision    recall  f1-score   support

           0       0.81      0.78      0.79       205
           1       0.78      0.83      0.80       133
           2       0.87      0.85      0.86       183
           3       0.76      0.80      0.78       194
           4       0.93      0.91      0.92       223
           5       0.78      0.82      0.80       219
           6       0.89      0.87      0.88       127
           7       0.86      0.82      0.84       196

    accuracy                           0.83      1480
   macro avg       0.83      0.83      0.83      1480
weighted avg       0.83      0.83      0.83      1480



### BOW


In [17]:
dt_bow = DecisionTreeClassifier(random_state=0)
dt_bow.fit(x_train_bow, y_train)
y_val_predict_bow_dt = dt_bow.predict(x_val_bow)
y_test_predict_bow_dt  = dt_bow.predict(x_test_bow)

In [18]:
from sklearn.metrics import f1_score

print("Score Validation : ",np.round(f1_score(y_val_predict_bow_dt, y_val, average="weighted"),2))
print("Score Test : ",np.round(f1_score(y_test_predict_bow_dt, y_test, average="weighted"),2))


Score Validation :  0.85
Score Test :  0.85


In [19]:
from sklearn.metrics import classification_report
print(classification_report(y_val_predict_bow_dt, y_val))

              precision    recall  f1-score   support

           0       0.78      0.79      0.79       193
           1       0.82      0.82      0.82       141
           2       0.90      0.90      0.90       178
           3       0.80      0.78      0.79       210
           4       0.95      0.92      0.94       226
           5       0.80      0.86      0.83       216
           6       0.93      0.89      0.91       129
           7       0.85      0.85      0.85       187

    accuracy                           0.85      1480
   macro avg       0.85      0.85      0.85      1480
weighted avg       0.85      0.85      0.85      1480



## SVM

In [20]:
from sklearn.svm import SVC

svm = SVC()

svm.fit(x_train_tfidf, y_train)
y_val_predict_svm = svm.predict(x_val_tfidf)
y_test_predict_svm = svm.predict(x_test_tfidf)

In [21]:
from sklearn.metrics import f1_score

print("Score Validation : ",np.round(f1_score(y_val_predict_svm, y_val, average="weighted"),2))
print("Score Test : ",np.round(f1_score(y_test_predict_svm, y_test, average="weighted"),2))

Score Validation :  0.89
Score Test :  0.9


In [22]:
from sklearn.metrics import classification_report

print(classification_report(y_val_predict_svm, y_val))

              precision    recall  f1-score   support

           0       0.85      0.88      0.87       190
           1       0.87      0.85      0.86       145
           2       0.94      0.92      0.93       181
           3       0.83      0.88      0.86       193
           4       0.89      0.92      0.90       213
           5       0.96      0.92      0.94       239
           6       0.98      0.97      0.98       126
           7       0.85      0.83      0.84       193

    accuracy                           0.89      1480
   macro avg       0.90      0.90      0.90      1480
weighted avg       0.90      0.89      0.89      1480



## SGD Classifier

In [11]:
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier(max_iter=100000, random_state=42)


sgd.fit(x_train_tfidf, y_train)

# Make predictions
y_val_predict_sgd = sgd.predict(x_val_tfidf)
y_test_predict_sgd = sgd.predict(x_test_tfidf)

In [12]:
from sklearn.metrics import f1_score

print("Score Validation : ",np.round(f1_score(y_val_predict_sgd, y_val, average="weighted"),2))
print("Score Test : ",np.round(f1_score(y_test_predict_sgd, y_test, average="weighted"),2))

Score Validation :  0.9
Score Test :  0.91


In [18]:
from sklearn.metrics import classification_report

print(classification_report(y_val_predict_sgd, y_val))
print("_________________________________________________\n")
print(classification_report(y_test_predict_sgd, y_test))

              precision    recall  f1-score   support

           0       0.85      0.86      0.86       194
           1       0.89      0.86      0.88       146
           2       0.94      0.93      0.94       181
           3       0.83      0.92      0.87       185
           4       0.92      0.92      0.92       218
           5       0.97      0.92      0.95       244
           6       1.00      0.96      0.98       129
           7       0.84      0.86      0.85       183

    accuracy                           0.90      1480
   macro avg       0.91      0.90      0.91      1480
weighted avg       0.91      0.90      0.90      1480

_________________________________________________

              precision    recall  f1-score   support

           0       0.79      0.89      0.84       194
           1       0.90      0.86      0.88       162
           2       0.95      0.93      0.94       202
           3       0.88      0.90      0.89       220
           4       0.93    

## MLP

### Single Layer

In [10]:
from sklearn.metrics import classification_report

# Evaluate the model
y_pred = model.predict(x_test_tfidf)
y_pred_class = tf.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred_class))

[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 108ms/step
              precision    recall  f1-score   support

           0       0.87      0.78      0.82       217
           1       0.86      0.88      0.87       156
           2       0.90      0.95      0.93       197
           3       0.87      0.85      0.86       227
           4       0.94      0.94      0.94       244
           5       0.92      0.94      0.93       256
           6       0.98      0.97      0.97       138
           7       0.87      0.90      0.89       209

    accuracy                           0.90      1644
   macro avg       0.90      0.90      0.90      1644
weighted avg       0.90      0.90      0.90      1644



### Single Layer with 8 neurons

In [19]:
input_dim = x_train_tfidf.shape[1]  # Number of TF-IDF features
num_classes = 8

# Define the model
model = Sequential()
model.add(Dense(8, activation="relu", input_shape=(input_dim,)))
model.add(Dense(9, activation="relu"))
model.add(Dense(num_classes, activation="softmax"))

# Compile the model
model.compile(
    loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
)

# Train the model
model.fit(
    x_train_tfidf,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(x_val_tfidf, y_val),
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m417/417[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 139ms/step - accuracy: 0.3804 - loss: 1.8216 - val_accuracy: 0.7068 - val_loss: 0.9674
Epoch 2/10
[1m417/417[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 114ms/step - accuracy: 0.7848 - loss: 0.7541 - val_accuracy: 0.8385 - val_loss: 0.5820
Epoch 3/10
[1m417/417[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 175ms/step - accuracy: 0.9259 - loss: 0.3624 - val_accuracy: 0.8655 - val_loss: 0.4552
Epoch 4/10
[1m417/417[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 199ms/step - accuracy: 0.9705 - loss: 0.1900 - val_accuracy: 0.8804 - val_loss: 0.4011
Epoch 5/10
[1m417/417[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 92ms/step - accuracy: 0.9886 - loss: 0.1036 - val_accuracy: 0.8831 - val_loss: 0.3852
Epoch 6/10
[1m417/417[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 82ms/step - accuracy: 0.9954 - loss: 0.0566 - val_accuracy: 0.8764 - val_loss: 0.3870
Epoch 7/10


<keras.src.callbacks.history.History at 0x28d3b887b50>

In [20]:
# classification report

# Evaluate the model
y_pred = model.predict(x_test_tfidf)
y_pred_class = tf.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred_class))

[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 66ms/step
              precision    recall  f1-score   support

           0       0.79      0.76      0.77       217
           1       0.85      0.81      0.83       156
           2       0.93      0.95      0.94       197
           3       0.83      0.87      0.85       227
           4       0.92      0.92      0.92       244
           5       0.95      0.91      0.93       256
           6       0.99      0.91      0.95       138
           7       0.81      0.89      0.85       209

    accuracy                           0.88      1644
   macro avg       0.88      0.88      0.88      1644
weighted avg       0.88      0.88      0.88      1644

