# Assignment 3
## Part II - Practical
## Mahla Entezari 401222017

At the beginning of the project, I have imported the necessary libraries.

In [1]:
 # pip install hazm --upgrade --force-reinstall

In [None]:
# pip uninstall hazm

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import xgboost as xgb
import re
from scipy.linalg import triu
from hazm import *
from hazm import Normalizer, word_tokenize, stopwords_list, POSTagger, DependencyParser
import nltk
from nltk.corpus import stopwords
from gensim.models import KeyedVectors

import matplotlib.pyplot as plt
# from farsispellchecker import SpellChecker
import stanza

ImportError: cannot import name 'triu' from 'scipy.linalg' (D:\Uni\Term 4\Machine learning\qenv\lib\site-packages\scipy\linalg\__init__.py)

In this piece of code, I take the list of stop words from the hazm library: words such as conjunctions, prepositions and other very frequent words that appear in texts on almost any topic and do not affect the overall meaning of a sentence.

In [None]:
# Build the stop-word collection once as a set; clean_text tests every token
# for membership in it.
persian_stopwords = set(stopwords_list())
# Bare expression: displays the set when run as a notebook cell.
persian_stopwords

Here I define the helper functions that are called below in the preprocessing function: clean_text, normalize_text, remove_diacritics, preprocess_with_stanza, named_entity_recognition, extract_relations, spell_checker, and so on.

- ##### *clean_text*, *normalize_text*, *remove_diacritics*
  These functions strip every character that is not a Persian letter or whitespace, normalize the text, collapse repeated spaces, and remove stop words.\
  They also unify letters such as ک and ی, which may be written in several ways, and remove diacritics.

- ##### *preprocess_with_stanza*
  It processes the input text with Stanza's natural language processing tools and extracts linguistic features for each word (lemma, part-of-speech tags, dependency relation, and named-entity label), returning them as a list of dictionaries, one per word.

- ##### *named_entity_recognition*
  This function utilizes Stanza to perform named entity recognition (NER) on Persian text.\
  It downloads the necessary model, processes the text through Stanza's pipeline, and prints detected entities with their types, as well as words with their syntactic dependencies (head and dependency relation).

- ##### *extract_relations*
  This function processes a Stanza doc object, extracting relations between entities in Persian text sentences.\
  It identifies potential subject, object, indirect object, and modifier relationships (nsubj, obj, iobj, amod) and includes them in relations if both the dependent and head words are recognized entities.\
  It then prints and returns a list of tuples representing these relations.

- ##### *spell_checker*
    This is designed to correct spelling in a given text.

In [None]:
# Shared hazm normalizer instance; clean_text calls normalizer.normalize().
normalizer = Normalizer()
# The Stanza pipeline and the spell checker are left disabled, so this cell
# does not define `nlp` or a spell-checker instance.
# stanza.download('fa')
# nlp = stanza.Pipeline('fa', processors='tokenize,pos,lemma,depparse,ner')
# spell_checker = SpellChecker()

def clean_text(text):
    """Return *text* reduced to space-joined Persian tokens without stop words.

    Non-string input (for example a missing value) yields an empty string.
    Relies on the module-level ``normalizer`` and ``persian_stopwords``.
    """
    if not isinstance(text, str):
        return ''

    # Keep only characters in the آ-ی range plus whitespace.
    # NOTE(review): this also deletes the zero-width non-joiner (U+200C), so
    # words written with a half-space are fused -- confirm this is intended.
    persian_only = re.sub(r'[^آ-ی\s]', '', text)
    normalized = normalizer.normalize(persian_only)
    collapsed = re.sub(r'\s+', ' ', normalized).strip()

    # Remove anything that is neither a word character nor whitespace, then
    # digits, then lower-case what is left.
    without_punct = re.sub(r'[^\w\s]', '', collapsed)
    without_digits = re.sub(r'\d+', '', without_punct)
    lowered = without_digits.lower()

    kept = []
    for token in word_tokenize(lowered):
        if token in persian_stopwords:
            continue
        kept.append(token)
    return ' '.join(kept)
        
def normalize_text(text):
    """Return *text* with Arabic kaf/yeh and ASCII digits in Persian form."""
    letter_fixes = (('ك', 'ک'), ('ي', 'ی'))
    for arabic_form, persian_form in letter_fixes:
        text = text.replace(arabic_form, persian_form)
    digit_table = str.maketrans('0123456789', '۰۱۲۳۴۵۶۷۸۹')
    return text.translate(digit_table)
    
def remove_diacritics(text):
    """Return *text* with the code points U+064B through U+0652 removed."""
    return re.compile("[\u064B-\u0652]").sub('', text)

def preprocess_with_stanza(text):
    """Run the Stanza pipeline on *text* and return one feature dict per word.

    Uses the module-level ``nlp`` pipeline, which must have been created
    before this function is called.

    Args:
        text: Raw text to analyse.

    Returns:
        A list of dicts with the keys ``text``, ``lemma``, ``upos``,
        ``xpos``, ``head``, ``deprel`` and ``ner``, in document order.
    """
    doc = nlp(text)
    return [
        {
            'text': word.text,
            'lemma': word.lemma,
            'upos': word.upos,
            'xpos': word.xpos,
            'head': word.head,
            'deprel': word.deprel,
            # Stanza stores the NER tag on the Token, not on the Word, so
            # read it through the word's parent token.
            'ner': word.parent.ner,
        }
        for sentence in doc.sentences
        for word in sentence.words
    ]


def named_entity_recognition(text):
    """Print the named entities and dependency info Stanza finds in *text*.

    Downloads the 'fa' resources and builds a default pipeline on every
    call, then prints, sentence by sentence, each entity with its type
    followed by each word with its head index and dependency relation.
    Returns nothing.
    """
    stanza.download('fa')
    pipeline = stanza.Pipeline('fa')
    for sent in pipeline(text).sentences:
        for ent in sent.ents:
            print(f"Entity: {ent.text}, Type: {ent.type}")
        for tok in sent.words:
            print(f"Word: {tok.text}, Head: {tok.head}, Deprel: {tok.deprel}")

def extract_relations(doc):
    """Collect dependency relations whose two ends are both named entities.

    Sentences with fewer than two recognised entities are skipped. Within
    the remaining sentences, every word whose relation is ``nsubj``,
    ``obj``, ``iobj`` or ``amod`` is paired with its head word, and the
    pair is kept when neither side has the NER tag ``"O"``.

    Args:
        doc: A Stanza document, or any object exposing ``sentences`` whose
            items expose ``ents`` and ``words``.

    Returns:
        A list of ``(head_text, dependent_text, deprel)`` tuples. Each
        tuple is also printed.
    """
    def ner_tag(word):
        # Stanza keeps the NER tag on the Token, not on the Word, so fall
        # back to the parent token when the word itself carries no tag.
        tag = getattr(word, 'ner', None)
        if tag is None:
            tag = getattr(getattr(word, 'parent', None), 'ner', None)
        return "O" if tag is None else tag

    relations = []
    for sentence in doc.sentences:
        if len(sentence.ents) < 2:
            continue

        for word in sentence.words:
            if word.deprel not in ("nsubj", "obj", "iobj", "amod"):
                continue
            # head is 1-based and 0 marks the root; without this guard the
            # index -1 would silently select the last word of the sentence.
            if word.head <= 0:
                continue
            head_word = sentence.words[word.head - 1]
            if ner_tag(head_word) != "O" and ner_tag(word) != "O":
                relations.append((head_word.text, word.text, word.deprel))

    for relation in relations:
        print(f"Relation: {relation}")
    return relations
    
def extract_entities(text, tagger=None):
    """Group B-/I- tagged tokens of *text* into entity spans.

    Args:
        text: Raw text to analyse.
        tagger: Optional object with a ``tag(tokens)`` method returning
            ``(word, tag)`` pairs. When omitted, a hazm ``POSTagger`` is
            loaded from ``resources/postagger.model`` on every call, so
            callers that loop over many texts should pass one in.

    Returns:
        A ``(normalized_text, entities)`` tuple, where ``entities`` is a
        list of ``(entity_text, tag)`` pairs in order of appearance.
    """
    # normalized_text is part of the return value, so it must be produced
    # here; the tokens are taken from the same normalised string.
    normalized_text = normalizer.normalize(text)
    tokens = word_tokenize(normalized_text)

    if tagger is None:
        tagger = POSTagger(model='resources/postagger.model')

    # NOTE(review): the grouping below expects IOB-style tags ("B-X"/"I-X");
    # confirm that the model at resources/postagger.model emits them.
    entities = []
    current_entity = []
    current_tag = None

    for word, tag in tagger.tag(tokens):
        if tag.startswith('I-'):
            current_entity.append(word)
            continue

        # Any tag other than I- closes the entity collected so far.
        if current_entity:
            entities.append((' '.join(current_entity), current_tag))
            current_entity = []

        if tag.startswith('B-'):
            current_entity.append(word)
            current_tag = tag[2:]
        else:
            current_tag = None

    if current_entity:
        entities.append((' '.join(current_entity), current_tag))

    return normalized_text, entities
    
def spell_checker(text, checker=None):
    """Return *text* with every whitespace-separated token spell-corrected.

    Args:
        text: Text to correct.
        checker: Object exposing ``correct(token)``. It has to be passed in
            because this function's own name shadows any module-level
            ``spell_checker`` instance, so the function cannot look one up
            under that name.

    Returns:
        The corrected tokens joined by single spaces; an empty string when
        *text* contains no tokens.

    Raises:
        ValueError: If *text* contains tokens but no *checker* was given.
    """
    tokens = text.split()
    if tokens and checker is None:
        raise ValueError(
            "spell_checker needs a checker object with a correct(token) method"
        )
    return ' '.join(checker.correct(token) for token in tokens)

In [None]:
# import nlpaug.augmenter.word as naw
# def augment_text(text, aug_type='synonym'):
#     augmenter = naw.SynonymAug(aug_src='wordnet')
#     augmented_text = augmenter.augment(text)
#     return augmented_text

In [None]:
def data_preproccess():
    """Fill ``df['cleaned_text']`` from ``df['text']`` in place.

    Applies clean_text, then normalize_text, then remove_diacritics to
    every row of the module-level ``df``. The Stanza, named-entity,
    spell-check and augmentation steps are currently disabled.
    """
    cleaned = df['text'].apply(clean_text)
    for step in (normalize_text, remove_diacritics):
        cleaned = cleaned.apply(step)
    df['cleaned_text'] = cleaned
    

In [None]:
# The CSV has no header line, so stop pandas from consuming the first sample
# as column names; the columns are addressed by position afterwards.
# NOTE(review): inferred from the notebook text that follows this cell --
# confirm against the actual file.
df = pd.read_csv('train_data.csv', header=None)
# Bare expression: displays the frame when run as a notebook cell.
df

At this stage, instead of editing the data file to add a header line above the other lines,
I created two new columns by position: the text column and the label column.

In [None]:
# Copy the first and second columns, selected by position, into explicitly
# named 'text' and 'label' columns.
df['text']=df.iloc[:,0]
df['label']=df.iloc[:,1]

In [None]:
# Run entity extraction on the text of every row and print what it returns.
for text in df['text']:
    normalized_text, entities = extract_entities(text)
    print(f"Text: {normalized_text}")
    print(f"Entities: {entities}")
    print()

In [None]:
# Quick manual check of clean_text on a short sample that contains digits.
text = "۱۲۳ امتحان می‌کنیم"
# Despite the variable name, clean_text returns one space-joined string.
cleaned_tokens = clean_text(text)
print(cleaned_tokens)

In [None]:
# Populate df['cleaned_text'] by running the cleaning steps in place.
data_preproccess()
# Bare expression: displays the updated frame when run as a notebook cell.
df

In [None]:
from collections import Counter

# Tally every whitespace-separated token across all cleaned texts.
word_counts = Counter(
    token
    for cleaned in df['cleaned_text']
    for token in cleaned.split()
)
word_counts

Here we count the number of types of words and the number of times each word appears
Then, for a better and more practical view, we keep the words that have been repeated at least 100 times and then sort them.

In [None]:
# Keep only the tokens seen at least 100 times, ordered from the most to the
# least frequent.
filtered_word_counts = dict(
    sorted(
        ((word, freq) for word, freq in word_counts.items() if freq >= 100),
        key=lambda pair: pair[1],
        reverse=True,
    )
)
filtered_word_counts

The bar chart below plots these counts: for a clearer and more practical view,
only the words that appear at least 100 times are shown, sorted from most to least frequent.

In [None]:
# Bar chart with one bar per word in filtered_word_counts.
plt.figure(figsize=(12, 6))
plt.bar(filtered_word_counts.keys(), filtered_word_counts.values())
plt.xlabel('Words')
plt.ylabel('Frequency')
# NOTE(review): the title says ">100" while the filter keeps freq >= 100.
plt.title('Word Frequency in texts(>100)')
# Rotate the word labels on the x axis by 45 degrees.
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

Also, according to this graph, most of the texts have the **happy** label,
and the **Fear** label has the fewest samples.

In [None]:
# Count how many samples carry each label.
class_dist = df['label'].value_counts()

plt.figure(figsize=(8, 5))
# NOTE(review): only two colours are listed; confirm that is intended if the
# data has more than two classes.
class_dist.plot(kind='bar', color=['blue', 'green'])
plt.xlabel('Class')
plt.ylabel('Number of Samples')
plt.title('Class Distribution')
# Keep the class names horizontal.
plt.xticks(rotation=0)
plt.show()


In [None]:
# Bag-of-words features fitted on the cleaned texts; X_bow is what the
# train/test split later in the notebook consumes.
bow = CountVectorizer()
X_bow = bow.fit_transform(df['cleaned_text'])
# Bare expression: displays the matrix summary when run as a notebook cell.
X_bow

CountVectorizer (bow): Counts how often each word appears in each document.

TfidfVectorizer (tfidf): Measures how important each word is to a document compared to the entire collection, accounting for word frequency and rarity across documents.

In [None]:
# TF-IDF features fitted on the same cleaned texts.
# NOTE(review): the train/test split below is built from X_bow, so X_tfidf
# appears to be unused -- confirm.
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df['cleaned_text'])
X_tfidf

‌

Encode categorical labels (df['label']) into numerical values suitable for machine learning algorithms.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map each distinct label value to an integer class id.
encoder = LabelEncoder()
y = encoder.fit_transform(df['label'])

# Hold out 5% of the bag-of-words rows for testing, with a fixed seed.
# NOTE(review): the split is not stratified; with such a small test share a
# rare class may be missing from it -- consider passing stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X_bow, y, test_size=0.05, random_state=42)

Then, from here on, I will train several different models on the data to compare their efficiency

These models include

*Decision tree*, *Random forest*, *GradientBoosting*, *LogisticRegression*, *MultinomialNB* and *XGBoost*

In [None]:
# A single decision tree trained on the bag-of-words training split.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

dt_pred = dt_model.predict(X_test)

# Print overall accuracy and the per-class report for the held-out split.
print("Decision Tree Accuracy:", accuracy_score(y_test, dt_pred))
print(classification_report(y_test, dt_pred))


In [None]:
# Random forest with 100 trees; rf_model is reused by the voting ensemble
# cell further down.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

rf_pred = rf_model.predict(X_test)

# Print overall accuracy and the per-class report for the held-out split.
print("Random Forest Accuracy:", accuracy_score(y_test, rf_pred))
print(classification_report(y_test, rf_pred))


In [None]:
# Gradient boosting with 100 estimators; gb_model is reused by the voting
# ensemble cell further down.
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)

gb_model.fit(X_train, y_train)

gb_preds = gb_model.predict(X_test)

# Print overall accuracy and the per-class report for the held-out split.
print("Gradient Boosting Accuracy:", accuracy_score(y_test, gb_preds))
print(classification_report(y_test, gb_preds))

In [None]:
# Hard-voting ensemble over the random-forest and gradient-boosting models.
ensemble_model = VotingClassifier(estimators=[('rf', rf_model), ('gb', gb_model)], voting='hard')
ensemble_model.fit(X_train, y_train)

# y_pred and accuracy are overwritten by later model cells.
y_pred = ensemble_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Ensemble Model Accuracy:", accuracy)

In [None]:
# Logistic regression with the iteration cap set to 1000.
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)

y_pred = log_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Per-class report for the held-out split.
print(classification_report(y_test, y_pred))

In [None]:
# Wrap the train/test splits in XGBoost's DMatrix container; dtrain and dtest
# are reused by the cross-validation cell further down.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
params = {
    'max_depth': 3,
    'objective': 'multi:softmax',
    'num_class': len(encoder.classes_),
    'eval_metric': 'merror',
}

num_rounds = 100
xgb_model = xgb.train(params, dtrain, num_boost_round=num_rounds)
# multi:softmax yields class ids as floats; cast once so the metrics and the
# label decoder all receive integer labels.
xgb_preds = xgb_model.predict(dtest).astype(int)

decoded_preds = encoder.inverse_transform(xgb_preds)

print("XGBoost Accuracy:", accuracy_score(y_test, xgb_preds))
# Pass labels explicitly: if a class is absent from the small test split, the
# report would otherwise reject target_names for having a different length.
print(classification_report(
    y_test,
    xgb_preds,
    labels=list(range(len(encoder.classes_))),
    target_names=encoder.classes_,
))

In [None]:
# Second XGBoost configuration: deeper trees with row and column subsampling.
params = {
    'objective': 'multi:softmax',
    'num_class': len(encoder.classes_),
    'eval_metric': 'merror',
    'max_depth': 6,
    'eta': 0.3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
}
num_rounds = 100
nfold = 5
early_stopping_rounds = 20

# 5-fold cross-validation with early stopping; the number of rows in the
# returned history is used as the round count for the final model.
cv_results = xgb.cv(params, dtrain, num_rounds, nfold=nfold, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)
best_num_rounds = len(cv_results)
final_model = xgb.train(params, dtrain, best_num_rounds)
# multi:softmax yields class ids as floats; cast once so the metrics and the
# label decoder all receive integer labels.
xgb_preds = final_model.predict(dtest).astype(int)

decoded_preds = encoder.inverse_transform(xgb_preds)

print("Best num rounds XGBoost Accuracy:", accuracy_score(y_test, xgb_preds))
# Pass labels explicitly: if a class is absent from the small test split, the
# report would otherwise reject target_names for having a different length.
print(classification_report(
    y_test,
    xgb_preds,
    labels=list(range(len(encoder.classes_))),
    target_names=encoder.classes_,
))

In [None]:
# Plot the mean error per boosting round from the cross-validation history,
# using the matching std columns as error bars, for train and test folds.
plt.figure(figsize=(10, 6))
plt.errorbar(cv_results.index, cv_results['train-merror-mean'], yerr=cv_results['train-merror-std'], label='Train')
plt.errorbar(cv_results.index, cv_results['test-merror-mean'], yerr=cv_results['test-merror-std'], label='Validation')
plt.xlabel('Boosting Round')
plt.ylabel('Error Rate')
plt.title('Training and Validation Error Rates')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Multinomial naive Bayes trained on the bag-of-words count features.
clf = MultinomialNB()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Overall accuracy and per-class report for the held-out split.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

After comparing all these models and trying to improve their accuracy by changing the parameters and the number of rounds,\
the best models turn out to differ only slightly:

**Gradient Boosting**, **Ensemble Model**, **XGBoost** and **Multinomial Naive Bayes** reach about **60%** accuracy, and the best of them is ***Logistic Regression*** with ***62%*** accuracy.