# **Installing the required Packages**

In [None]:
!pip  install -U farasapy

# **Importing the required Libraries and Tools**

In [None]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import torch
import re
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_fscore_support,  classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from google.colab import drive
from sklearn.model_selection import train_test_split
from farasa.stemmer import FarasaStemmer
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_class_weight
import time
import seaborn as sns


## **Data Preparation for Modeling**

In [None]:
# Mount Google Drive at /content/drive; force_remount=True remounts even if it is already mounted
drive.mount("/content/drive", force_remount=True)

# File paths in Google Drive for the train and test splits of the First Scenario
# NOTE(review): the "#####" segments look like redacted placeholders — point them at the real CSV files before running
drive_file_path_train = "/content/drive/MyDrive/#####"
drive_file_path_test = "/content/drive/MyDrive/#####"


# Read each First Scenario CSV file into its own DataFrame
train_data = pd.read_csv(drive_file_path_train)
test_data = pd.read_csv(drive_file_path_test)



In [None]:
# File paths in Google Drive for the train and test splits of the Second Scenario
# NOTE(review): the "#####" segments look like redacted placeholders — point them at the real CSV files before running
drive_file_path_cx_train = "/content/drive/MyDrive/#####"
drive_file_path_cx_test = "/content/drive/MyDrive/#####"


# Read each Second Scenario CSV file into its own DataFrame
cx_train_data = pd.read_csv(drive_file_path_cx_train)
cx_test_data = pd.read_csv(drive_file_path_cx_test)



***Selecting the Compute Device (CPU)***

In [None]:

# Torch device handle, pinned to the CPU; no code visible in this section reads `device`
device = torch.device("cpu")


***Further Preprocessing Steps***

***Applying Stemming using Farasa Stemmer***

In [None]:
# Build the Farasa stemmer once so every call to stem_text reuses the same instance
farasa_stemmer = FarasaStemmer(interactive=True)


def stem_text(text):
    """Stem `text` with the shared Farasa stemmer.

    The stemmer output is split on whitespace and re-joined with single
    spaces, so runs of whitespace collapse and leading/trailing whitespace
    is removed.
    """
    stemmed_tokens = farasa_stemmer.stem(text).split()
    return ' '.join(stemmed_tokens)


# First Scenario: overwrite the text column with its stemmed version
train_data['Processed_Content'] = train_data['Processed_Content'].apply(stem_text)
test_data['Processed_Content'] = test_data['Processed_Content'].apply(stem_text)

# Second Scenario: same treatment for its train and test splits
cx_train_data['Processed_Content'] = cx_train_data['Processed_Content'].apply(stem_text)
cx_test_data['Processed_Content'] = cx_test_data['Processed_Content'].apply(stem_text)

***Applying Label Encoding and Vectorization through TF-IDF Vectorizer***

In [None]:
# Label encoding for the Sector column in the train and test splits of the First Scenario

# Initializing the label encoder
label_encoder = LabelEncoder()

# Learn the label -> integer mapping from the training split only, then apply
# that same mapping to the test split. Calling fit_transform on the test split
# would refit the encoder: the test labels could receive different integers
# than the training labels, and label_encoder.classes_ would describe the test
# split instead of the training split.
# transform() raises ValueError if the test split contains a label that was
# never seen in training, which surfaces the mismatch instead of hiding it.
train_data['Sector'] = label_encoder.fit_transform(train_data['Sector'])
test_data['Sector'] = label_encoder.transform(test_data['Sector'])


In [None]:
# Label encoding for the Sector column in the train and test splits of the Second Scenario

# Initializing the label encoder
label_encoder_cx = LabelEncoder()

# Learn the label -> integer mapping from the training split only, then apply
# that same mapping to the test split. Calling fit_transform on the test split
# would refit the encoder: the test labels could receive different integers
# than the training labels, and label_encoder_cx.classes_ would describe the
# test split instead of the training split.
# transform() raises ValueError if the test split contains a label that was
# never seen in training, which surfaces the mismatch instead of hiding it.
cx_train_data['Sector'] = label_encoder_cx.fit_transform(cx_train_data['Sector'])
cx_test_data['Sector'] = label_encoder_cx.transform(cx_test_data['Sector'])


In [None]:
# Class names currently held by the First Scenario label encoder;
# position i in this array is the original label that is encoded as integer i
class_names = label_encoder.classes_
print(class_names)

In [None]:
# Arabic-to-English translation of the class labels.
# The entries are listed in the same order as the encoded labels (label encoding sorts the
# class names alphabetically), because later cells use the dictionary VALUES, in insertion
# order, as display names for classes 0..9. Keep the order in sync with the label encoder.
translations = {
    'الاتصالات': 'Communication',
    'البنوك': 'Banking',
    'البيئة': 'Environment',
    'التعليم': 'Education',
    'التموين': 'Supply',
    'الزراعة': 'Agriculture',
    'الصحة': 'Healthcare',
    'القضاء': 'Judiciary',
    'الكهرباء': 'Electricity',
    'المياه والصرف الصحي': 'Water and Sanitation'

}


In [None]:
# A single TF-IDF vectorizer object serves both scenarios: it is fitted on the
# First Scenario training text, then fitted again on the Second Scenario
# training text. After this cell `vectorizer` therefore holds the Second
# Scenario vocabulary; the First Scenario matrices below are already computed
# and are not affected by the refit.
vectorizer = TfidfVectorizer()

# First Scenario: fit on the training text, reuse that fit for the test text
X_train_tfidf = vectorizer.fit_transform(train_data['Processed_Content'])
X_test_tfidf = vectorizer.transform(test_data['Processed_Content'])

# First Scenario targets
y_train = train_data['Sector']
y_test = test_data['Sector']

# Second Scenario: refit on its own training text, reuse that fit for its test text
X_cx_train_tfidf = vectorizer.fit_transform(cx_train_data['Processed_Content'])
X_cx_test_tfidf = vectorizer.transform(cx_test_data['Processed_Content'])

# Second Scenario targets
y_cx_train = cx_train_data['Sector']
y_cx_test = cx_test_data['Sector']


In [None]:
def class_weights_calculator(y_train):
    """Create an SVC whose class weights balance the training label distribution.

    Args:
        y_train: array-like of training labels.

    Returns:
        An unfitted ``SVC(random_state=1)`` whose ``class_weight`` maps every
        distinct label in ``y_train`` to its 'balanced' weight.
    """
    # compute_class_weight returns one weight per entry of `classes`, in the same order
    classes = np.unique(y_train)
    class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)

    # Key the weights by the actual label values rather than by position, so the
    # mapping stays correct when the labels are not exactly 0..n-1.
    # tolist() yields native Python keys, matching the plain-int keys used when
    # the labels are 0..n-1.
    class_weights_dict = dict(zip(classes.tolist(), class_weights))

    # Define the SVM model with class weights
    svm = SVC(class_weight=class_weights_dict, random_state=1)

    return svm


## **SVM Model Validation for Both Scenarios**

***Using GridSearchCV with 10-Fold Cross-Validation to find the best hyperparameters and to report the model validation results obtained with them***

***The model validation function should be executed independently for each scenario, using its own data splits***

In [None]:
def validation(svm_model, param_grid, X_train_tfidf, y_train, scenario):
  """Grid-search `svm_model` over `param_grid` and report the best configuration.

  The search uses 10-fold cross-validation, macro-averaged F1 as the score,
  and all available cores (n_jobs=-1).

  Args:
    svm_model: estimator to tune (e.g. the one built by class_weights_calculator).
    param_grid: parameter search space handed to GridSearchCV.
    X_train_tfidf: training feature matrix.
    y_train: training labels.
    scenario: 1 prints the "1st" heading; any other value prints the "2nd".

  Returns:
    Tuple (best estimator, best parameters, best cross-validation score).
  """
  search = GridSearchCV(svm_model, param_grid, cv=10, scoring='f1_macro', n_jobs=-1)
  search.fit(X_train_tfidf, y_train)

  # Heading identifying which experimental scenario these results belong to
  heading = "1st Experimental Scenario\n" if scenario == 1 else "2nd Experimental Scenario\n"
  print(heading)

  print("Best Parameters:", search.best_params_)

  # Mean cross-validated score achieved by the best parameter combination
  print("Best Cross-Validation F1-Score (Macro):", search.best_score_)

  return search.best_estimator_, search.best_params_, search.best_score_



## **SVM Model Evaluation for Both Scenarios**

***The model evaluation function should be executed independently for each scenario, using its own data splits***

In [None]:

def evaluate(best_svm, X_train_tfidf, X_test_tfidf, y_train, y_test, translations_dict, scenario):
  """Fit `best_svm` on the training split, predict the test split, and report metrics.

  Args:
    best_svm: estimator exposing `fit` and `predict`.
    X_train_tfidf: training feature matrix.
    X_test_tfidf: test feature matrix.
    y_train: training labels.
    y_test: test labels.
    translations_dict: mapping whose values, in insertion order, are used as
      the display names of the classes in the report; they are assumed to be
      listed in the same order as the sorted encoded labels.
    scenario: 1 prints the "1st" heading; any other value prints the "2nd".

  Returns:
    Tuple (y_test_pred, eval_classification_report, macro_f1_score,
    weighted_f1_score). The report is the TEXT form produced by
    classification_report, not the output_dict form.
  """

  # Time fitting and prediction together
  start_time = time.time()

  best_svm.fit(X_train_tfidf, y_train)

  # Evaluate on the test set
  y_test_pred = best_svm.predict(X_test_tfidf)

  end_time = time.time()

  train_eval_time = end_time - start_time
  print(f"Training and Evaluation Time: {train_eval_time:.2f} seconds")

  # Display names of the classes, in the order the dictionary lists them
  label_names = [f"{name}" for name in translations_dict.values()]

  # Without an explicit `labels` argument, classification_report derives the
  # class list from y_test and the predictions, and raises when that list is
  # shorter than `target_names` (e.g. a class that never occurs in the test
  # split). Passing the training label set keeps the report aligned with the
  # names. It is only passed when its size matches, so every call that worked
  # before behaves exactly as before.
  report_kwargs = {}
  train_labels = np.unique(y_train)
  if len(train_labels) == len(label_names):
    report_kwargs['labels'] = train_labels

  # Generate the classification report with label names
  eval_classification_report = classification_report(y_test, y_test_pred, target_names=label_names, **report_kwargs)

  if scenario == 1:
    print("Evaluation Classification Report the 1st Experimental Scenario:\n\n", eval_classification_report,"\n")
  else :
    print("Evaluation Classification Report the 2nd Experimental Scenario:\n\n", eval_classification_report,"\n")

  # Macro average: unweighted mean of the per-class F1 scores
  macro_f1_score = f1_score(y_test, y_test_pred, average='macro')
  print("F1_score_Macro:", macro_f1_score, "\n")

  # Weighted average: per-class F1 scores weighted by class support
  weighted_f1_score = f1_score(y_test, y_test_pred, average='weighted')
  print("F1_score_Weighted:", weighted_f1_score)

  return y_test_pred, eval_classification_report, macro_f1_score, weighted_f1_score

***Checking the Classification Report***

In [None]:
def classification_report_heatmap(eval_classification_report, scenario):
  """Draw a heatmap of the precision / recall / f1-score values of a classification report.

  Args:
    eval_classification_report: the report as a mapping of row name to metrics,
      i.e. what classification_report(..., output_dict=True) returns. The
      printed text report is rejected with a ValueError.
    scenario: 1 selects the "1st" title; any other value selects the "2nd".

  Returns:
    The report as a DataFrame (one row per report entry, including the
    'support' column that is left out of the heatmap).

  Raises:
    ValueError: if a text report (str) is passed instead of the dict form.
  """

  # pandas cannot build a table from the text report and would fail with an
  # unhelpful "DataFrame constructor not properly called!"; say what is needed.
  if isinstance(eval_classification_report, str):
    raise ValueError(
        "classification_report_heatmap needs the dict form of the report: "
        "call classification_report(..., output_dict=True) and pass that result, "
        "not the text report."
    )

  # Rows become report entries, columns become the metrics
  df_report = pd.DataFrame(eval_classification_report).transpose()

  # 'support' holds sample counts, not scores, so it is left out of the colour map
  df_scores = df_report.drop(columns='support')

  # Display heatmap
  plt.figure(figsize=(4 ,6))
  sns.heatmap(df_scores, annot=True, cmap="vlag_r" ,fmt=".4f", cbar=True, linewidths=0.5 )

  # Move the x-axis labels (precision, recall, f1-score) to the top
  plt.tick_params(top=True, bottom=False, labeltop=True, labelbottom=False)

  if scenario == 1:
    plt.title('SVM Classification Report Heatmap (1st Experimental Scenario)\n', fontsize=12)
  else:
    plt.title('SVM Classification Report Heatmap (2nd Experimental Scenario)\n', fontsize=12)

  plt.show()

  return df_report






***Checking the Confusion Matrix***

In [None]:
def confusion_matrix_heatmap(y_test, y_test_pred, translations_dict, scenario):
  """Plot the confusion matrix of the test predictions as an annotated heatmap.

  Args:
    y_test: true test labels.
    y_test_pred: predicted test labels.
    translations_dict: mapping whose values, in insertion order, label the
      rows and columns of the matrix.
    scenario: 1 selects the "1st" title; any other value selects the "2nd".

  Returns:
    The confusion matrix computed from `y_test` and `y_test_pred`.
  """
  eval_conf_matrix = confusion_matrix(y_test, y_test_pred)

  # Tick labels for both axes, taken from the dictionary values in order
  tick_names = [f"{name}" for name in translations_dict.values()]

  plt.figure(figsize=(7, 5))

  # Annotated heatmap on the "Blues" colour map, integer-style cell formatting
  sns.heatmap(
      eval_conf_matrix,
      annot=True,
      cmap="Blues",
      fmt="g",
      xticklabels=tick_names,
      yticklabels=tick_names,
      cbar=True,
      linewidths=0.5,
  )

  plt.xlabel('Predicted Labels', fontsize=12)
  plt.ylabel('True Labels', fontsize=12)

  plot_title = (
      'SVM Confusion Matrix Heatmap (1st Experimental Scenario)\n'
      if scenario == 1
      else 'SVM Confusion Matrix Heatmap (2nd Experimental Scenario)\n'
  )
  plt.title(plot_title, fontsize=12)

  plt.show()

  return eval_conf_matrix

