# Reading Dataset

In [3]:
import pandas as pd

dataset= pd.read_csv('data.csv')
dataset.drop(dataset.columns[0], axis=1, inplace=True)
dataset.head()
dataset.shape

(14000, 2)

In [4]:
dataset.dropna(axis=0,inplace=True)
dataset.head()

Unnamed: 0,text,class
0,يحيي كل من العروسي وعواطف وعمار والعنبري أمجاد...,Culture
1,أخبارنا المغربية ـ هدى جميعي\nتحول فنان مغربي ...,Culture
2,بالفيديو : الفنان الشعبي العمري يتهم الداودي و...,Culture
3,عبدالاله بوسحابة : اخبارنا المغربية\nعلمنا في ...,Culture
4,أخبارنا المغربية : حنان سلامة\nكعادته كلما تعل...,Culture


In [None]:
dataset.info()
dataset.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7000 entries, 0 to 6999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    7000 non-null   object
 1   class   7000 non-null   object
dtypes: object(2)
memory usage: 164.1+ KB


Unnamed: 0,text,class
count,7000,7000
unique,6986,7
top,[],Culture
freq,7,1000


# Exploratory Data Analysis

In [None]:
import nltk
nltk.download('stopwords')
stop=set(nltk.corpus.stopwords.words("arabic"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Text length analysis 
def avg_word(sentence):
    words = sentence.split()
    if len(words) == 0:
        return 0
    return (sum(len(word) for word in words)/len(words))

In [None]:
dataset['word_count']= dataset['text'].apply(lambda x:len(str(x).split(" ")))
dataset['char_count']= dataset['text'].str.len()
dataset['avg_char_per_word'] = dataset['text'].apply(lambda x: avg_word(x))
dataset['stopwords']=dataset['text'].apply(lambda x: len([y for y in x.split() if y in stop]))
dataset=dataset.sort_values(by='word_count',ascending=[0])
dataset.head()

Unnamed: 0,text,class,word_count,char_count,avg_char_per_word,stopwords
3914,['من الخطاب الملكي في 9 مارس، إلى الانتخابات ا...,Politics,4890,29109,5.360779,939
4337,['الحمد لله نحمده، ونستعينه، ونستغفره، ونعوذ ب...,Religion,4327,23823,4.505893,998
241,"[""شهد المشهد الثقافي والفني المغربي خلال العام...",Culture,4177,26898,5.439789,805
4184,['إن الحمد لله نحمده ونستعينه ونستغفره ، ونعوذ...,Religion,3667,18242,3.974911,772
4182,['العالم الإسلامي خُدع بإيران وبحزب الله وأحدا...,Religion,3316,18249,4.503619,800


In [None]:
dataset_stat=dataset.mean(axis=0)
dataset_stat

  dataset_stat=dataset.mean(axis=0)


word_count            272.649000
char_count           1639.022571
avg_char_per_word       5.119287
stopwords              55.581143
dtype: float64

# Text Preprocessing

Punkt: It is a module in nltk used to tokenize text. It is made to learn parameters from a corpus in an unsupervised way that 

In [None]:
nltk.download('punkt')
from nltk.corpus.reader.tagged import word_tokenize
from nltk import tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Remove stopwords
def removeStop(text):
  tmp=word_tokenize(text)
  text=" ".join([w for w in tmp if not w in stop and len(w) >=2])
  return text

dataset['noStop_article']=dataset['text'].apply(lambda x: removeStop(x))
dataset.head()
# print(word_tokenize(dataset['text'][0]))
# print(tokenize.sent_tokenize(dataset['text'][0]))

Unnamed: 0,text,class,word_count,char_count,avg_char_per_word,stopwords,noStop_article
3914,['من الخطاب الملكي في 9 مارس، إلى الانتخابات ا...,Politics,4890,29109,5.360779,939,'من الخطاب الملكي مارس، الانتخابات التشريعية ي...
4337,['الحمد لله نحمده، ونستعينه، ونستغفره، ونعوذ ب...,Religion,4327,23823,4.505893,998,'الحمد لله نحمده، ونستعينه، ونستغفره، ونعوذ با...
241,"[""شهد المشهد الثقافي والفني المغربي خلال العام...",Culture,4177,26898,5.439789,805,`` شهد المشهد الثقافي والفني المغربي خلال العا...
4184,['إن الحمد لله نحمده ونستعينه ونستغفره ، ونعوذ...,Religion,3667,18242,3.974911,772,'إن الحمد لله نحمده ونستعينه ونستغفره ونعوذ با...
4182,['العالم الإسلامي خُدع بإيران وبحزب الله وأحدا...,Religion,3316,18249,4.503619,800,'العالم الإسلامي خُدع بإيران وبحزب الله وأحداث...


In [None]:
# Normalization

# import reugular expression
import re 

def normalize(text):
  # remove three or more repetitions of a character
  pattern=re.compile(r"(.)\1{2,}",re.DOTALL)
  text=pattern.sub(r"\1\1",text)
  # normalize alif
  text = text.replace(u"\u0625", u"\u0627")  # HAMZA below, with LETTER ALEF
  text = text.replace(u"\u0622", u"\u0627")  # ALEF WITH MADDA ABOVE, with LETTER ALEF
  text = text.replace(u"\u0623", u"\u0627")  # ALEF WITH HAMZA ABOVE, with LETTER ALEF
  # normalize taa
  text = text.replace(u"\u0629", u"\u0647") # taa' marbuuTa, with haa'
  # normalize yaa
  text = text.replace(u"\u064A", u"\u0649")  # yaa' with 'alif maqSuura
  # remove diacritics 
  text = text.replace(u"\u064B", "")  # fatHatayn
  text = text.replace(u"\u064C", "")  # Dammatayn
  text = text.replace(u"\u064D", "")  # kasratayn
  text = text.replace(u"\u064E", "")  # fatHa
  text = text.replace(u"\u064F", "")  # Damma
  text = text.replace(u"\u0650", "")  # kasra
  text = text.replace(u"\u0651", "")  # shaddah
  text = text.replace(u"\u0652", "")  # sukuun
  text = text.replace(u"\u0670", "`")  # dagger 'alif
  return text

# aggregate all preprocessing steps into one column for the next step
dataset['normalized_article']=dataset['noStop_article'].apply(lambda x:normalize(x))
dataset.head()

Unnamed: 0,text,class,word_count,char_count,avg_char_per_word,stopwords,noStop_article,normalized_article
3914,['من الخطاب الملكي في 9 مارس، إلى الانتخابات ا...,Politics,4890,29109,5.360779,939,'من الخطاب الملكي مارس، الانتخابات التشريعية ي...,'من الخطاب الملكى مارس، الانتخابات التشرىعىه ى...
4337,['الحمد لله نحمده، ونستعينه، ونستغفره، ونعوذ ب...,Religion,4327,23823,4.505893,998,'الحمد لله نحمده، ونستعينه، ونستغفره، ونعوذ با...,'الحمد لله نحمده، ونستعىنه، ونستغفره، ونعوذ با...
241,"[""شهد المشهد الثقافي والفني المغربي خلال العام...",Culture,4177,26898,5.439789,805,`` شهد المشهد الثقافي والفني المغربي خلال العا...,`` شهد المشهد الثقافى والفنى المغربى خلال العا...
4184,['إن الحمد لله نحمده ونستعينه ونستغفره ، ونعوذ...,Religion,3667,18242,3.974911,772,'إن الحمد لله نحمده ونستعينه ونستغفره ونعوذ با...,'ان الحمد لله نحمده ونستعىنه ونستغفره ونعوذ با...
4182,['العالم الإسلامي خُدع بإيران وبحزب الله وأحدا...,Religion,3316,18249,4.503619,800,'العالم الإسلامي خُدع بإيران وبحزب الله وأحداث...,'العالم الاسلامى خدع باىران وبحزب الله واحداث ...


In [None]:
# Remove Punctuations
import string

arabicPunctuations = [".","`","؛","<",">","(",")","*","&","^","%","]","[",",","ـ","،","/",":","؟",".","'","{","}","~","|","!","”","…","“","–"] # defining customized punctuation marks
englishPunctuations = [word.strip() for word in string.punctuation] # importing English punctuation marks
punctuationsList = arabicPunctuations + englishPunctuations # creating a list of all punctuation marks

def removePunct(text):
  cleanArticle=''
  for i in text:
    if i not in punctuationsList:
      cleanArticle=cleanArticle+i
  return cleanArticle

dataset['clean_article'] = dataset['normalized_article'].apply(lambda x: removePunct(x)) # takes 18s to run
dataset.head()

Unnamed: 0,text,class,word_count,char_count,avg_char_per_word,stopwords,noStop_article,normalized_article,clean_article
3914,['من الخطاب الملكي في 9 مارس، إلى الانتخابات ا...,Politics,4890,29109,5.360779,939,'من الخطاب الملكي مارس، الانتخابات التشريعية ي...,'من الخطاب الملكى مارس، الانتخابات التشرىعىه ى...,من الخطاب الملكى مارس الانتخابات التشرىعىه ىوم...
4337,['الحمد لله نحمده، ونستعينه، ونستغفره، ونعوذ ب...,Religion,4327,23823,4.505893,998,'الحمد لله نحمده، ونستعينه، ونستغفره، ونعوذ با...,'الحمد لله نحمده، ونستعىنه، ونستغفره، ونعوذ با...,الحمد لله نحمده ونستعىنه ونستغفره ونعوذ بالله ...
241,"[""شهد المشهد الثقافي والفني المغربي خلال العام...",Culture,4177,26898,5.439789,805,`` شهد المشهد الثقافي والفني المغربي خلال العا...,`` شهد المشهد الثقافى والفنى المغربى خلال العا...,شهد المشهد الثقافى والفنى المغربى خلال العام ...
4184,['إن الحمد لله نحمده ونستعينه ونستغفره ، ونعوذ...,Religion,3667,18242,3.974911,772,'إن الحمد لله نحمده ونستعينه ونستغفره ونعوذ با...,'ان الحمد لله نحمده ونستعىنه ونستغفره ونعوذ با...,ان الحمد لله نحمده ونستعىنه ونستغفره ونعوذ بال...
4182,['العالم الإسلامي خُدع بإيران وبحزب الله وأحدا...,Religion,3316,18249,4.503619,800,'العالم الإسلامي خُدع بإيران وبحزب الله وأحداث...,'العالم الاسلامى خدع باىران وبحزب الله واحداث ...,العالم الاسلامى خدع باىران وبحزب الله واحداث ا...


In [None]:
# Noise Removal
def clean(text):
  # Remove extra whitespace
  text=re.sub('\s+',' ',text)
  # Remove numbers
  text=re.sub('\d+',' ',text)
  return text

dataset['abstract_article']=dataset['clean_article'].apply(lambda x: clean(x))
dataset.head()

Unnamed: 0,text,class,word_count,char_count,avg_char_per_word,stopwords,noStop_article,normalized_article,clean_article,abstract_article
3914,['من الخطاب الملكي في 9 مارس، إلى الانتخابات ا...,Politics,4890,29109,5.360779,939,'من الخطاب الملكي مارس، الانتخابات التشريعية ي...,'من الخطاب الملكى مارس، الانتخابات التشرىعىه ى...,من الخطاب الملكى مارس الانتخابات التشرىعىه ىوم...,من الخطاب الملكى مارس الانتخابات التشرىعىه ىوم...
4337,['الحمد لله نحمده، ونستعينه، ونستغفره، ونعوذ ب...,Religion,4327,23823,4.505893,998,'الحمد لله نحمده، ونستعينه، ونستغفره، ونعوذ با...,'الحمد لله نحمده، ونستعىنه، ونستغفره، ونعوذ با...,الحمد لله نحمده ونستعىنه ونستغفره ونعوذ بالله ...,الحمد لله نحمده ونستعىنه ونستغفره ونعوذ بالله ...
241,"[""شهد المشهد الثقافي والفني المغربي خلال العام...",Culture,4177,26898,5.439789,805,`` شهد المشهد الثقافي والفني المغربي خلال العا...,`` شهد المشهد الثقافى والفنى المغربى خلال العا...,شهد المشهد الثقافى والفنى المغربى خلال العام ...,شهد المشهد الثقافى والفنى المغربى خلال العام ...
4184,['إن الحمد لله نحمده ونستعينه ونستغفره ، ونعوذ...,Religion,3667,18242,3.974911,772,'إن الحمد لله نحمده ونستعينه ونستغفره ونعوذ با...,'ان الحمد لله نحمده ونستعىنه ونستغفره ونعوذ با...,ان الحمد لله نحمده ونستعىنه ونستغفره ونعوذ بال...,ان الحمد لله نحمده ونستعىنه ونستغفره ونعوذ بال...
4182,['العالم الإسلامي خُدع بإيران وبحزب الله وأحدا...,Religion,3316,18249,4.503619,800,'العالم الإسلامي خُدع بإيران وبحزب الله وأحداث...,'العالم الاسلامى خدع باىران وبحزب الله واحداث ...,العالم الاسلامى خدع باىران وبحزب الله واحداث ا...,العالم الاسلامى خدع باىران وبحزب الله واحداث ا...


In [None]:

# Lemmatization/stemming (takes very long time)
# !pip install farasapy

In [None]:
# from farasa.stemmer import FarasaStemmer
# stemmer = FarasaStemmer()

# dataset['stemmed_article']=dataset['abstract_article'].apply(lambda x: stemmer.stem(x))
# dataset.head()

# Preparing the Dataset

In [None]:
from pandas.core.common import random_state
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

dataset=shuffle(dataset)
x=dataset['abstract_article']
y=dataset['class']
x_train,x_test,y_train,y_test= train_test_split(x,y,test_size=0.3,random_state=42)
x_train.shape,x_test.shape

((4900,), (2100,))

# Random Forest Classifier



In [None]:
#TFIDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer 
tfidf_vectorizer=TfidfVectorizer() 
 
# just send in all your docs here 
X=tfidf_vectorizer.fit_transform(dataset.abstract_article)

#Splitting vectorized data

clfx_train, clfx_test, clfy_train,clfy_test = train_test_split(X,dataset['class'], test_size=0.3,random_state=42)

In [None]:
#Grid search cross validation to find the most optimal parameters
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

# Fitting the RandomforestClassifier to the training set
classifier = RandomForestClassifier()

parameters = {
    "n_estimators":[5,10,50,100,250],
    "max_depth":[2,4,8,16,32,None]
    
}


In [None]:
from sklearn.model_selection import GridSearchCV
cv = GridSearchCV(classifier,parameters,cv=5)
cv.fit(clfx_train,clfy_train.values.ravel())

In [None]:
def display(results):
    print(f'Best parameters are: {results.best_params_}')
    print("\n")
    mean_score = results.cv_results_['mean_test_score']
    std_score = results.cv_results_['std_test_score']
    params = results.cv_results_['params']
    for mean,std,params in zip(mean_score,std_score,params):
        print(f'{round(mean,3)} + or -{round(std,3)} for the {params}')
display(cv)

Best parameters are: {'max_depth': None, 'n_estimators': 100}


0.284 + or -0.021 for the {'max_depth': 2, 'n_estimators': 5}
0.387 + or -0.025 for the {'max_depth': 2, 'n_estimators': 10}
0.646 + or -0.019 for the {'max_depth': 2, 'n_estimators': 50}
0.727 + or -0.015 for the {'max_depth': 2, 'n_estimators': 100}
0.81 + or -0.012 for the {'max_depth': 2, 'n_estimators': 250}
0.406 + or -0.02 for the {'max_depth': 4, 'n_estimators': 5}
0.477 + or -0.028 for the {'max_depth': 4, 'n_estimators': 10}
0.743 + or -0.023 for the {'max_depth': 4, 'n_estimators': 50}
0.806 + or -0.015 for the {'max_depth': 4, 'n_estimators': 100}
0.839 + or -0.005 for the {'max_depth': 4, 'n_estimators': 250}
0.505 + or -0.027 for the {'max_depth': 8, 'n_estimators': 5}
0.636 + or -0.016 for the {'max_depth': 8, 'n_estimators': 10}
0.807 + or -0.008 for the {'max_depth': 8, 'n_estimators': 50}
0.842 + or -0.008 for the {'max_depth': 8, 'n_estimators': 100}
0.86 + or -0.008 for the {'max_depth': 8, 'n_estimator

In [None]:
clf = RandomForestClassifier(n_estimators=100)
clf.fit(clfx_train,clfy_train)

RandomForestClassifier()

In [None]:
print(clf.score(clfx_test,clfy_test))

0.8947619047619048


# Applying Logistic Regression

***Pipeline module in scikit-learn:*** is a tool that simplifies preprocessing by grouping operations in a pipe.\
***Pipeline Class:*** is to sequentially apply a list of transforms and a final estimator. Intermediate steps of pipeline must implement fit and tranform, and the final estimator only needst to implement fit.\
***Countvectorizer:*** is a method to convert text to numerical data so that the machine can deal with it. It converts the text to a sparse matrix of unique words counted using word-level tokenization.\
***TF:*** term frequency\
***IDF:*** inverse document frequency\
***TfidfTransformer:*** converts a collection of raw documents to a matrix of TF-IDF features. In order to start using TfidfTransformer you will first have to create a CountVectorizer to count the number of words (term frequency).


In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

logR=Pipeline([('vect',CountVectorizer()),
             ('tfidf',TfidfTransformer()),
             ('clf',LogisticRegression())
             ])

# logR=Pipeline([('vect',CountVectorizer(binary=True)),
#              ('tfidf',TfidfTransformer()),
#              ('clf',LogisticRegression())
#              ])

logR.fit(x_train,y_train) # takes 31s to run
y_pred_logR=logR.predict(x_test)

print(f'Accuracy: {accuracy_score(y_pred_logR,y_test)}')

Accuracy: 0.9373361925184656


# More Logistic Regression

In [36]:
# Convert labels from categorical to numeric
label_map={'Finance':0,'Culture':1, 'Medical':2, 'Tech':3, 'Sports': 4, 'Religion':5, 'Politics':6}
dataset['numeric_class']=dataset['class'].apply(lambda x: label_map[x])
dataset.numeric_class

3333     0
11812    4
7697     6
9077     5
12208    3
        ..
2801     0
12661    3
6874     6
3770     0
9422     5
Name: numeric_class, Length: 13988, dtype: int64

In [37]:
x=dataset['abstract_article']
y=dataset['numeric_class']
feature_train,feature_test,target_train,target_test= train_test_split(x,y,test_size=0.2,random_state=42)

***TfidfVectorizer:*** with Tfidfvectorizer you compute the word counts, idf and tf-idf values all at once.\
***GridSearchCV:*** is the process of performing hyperparameter tuning in order to determine the optimal values for a given model. It’s essentially a cross-validation technique.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
# from sklearn.metrics import confusion_matrix, classification_report

pipe=make_pipeline(TfidfVectorizer(), LogisticRegression())

# Similar to fine-tuning parameters (test these values and returns the best for the classifier)
param_grid= {'logisticregression__C': [0.01, 0.1, 1, 10, 100]}
model= GridSearchCV(pipe, param_grid, cv=5) # takes 10min to run

model.fit(feature_train,target_train)

In [40]:
# Performance Evaluation
y_pred_logR2=model.predict(feature_test)
print(f'Accuracy: {accuracy_score(y_pred_logR2,target_test):.2f}')
# print(classification_report(y_pred_logR2, target_test))

Accuracy: 0.94


# Applying Naive Bayes

***Multinomial Naive Bayes algorithm:*** is a probabilistic learning method based on the Bayes theorem and predicts the tag of a text. It calculates the probability of each tag for a given sample and then gives the tag with the highest probability as output.

In [41]:
from sklearn.naive_bayes import MultinomialNB

naiveB=Pipeline([('vect',CountVectorizer()),
                 ('tfidf',TfidfTransformer()),
                 ('clf',MultinomialNB())
                ])

naiveB.fit(x_train,y_train)
y_pred_naiveB=naiveB.predict(x_test)
print(f'accuracy: {accuracy_score(y_pred_naiveB,y_test)}')

accuracy: 0.9239933285680247


# Applying Neural Network

In [42]:
# Using less data
x=dataset['abstract_article'][:5000]
y=dataset['class'][:5000]
x_train,x_test,y_train,y_test= train_test_split(x,y,test_size=0.2,random_state=42)
x_train.shape,x_test.shape

((4000,), (1000,))

In [43]:
# Tokenizing and converting text to matrix
from keras.preprocessing.text import Tokenizer

tokenizer=Tokenizer(num_words=None,lower=False)
tokenizer.fit_on_texts(x)

x_train_tok=tokenizer.texts_to_matrix(x_train,mode='tfidf')
x_test_tok=tokenizer.texts_to_matrix(x_test,mode='tfidf')

In [44]:
# One-Hot Encoding of Classes
from sklearn.preprocessing import LabelEncoder
import keras.preprocessing.text

label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.fit_transform(y)

num_labels = len(set(y_encoded))
y_train_encoded = label_encoder.fit_transform(y_train) 
y_test_encoded = label_encoder.fit_transform(y_test)
# One-Hot Encoding
y_train_encoded_ = keras.utils.to_categorical(y_train_encoded,num_labels)
y_test_encoded_ = keras.utils.to_categorical(y_test_encoded,num_labels)

In [46]:
# Building Neural Network Model
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

max_words = len(tokenizer.word_index) + 1

model=Sequential()
model.add(Dense(1024,input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_labels))
model.add(Activation('sigmoid'))

***categorical_crossentropy:*** Used as a loss function for multi-class classification model where there are two or more output labels. The output label is assigned one-hot category encoding value in form of 0s and 1. The output label, if present in integer form, is converted into categorical encoding using keras.utils to_categorical method.

In [47]:
# Train
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
model.fit(x_train_tok, y_train_encoded_, batch_size=100, epochs=5, verbose=1, validation_split=0.1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fa383ce5b50>

In [49]:
# Test
eval_val = model.evaluate(x_test_tok, y_test_encoded_, verbose=0)
print("Loss\t\t" , 'categorical_accuracy\t')
print(eval_val)

Loss		 categorical_accuracy	
[0.4528392553329468, 0.9269999861717224]
