In [None]:
#Import needed Packages

#Data Analysis 
import pandas as pd
import numpy as np

#Data Pre-processing
import re
import pyarabic.araby as arabic
import pyarabic.araby_const
from nltk.corpus import stopwords
import string 
import nltk
import arabicnlp
from itertools import chain
from nltk.tokenize import word_tokenize #pre-processing (toknization)
from collections import defaultdict, Counter
import tashaphyne.arabic_const as pre_arabic
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud


# visualization
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import matplotlib



#Modeling

from sklearn.metrics.scorer import make_scorer
from sklearn.model_selection import train_test_split, cross_val_score,cross_validate ,GridSearchCV
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score,confusion_matrix,make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB

#settings
pd.set_option('display.max_colwidth',100000000000)
%matplotlib inline
# nltk.download('stopwords')
# from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
#! pip install arabicnlp

# Arabic Poetry Classification Problem


### Arabic Poetry Classification Workflow Steps:
The competition solution workflow goes through six stages described in the [Data Science Solutions book](https://leanpub.com/data-science-solutions).

* Problem Definition.
* Acquire training and testing data.
* Analyze, identify patterns, and explore the data.
* Data cleansing
* Model, predict and solve the problem.
* Results discussion and report.

In general , the workflow indicates a general sequence of how each stage may follow the other. However, there are use cases with exceptions.




## 1. Problem Statement 

The analysis of Arabic poetic text using machine learning is not an easy task, as the attributes of Arabic
poetry differ from that of other Arabic texts. 
In this project, a classification model is built to classify Arabic poetry based on the poet's origin.

## 2. Loading the Dataset 
The dataset contains around 11K poems that span from the 6th century to the present day. It consists of 9 features and 11604 instances, and it includes 11594 poems by 591 poets. The total number of words is 1741848 (before pre-processing).

Moreover, the dataset was scraped from [Adab website]( http://adab.com/).

In [None]:
poem_df=pd.read_csv('poems_11K_sample.csv')
poem_df.shape #check the number of instance and featuers 

## 3. Exploratory Data Analysis (EDA) and Pre-Processing 




**Data Inspection**
* What data type is data?
* How many poem does dataset contain?
* Inspect the first data point, what does it look like?
* How many poem of each category does dataset contain?


I started by describing the data to get a first understanding of it. In the following sections, I answer several questions that help to understand the dataset.



In [None]:
poem_df.head() #viewing the data


### 3.1 Which features are available in the dataset?

In [None]:
poem_df.columns.values

### 3.2 Which is the data type of each feature?

    From the following results, we observed that:

    - Categorical: poem_style, poem_link, poem_text, poem_title, poet_link, poet_name, and poet_cat

    - Mixed (Numerical and Categorical): poem_id, and poet_id

   Besides, this helps to select the appropriate plots for visualization

In [None]:
poem_df.dtypes

In [None]:
numerical_uniques=poem_df.select_dtypes(include='number').nunique()#The number of unique values per numerical feature
numerical_uniques

In [None]:
nonnumerical_uniques = poem_df.select_dtypes(exclude='number').nunique()#The number of unique values per nonnumerical feature
nonnumerical_uniques

In [None]:
poem_df.info()

In [None]:
poem_df.columns = poem_df.columns.str.strip() #Remove space from the dataframe columns for ease of use in analysis

### 3.3 What is the distribution of categorical features? 
(Statistics summary on the dataset) 



In [None]:
poem_df.describe(include=['O'])

We can observe that:
* poem_style takes three possible values. The 'فصحى' style is used in most poems (top='فصحى', freq=11581).
* poet_cat has 26 possible values, with 33% being 'العصر العباسي' (top='العصر العباسي', freq=3836/count=11604).
* poem_link values are unique across the dataset (count=unique=11604).
* 'ابن الرومي' is the most frequent poet_name, with around 3% of the poems.
* poet_link has a high ratio (94%) of duplicate values (unique=591).

### 3.4 Is there any null or unknown values?

In [None]:
# Summarise missing values per feature: absolute count and share of all rows.
print('The total and percentages of null values per feature:\n')
null_mask = poem_df.isnull()
missing_data = pd.DataFrame({
    'Total_Missing': null_mask.sum(),
    # The mean of a boolean mask is the fraction of nulls, so the percentage
    # follows the actual number of rows instead of a hard-coded 11604.
    '%_Missing': null_mask.mean() * 100,
})
missing_data
# The data is clean: there are no null values in the dataset.

In [None]:
poem_df.poem_style.value_counts() #there is an unknown featuers (-)

#####  In this situation, cross-tabulation was used to infer the unknown value, under the hypothesis that one feature is influenced by another.

In [None]:
pd.crosstab(poem_df.poem_title,poem_df.poem_style)

In [None]:
 poem_df['poem_style']=poem_df['poem_style'].replace('-','فصحى')

In [None]:
# Dataset size, and how many poem titles are shared by more than one poem.
print('Total number of Poem: {}'.format(len(poem_df.poem_text)))
title_counts = poem_df.poem_title.value_counts()
# The count is taken over poem_title, so the label names poem titles
# (it previously said "poet", which did not match the computation).
print('Number of poem titles that appear multiple times: {}'.format(np.sum(title_counts > 1)))



### A. Exploratory Data Analysis (EDA)


__The observations from the following visualization are:__


>1. The poet category (the predicted value) is 33% العصر العباسي and 10% العصر الاندلسي. Therefore, like most NLP datasets, this dataset seems to be slightly imbalanced, but we are not sure (yet)
if this imbalance will be significant. We’ll come back to this in the modeling section.  
>2. 3.4% of the poems were written by ابن الرومي, followed by 2.8% by أبوالعلاء المعري	
> (This is reasonable because ابن الرومي belongs to العصر العباسي). 
>3. 0.1% of the poems were not written in الفصحى


In [None]:
#look into the distribution of each feature

def features_distribution(col, ax):
    """Draw a bar chart of the value counts of ``poem_df[col]`` on ``ax``.

    Null entries are filtered out before counting. The column name is used as
    the x-axis label and (followed by a space) as the subplot title.

    Returns the same ``ax`` that was passed in.
    """
    column = poem_df[col]
    present_values = column[column.notnull()]
    present_values.value_counts().plot(kind='bar', facecolor='y', ax=ax)
    ax.set_xlabel('{}'.format(col), fontsize=20)
    ax.set_title("{} ".format(col), fontsize= 18)
    return ax

# 3x3 grid of subplots: one bar chart per dataset column.
f, ax = plt.subplots(3,3, figsize = (22,15))
f.tight_layout(h_pad=9, w_pad=2, rect=[0, 0.03, 1, 0.93])
columns = ['poem_id', 'poem_link', 'poem_style', 'poem_text', 'poem_title','poet_cat', 'poet_id', 'poet_link', 'poet_name']

# Walk the grid row by row, giving each axis the next column name.
counter = 0
for row_axes in ax:
    for cell_ax in row_axes:
        features_distribution(columns[counter], cell_ax)
        counter += 1
feature_plot = plt.suptitle("Initial Distributions of features", fontsize= 25)

In [None]:
poem_df.poet_cat.value_counts()[:10].plot(kind='bar');

# The dataset comes with 26 labels.The label indicates the poet origin.
# Our job is to predict the Arabic poetry based on the poet origin in the testing dataset.

In [None]:
# Rank poets by their number of poems and plot the ten with the most poems.
poems_per_poet = poem_df.groupby(['poet_name']).size().reset_index(name='counts')
top_poet_name = poems_per_poet.sort_values('counts', ascending=False, inplace=False)
top_ten = top_poet_name[:10]

plt.figure(figsize=(12, 8))
sns.set_style("darkgrid", {'axes.grid' : False})
sns.barplot(x=top_ten.poet_name, y=top_ten.counts)
plt.xlabel("Poet Name", fontsize=15)
plt.ylabel("Number of Poem", fontsize=15)
plt.title('The Name of the Most Popular Poet ')
plt.xticks(rotation=50)
plt.show()


In [None]:
# How many poems does each category contain?
# Count the poems per poet category and sort from largest to smallest.
poem_category = poem_df.groupby(['poet_cat']).agg({'poem_title':['count']}).sort_values([('poem_title', 'count')],ascending=False)

plt.figure(figsize=(12, 8))
sns.set_style("darkgrid", {'axes.grid' : False})
# Pass x and y by keyword: recent seaborn releases no longer accept them
# positionally, and this matches the other barplot call in this notebook.
sns.barplot(x=poem_category.index, y=poem_category[('poem_title', 'count')])

plt.title('The Number of poem in Each Category ')
plt.xlabel("Poet category", fontsize=15)
plt.ylabel("Number of Poem", fontsize=15)
plt.xticks(rotation=50)
plt.show()



### B. Data Pre-Processing


**Common data cleaning steps on all text:**
* Strip Arabic Diacritics
* Remove punctuation
* Remove numerical values
* Remove common non-sensical text (/n)
* Strip Elongation
* Tokenize
* Remove stop words

**More data cleaning steps after tokenization:**
* Normalize
* Deal with typos
* And more...

In [None]:
def count_words(text):
    """Return the number of tokens ``word_tokenize`` produces for ``text``.

    Used to count the words of a poem before pre-processing.
    """
    return len(word_tokenize(text))

In [None]:
results_words=poem_df.poem_text.apply(lambda x:count_words(x)) #total number of poem words = 1741848 before pre-preocessing
results_words.sum() 

#### 3.5 Remove Punctuations and Special Character

In [None]:
#remove_punctuations_and_specialCharacter
def remove_specialCharacter(text):
    """Strip bracketed notes, punctuation, digit-bearing tokens and line breaks.

    Steps, in order:
      1. remove anything enclosed in square brackets;
      2. remove every ASCII punctuation character (``string.punctuation``);
      3. remove whole tokens that contain at least one digit;
      4. remove curly quotes and the ellipsis character;
      5. remove the Arabic comma, semicolon and question mark;
      6. turn each newline into a single space.

    Returns the cleaned string.
    """
    # Raw strings keep the regex escapes from being read as (invalid) Python
    # string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub('[‘’“”…]', '', text)
    # string.punctuation is ASCII-only, so Arabic marks need their own pass.
    text = re.sub('[،؛؟]', '', text)
    # A newline separates verses: replace it with a space so the last word of
    # one line is not fused with the first word of the next.
    text = re.sub(r'\n', ' ', text)
    return text

In [None]:
rt=string.punctuation
rt

In [None]:
# Extra punctuation tokens (including Arabic marks) to combine with the
# ASCII punctuation from the standard library.
puncs = [",",".","``","''",";","?","--","-",")","(",":","!","...","|","…","،","..","\"","؟"]
pun_nltk=list(string.punctuation)
# Join the tokens into one string. str() on a list would return the list's
# printed form, with brackets, quotes and ", " separators mixed in.
allpun = ''.join(puncs + pun_nltk)

#### 3.6 Remove Arabic Diacritics

In [None]:
def remove_diacritics(text):
    """Return ``text`` after passing it through pyarabic's ``strip_harakat``."""
    stripped = arabic.strip_harakat(text)
    return stripped

#### 3.7 Normalization

In [None]:
def normalize_arabic(text):
    """Normalise Arabic letter variants using tashaphyne's compiled patterns.

    Three substitutions are applied in order:
      1. ``LAMALEFAT_PAT`` matches are replaced by ``LAM`` followed by ``ALEF``;
      2. ``ALEFAT_PAT`` matches are replaced by ``ALEF``;
      3. ``HAMZAT_PAT`` matches are replaced by ``HAMZA``.

    Returns the normalised string.
    """
    lam_alef = '{}{}'.format(pre_arabic.LAM, pre_arabic.ALEF)
    substitutions = (
        (pre_arabic.LAMALEFAT_PAT, lam_alef),
        (pre_arabic.ALEFAT_PAT, pre_arabic.ALEF),
        (pre_arabic.HAMZAT_PAT, pre_arabic.HAMZA),
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)
    return text

#### 3.8 Remove Stopwords 

As can be observed from the following code, the most common words are stop words such as conjunctions, prepositions, and pronouns that should be deleted to prevent their results from having an impact on the classification.

I searched and found the largest list of Arabic stop words on [Github](https://github.com/mohataher/arabic-stop-words) so, I prefer to use it rather than NLTK stopwords.

In [None]:
#looking into the most common words in poem text before pre-processing

counter = defaultdict(int)
for poem in poem_df.poem_text:
    for word in word_tokenize(poem):
        counter[word] += 1
        
common_word = Counter(counter)

common_word.most_common(10)

In [None]:
#arabic_stopwords=stopwords.words('arabic')


# Location of the Arabic stop-word list, read line by line below.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# local copy of the list before running the notebook elsewhere.
STOPWORDS_PATH = '/Users/Mony/Desktop/arabic-stop-words/list.txt'

# Read as UTF-8 explicitly (the platform default encoding may not decode
# Arabic text) and let the context manager close the file even on error.
with open(STOPWORDS_PATH, 'r', encoding='utf-8') as stopwords_file:
    stop_words = [line.strip() for line in stopwords_file]

# Set copy for constant-time membership tests; `stop_words` itself stays a
# list so any other code that uses it keeps working.
_stop_words_lookup = frozenset(stop_words)


def remove_stopwords(text):
    """Return ``text`` with every stop word removed.

    The text is tokenised with ``word_tokenize``; tokens found in the stop-word
    list are dropped and the remaining tokens are joined with single spaces.

    NOTE(review): in this notebook the text is normalised before this step
    while the stop-word list is used as read from disk, so list entries
    written with hamza/alef variants may no longer match. Confirm whether
    the list should be normalised the same way.
    """
    tokens = word_tokenize(text)
    return ' '.join(word for word in tokens if word not in _stop_words_lookup)

#### 3.9 Remove Repeating Letters

In [None]:
def remove_repeating_letters(text, min_run=3):
    """Collapse elongated runs of one character down to a single character.

    Only runs of at least ``min_run`` identical consecutive characters are
    collapsed. With the default of 3, legitimately doubled letters (for
    example the two lams in "الله") are left untouched, while elongations
    such as "جمييييل" become "جميل".

    Parameters:
        text: the string to clean.
        min_run: shortest run length that is collapsed; must be at least 2.
            ``min_run=2`` collapses every repeated character.

    Returns the cleaned string.

    Raises:
        ValueError: if ``min_run`` is smaller than 2.
    """
    if min_run < 2:
        raise ValueError('min_run must be at least 2')
    # (.) captures one character; \1{k,} requires at least k further copies,
    # so the whole match is a run of min_run or more identical characters.
    pattern = r'(.)\1{%d,}' % (min_run - 1)
    return re.sub(pattern, r'\1', text)

#### 3.10 Remove the Elongation

In [None]:
def strip_elongation(text):
    """Delete every character of tashaphyne's ``TATWEEL`` constant from ``text``."""
    tatweel_class = re.compile(r'[%s]' % pre_arabic.TATWEEL)
    return tatweel_class.sub('', text)

In [None]:
clean_text=poem_df.copy() #copy contant of the orginal dataframe

In [None]:
# poem_text pre-processing: each cleaning step is applied to every poem,
# in exactly this order.
poem_text_steps = (
    remove_specialCharacter,
    remove_diacritics,
    strip_elongation,
    remove_repeating_letters,
    normalize_arabic,
    remove_stopwords,
)
for step in poem_text_steps:
    clean_text['poem_text'] = clean_text['poem_text'].apply(step)


In [None]:
clean_text.poem_text

In [None]:
# Most common words in the poem text after pre-processing.
# Tokenise every cleaned poem. str() on the Series would only give its
# printed form (index labels, an ellipsis for hidden rows and a dtype
# footer), so the counts would not reflect the full corpus.
fdist1 = nltk.FreqDist(
    chain.from_iterable(word_tokenize(poem) for poem in clean_text.poem_text)
)
fdist1.most_common(10)

In [None]:
# The total number of poem wordws after pre-processing was 1315585

# after_words=clean_text.poem_text.apply(lambda x:count_words(x))
# after_words.sum()



In [None]:
# poem_title pre-processing: cleaning steps applied to every title, in order.
poem_title_steps = (
    remove_specialCharacter,
    remove_diacritics,
    normalize_arabic,
    remove_repeating_letters,
    remove_stopwords,
)
for step in poem_title_steps:
    clean_text['poem_title'] = clean_text['poem_title'].apply(step)

In [None]:
clean_text.poem_title

In [None]:
# poet_name pre-processing: cleaning steps applied to every poet name, in order.
poet_name_steps = (
    remove_specialCharacter,
    remove_diacritics,
    normalize_arabic,
    remove_repeating_letters,
    remove_stopwords,
)
for step in poet_name_steps:
    clean_text['poet_name'] = clean_text['poet_name'].apply(step)

In [None]:
clean_text.poet_name

In [None]:
# poet_cat pre-processing: cleaning steps applied to every category label,
# in order (stop words are not removed from the labels).
poet_cat_steps = (
    remove_specialCharacter,
    remove_diacritics,
    normalize_arabic,
    remove_repeating_letters,
)
for step in poet_cat_steps:
    clean_text['poet_cat'] = clean_text['poet_cat'].apply(step)


In [None]:
clean_text.poet_cat


#### 3.11 **Derive a new feature (length of poem text) from the poem text feature that may be useful in the modelling**

In [None]:
def length(text):
    """Return the number of items in ``text`` (its character count for a string)."""
    size = len(text)
    return size

In [None]:
clean_text['length'] = clean_text['poem_text'].apply(length)
clean_text.head(3)

In [None]:
clean_text.poet_cat.value_counts()

#### Assumptions based on data analysis:

- The poem_link and poet_link features may be dropped from the analysis: poet_link contains a high ratio of duplicates (94%),
  and there may be no correlation between the links and poet_cat.
- poem_id and poet_id may be dropped from the dataset as they do not contribute to poet_cat.

##  4.Converting Text to Numbers

In [None]:
processed_df=clean_text.copy()

In [None]:
processed_df.columns

#### 4.1 Deleting Unnecessary Features 

In [None]:
processed_df=processed_df.drop(columns={'poem_id', 'poem_link','poet_id', 'poet_link'}) #drop unwanted featuers 

In [None]:
processed_df.columns #check the remaining features

#### 4.2 Dealing with Categorical Values  ( One Hot Encoding and Ordinal Converting)


**A- poem_style**

In [None]:
processed_df['poem_style']=pd.get_dummies(processed_df.poem_style, drop_first=True)

**B- poet_cat**

According to '
الموجز في الشعر العربي، دراسة في العصور المختلفة للشعر العربي'
تأليف فالح الحجية، مراجعة وتقديم د.شوقي ضيف' 

They arranged the time period of poem into the following:

* شعر العصر الجاهلي
* "شِعر صدر الإسلام" شعر العصر الاسلامي 
* الشعر العباسي 
* الشعر الاندلسي
* الشعر العثماني
* الشعر الحديث 

So, I transformed the Categorical data of string type into numerical values which the model can understand.

In [None]:
processed_df['poet_cat']=processed_df.poet_cat.replace({'العصر الجاهلي':1,'العصر الاسلامي':2,'العصر العباسي':3,'العصر الاندلسي':4,'شعراء العراق والشام':5})

In [None]:
# Collapse all country-level categories into one class, reducing the number
# of classes from 26 to 6.
# The code is the integer 6, not the string '6': the first five categories are
# mapped to the integers 1-5 and the later filter `poet_cat == 6` compares
# against an integer, so a string code would never match it.
modern_categories = [
    'قطر', 'موريتانيا', 'افغانستان', 'الكويت', 'ايران', 'الاردن', 'ليبيا',
    'المغرب', 'تونس', 'السودان', 'الجزاءر', 'البحرين', 'الامارات', 'عمان',
    'اليمن', 'فلسطين', 'السعودية', 'العراق', 'مصر', 'لبنان', 'سوريا',
]
processed_df['poet_cat'] = processed_df.poet_cat.replace(dict.fromkeys(modern_categories, 6))




In [None]:
processed_df.poet_cat.value_counts() #just to check after replacement stage

**Histogram of poem text lenght of each poet category**

As we can see, the distributions coincide, so it is better to leave out text length as a feature for predictive modelling.

In [None]:
# One sub-frame per encoded poet category (codes 1 through 6).
data1, data2, data3, data4, data5, data6 = (
    processed_df[processed_df['poet_cat'] == category_code]
    for category_code in range(1, 7)
)

In [None]:


matplotlib.rcParams['figure.figsize'] = (12.0, 6.0)
bins = 500

# (sub-frame, opacity, legend label) for each category, drawn in this order.
histogram_specs = [
    (data1, 0.6, 'العصر الجاهلي'),
    (data2, 0.8, 'العصر الاسلامي'),
    (data3, 0.4, 'العصر العباسي'),
    (data4, 0.6, 'العصر الاندلسي'),
    (data5, 0.8, 'العراق والشام'),
    (data6, 0.4, 'الشعر الحديث'),
]
for category_frame, opacity, era_label in histogram_specs:
    plt.hist(category_frame['length'], alpha=opacity, bins=bins, label=era_label)

plt.xlabel('Poem Text Length')
plt.ylabel('Numbers')
plt.legend(loc='upper right')
plt.xlim(0,300)
plt.grid()
plt.show()

**C- poet_name**

In [None]:
#get binary values for category columns

def explode(frame,cat_col,sep=','):
    """Add one binary indicator column per category found in ``cat_col``.

    Parameters:
        frame: input dataframe; it is copied, not modified.
        cat_col: name of the column holding ``sep``-separated category strings.
        sep: separator between the categories inside one cell.

    Returns:
        A new dataframe in which ``cat_col`` holds the list of categories of
        each row, followed by one 0/1 column per distinct category (in sorted
        order) that tells whether the row contains that category.
    """
    df=frame.copy()
    # NOTE(review): as rendered here both arguments of replace() look like a
    # plain space; if the first one is a non-breaking space this call
    # normalises it. Kept unchanged, please confirm.
    df[cat_col]=df[cat_col].apply(lambda x: x.replace(' ',' ').split(sep))
    # Gather the distinct categories with a single pass. Summing the lists
    # re-copies the growing list on every row and fails on an empty column.
    categories=sorted(set(chain.from_iterable(df[cat_col])))
    # Convert each row's list to a set once so the membership tests are cheap.
    row_sets=[set(cat_list) for cat_list in df[cat_col]]
    df_cat=pd.DataFrame(
        {cat: [int(cat in row) for row in row_sets] for cat in categories},
        index=df.index,
        columns=categories,
    )
    return pd.concat([df,df_cat],axis=1)

In [None]:
processed_df=explode(processed_df,'poet_name',sep='/')

**D- poem_title**

In [None]:
processed_df=explode(processed_df,'poem_title',sep='/')

In [None]:
processed_df.tail()

**E- poem_text**



**Why did I not use TF (CountVectorizer)? 
Because counting the raw number of words in each document gives more weight to longer documents than to shorter ones. To avoid this, I use TF-IDF.**

In [None]:
# TF-IDF over unigrams and bigrams, keeping the original casing.
# An integer min_df is an absolute cutoff: a term (for example the word
# "Lubna") must appear in at least 20 documents to be kept. A float such as
# min_df=0.1 would instead require the term to appear in at least 10% of all
# documents.
tfidf_vector = TfidfVectorizer(lowercase=False, ngram_range=(1, 2), min_df=20)

# Turn every poem into a dense TF-IDF row vector (the vectorizer's default
# settings normalise each row).
tfidf_sparse = tfidf_vector.fit_transform(processed_df.poem_text)
text_poem_df = tfidf_sparse.toarray()
labels = processed_df.poet_cat

n_poems, n_features = text_poem_df.shape
print("Each of the %d poem text is represented by %d features (TF-IDF score of unigrams and bigrams)" % (n_poems, n_features))

In [None]:
text_poem_df.max() #to ensure its normalized 

In [None]:
# Column labels for the TF-IDF matrix. Newer scikit-learn releases replaced
# get_feature_names() with get_feature_names_out(); use whichever exists so
# the cell runs on both old and new versions.
if hasattr(tfidf_vector, 'get_feature_names_out'):
    feature_names = tfidf_vector.get_feature_names_out()
else:
    feature_names = tfidf_vector.get_feature_names()
convert_df=pd.DataFrame(data=text_poem_df,columns=feature_names)
convert_df.head()

In [None]:
# # check the most popular word in text poem
# # text=' '.join(word for word in processed_df.poem_text)

# reshaped_texts = arabic_reshaper.reshape(text)
# arabic_texts = get_display(reshaped_texts)
# # wordcloud=WordCloud( max_words=200).generate(arabic_texts)
# # plt.figure(figsize=(10,10))
# # plt.imshow(wordcloud)
# # plt.axis("off")
# # plt.show()

In [None]:
#adding all features into one dataframe

In [None]:
#final_df=convert_df.append(processed_df,ignore_index=False,sort=False)
#final_df=processed_df+convert_df

##  5.Prepare Train and Test Data sets 

**Split dataset for training and testing**

In [None]:
#X = processed_df.drop(columns=['poet_cat'])
X=convert_df
y = processed_df.poet_cat.astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

In [None]:
X.info()

In [None]:
y

In [None]:
X_test.shape

In [None]:
X_train.shape

In [None]:
y_train.shape

## 6.Building The Models

**The classification models build are:**

* Random Forest
* Linear Support Vector Machine
* Multinomial Naive Bayes
* K-NN
* Logistic Regression

sklearn.pipeline could also be used. I chose the models mentioned above because they have been widely
used in related studies, for instance, [Arabic Poetry Authorship Attribution using Machine
Learning Techniques](https://thescipub.com/pdf/10.3844/jcssp.2019.1012.1021) and [Machine Learning for Authorship Attribution in Arabic
Poetry](https://pdfs.semanticscholar.org/5f6c/9a176f5d4c8d5d48b36051ba61ff75175d7f.pdf)

### 6.1 Baseline Model 

#### A. Logistic Regression Model (Baseline Model) with Default Hyperparameter

In [None]:
print('Logistic Regression Model:')
logmodel = LogisticRegression()
cross_val=pd.DataFrame(cross_validate(logmodel,X_train,y_train,cv=10,return_train_score=True,scoring=['accuracy','balanced_accuracy', 'f1_weighted']))
pd.DataFrame(cross_val.mean())

**From the above results there is no overfitting**

### 6.2 Model with Default Hyperparameter

#### A. RandomForest Model with Default Hyperparameter

In [None]:
from sklearn.metrics import f1_score
print('RandomForest Model:')
#scorer = make_scorer(f1_score, average = 'weighted')
RF_CV = pd.DataFrame(cross_validate(RandomForestClassifier(), X_train, y_train, cv = 10,return_train_score=True,
                        scoring = ['accuracy','balanced_accuracy', 'f1_weighted']))
pd.DataFrame(RF_CV.mean())

In [None]:
#RandomForestClassifier.roc_auc

**The above results show overfitting, so we need to tune the number of trees (default is 10)**

#### B. K-NN Model with Default Hyperparameter

In [None]:
print('K-NN Model:')
knn= KNeighborsClassifier(n_neighbors=8)
knn_croos_val=cross_validate(knn,X_train,y_train,return_train_score=True,cv=10,scoring=['accuracy','balanced_accuracy', 'f1_weighted'])



In [None]:
pd.DataFrame(knn_croos_val).mean()

#### C.  SVM Model with Default Hyperparameter

In [None]:
print('SVM Model:')
SVM=SVC( C=1.0,kernel='linear')
SVM_cross_val=cross_validate(SVM, X_train, y_train,return_train_score=False,cv=10,scoring=['accuracy','balanced_accuracy', 'f1_weighted'])


In [None]:
# cross_validate returned a plain dict here, which has no mean() method;
# wrap it in a DataFrame first, as done for the K-NN results.
pd.DataFrame(SVM_cross_val).mean()

#### D.Naive Bayes Model with Default Hyperparameter

Multinomial and Bernoulli Naive Bayes model slightly different things. If you have multiple discrete features to consider, 
you have to use Multinomial NB. But if you only have a single feature to consider, then you can 
make a modelling choice between the two (see the linked discussion below).


https://datascience.stackexchange.com/questions/27624/difference-between-bernoulli-and-multinomial-naive-bayes

We are going to train a Naive Bayes classifier. It is a good choice given that we have a medium-sized dataset: NB classifiers scale well and have historically been used in NLP tasks. We will train Multinomial and Bernoulli NB classifiers, since they almost always outperform the Gaussian NB classifier in NLP tasks.

In [None]:
NB=MultinomialNB()
NB_cross_val=pd.DataFrame(cross_validate(NB,X_train,y_train,cv=10,scoring=['accuracy','balanced_accuracy', 'f1_weighted'], return_train_score=True))
print(pd.DataFrame(NB_cross_val.mean()))

In [None]:
## RandomForestClassifier().get_params().keys()  #To get hyperparameters of each classifers

**From the above results there is no overfitting**

### 6.3 Models with GridSearchCV 

#### A. K-NN Model with Best Hyperparameter

In [None]:
k_value=[x for x in range (1,10)]

In [None]:
knn_grid_parameters= dict(n_neighbors= k_value)

In [None]:
score={'balanced_score':make_scorer(metrics.balanced_accuracy_score),'f1_weighted':make_scorer(f1_score, average = 'weighted')}

In [None]:
knn_GridSearchCV=GridSearchCV(estimator=KNeighborsClassifier(), param_grid=knn_grid_parameters,refit='balanced_score',cv=10,scoring=score)


In [None]:
knn_GridSearchCV.fit(X_train,y_train)

In [None]:
knn_GridSearchCV.best_score_

In [None]:
pd.DataFrame(knn_GridSearchCV.cv_results_)

In [None]:
knn_GridSearchCV.best_params_

#### B. SVM Model with Best Hyperparameter

In [None]:
# Two sub-grids: a linear kernel, and an RBF kernel with the gamma values
# produced by np.arange(0.01, 0.02, 0.01).
SVM_parameters=[{'C':[1,10],'kernel':['linear']},
           {'C':[1,10],'kernel':['rbf'], 'gamma':list(np.arange(0.01,0.02,0.01))}]
# `score` holds several metrics, so GridSearchCV must be told which one picks
# the best model. Without refit=<scorer key> the search cannot be fitted and
# best_score_ / best_params_ are unavailable. 'balanced_score' matches the
# other grid searches in this notebook.
SVM_GridSearchCV=GridSearchCV(estimator=SVC(), param_grid=SVM_parameters,refit='balanced_score',scoring=score,cv=5)



In [None]:
# GridSearchCV has no mean() method (and the search is not fitted yet at this
# point), so display the configured search object instead.
SVM_GridSearchCV

In [None]:
SVM_GridSearchCV.fit (X_train,y_train)
SVM_accuracy=SVM_GridSearchCV.best_score_
SVM_accuracy

In [None]:
SVM_GridSearchCV.best_params_

###  C. RandomForest  Model with Best Hyperparameter ( Selected Model)


In [None]:
# Hyperparameter grid for the random forest.
# 'auto' is left out of max_features: for a classifier it is the same setting
# as 'sqrt', so it only duplicated grid points, and newer scikit-learn
# releases reject it.
RandomForest_parameters={ 
    'n_estimators': [100, 200, 500],
    'max_features': ['sqrt', 'log2'],
    'criterion' :['gini', 'entropy']
}

# Multi-metric search; the model is selected and refitted on 'balanced_score'.
RandomForest_GridSearchCV=GridSearchCV(estimator=RandomForestClassifier(), param_grid=RandomForest_parameters,refit='balanced_score', cv=10,scoring=score)

In [None]:
score

In [None]:
RandomForest_GridSearchCV.fit(X_train,y_train) 

In [None]:
RandomForest_GridSearchCV.best_score_

In [None]:
pd.DataFrame(RandomForest_GridSearchCV.cv_results_)

In [None]:
RandomForest_GridSearchCV.best_params_

In [None]:
print('RandomForest Model:')
# max_features='sqrt' is the same setting that 'auto' selected for a
# classifier, and it is also accepted by newer scikit-learn releases.
# random_state pins the forest so the reported scores are reproducible
# (42 matches the seed used for the train/test split).
# NOTE(review): the hyperparameters are hard-coded; confirm that they match
# RandomForest_GridSearchCV.best_params_.
RandomForestClassifie=RandomForestClassifier(criterion='gini',max_features='sqrt',n_estimators=100,oob_score=True,random_state=42)
RandomForestClassifie.fit(X_train,y_train)

In [None]:
RandomForestClassifie_predication=RandomForestClassifie.predict(X_test)

In [None]:
metrics.balanced_accuracy_score(y_test,RandomForestClassifie_predication )

In [None]:
feature_imp = pd.Series(RandomForestClassifie.feature_importances_,index=X.columns).sort_values(ascending=False)
feature_imp

In [None]:
# Bar plot of the five most important features.
# The figure is created before drawing: calling plt.figure() after the plot
# opened a second, empty 100x100-inch figure instead of resizing this one.
plt.figure(figsize=(10, 6))
sns.barplot(x=feature_imp[:5], y=feature_imp.index[:5])
# Add labels to the graph. No legend is drawn because the plot has no
# labelled artists to list.
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
plt.show()

#### D. Logistic Regression Model with Best Hyperparameter

In [None]:


#LogisticRegression().get_params().keys()
# Search over the regularisation strength C and the penalty type.
Logistic_parameters={'C':np.logspace(-3,3,7),'penalty':['l1','l2']}
# The liblinear solver supports both the l1 and l2 penalties. The lbfgs
# solver, the default in newer scikit-learn releases, rejects l1, which would
# make half of the grid fail.
Logistic_GridSearchCV=GridSearchCV(estimator=LogisticRegression(solver='liblinear'), param_grid=Logistic_parameters,refit='balanced_score', cv=10,scoring=score)




In [None]:
Logistic_GridSearchCV.fit(X_train,y_train)

# Only the last expression of a cell is rendered, so the intermediate results
# are printed explicitly.
print('Best score:', Logistic_GridSearchCV.best_score_)
print('Best estimator:', Logistic_GridSearchCV.best_estimator_)
print('Best parameters:', Logistic_GridSearchCV.best_params_)

pd.DataFrame(Logistic_GridSearchCV.cv_results_)



#### E. Naive Bayes Model with Best Hyperparameter

https://stackoverflow.com/questions/39828535/how-to-tune-guassiannb

## 6.Evaluating The Models


In [None]:
RF_ConfusionMatrix = pd.DataFrame(confusion_matrix(y_test,RandomForestClassifie_predication))
#knn_ConfusionMatrix = confusion_matrix(y_test,knn_predictions)



print('The Confusion Matrix for Random Forest Model is :','\n')

#print('\n The Confusion Matrix for K-NN Model is :','\n',knn_ConfusionMatrix)
RF_ConfusionMatrix

In [None]:
y_test.shape

In [None]:
print(classification_report(y_test,RandomForestClassifie_predication))

In [None]:
metrics.f1_score(y_test,RandomForestClassifie_predication , average='weighted')

## 7.Conclusion

In this project, a classification model was built to classify Arabic poetry based on the poet's origin. We ran multiple empirical experiments on the poems dataset scraped from the [Adab website]( http://adab.com/) to test our models. Overall, the results were assessed using measures commonly applied to multi-class text classification problems, namely the F1-weighted and balanced accuracy (balanced_score) measures. The TF-IDF vectorization approach was used; other approaches such as FastText and doc2vec could also be used.

Resources for some errors faced during the project: 

* Errors in the direction of the Arabic text

! pip install python-bidi
! pip install arabic_reshaper
In Python 3, the value does not need to be decoded.

[source](https://stackoverflow.com/questions/47057509/arabic-text-in-matplotlib?rq=1)