**Necessary libraries**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

**Load the dataset**

In [None]:
# Raw CSV hosted on GitHub; read it straight into a pandas DataFrame.
url = 'https://raw.githubusercontent.com/NNishat/Nishat_files/master/dataset.csv'
df = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,Sura without replacements,Attributes of God
0,"He said, ""O Adam, inform them of their names.""...",1
1,Do you not know that to Allah belongs the domi...,1
2,Originator of the heavens and the earth. When ...,1
3,"Indeed, in the creation of the heavens and ear...",0
4,"Allah - there is no deity except Him, the Ever...",1


**Define independent and dependent features**

In [None]:
# Text column and label column, taken from the raw frame.
X, y = df['Sura without replacements'], df['Attributes of God']

**Copy the dataframe into mesages variable**

In [None]:
# Work on an independent (deep) copy so the raw frame stays untouched.
mesages = df.copy(deep=True)

In [None]:
# Preview the first four rows of the copy.
mesages.iloc[:4]

Unnamed: 0,Sura without replacements,Attributes of God
0,"He said, ""O Adam, inform them of their names.""...",1
1,Do you not know that to Allah belongs the domi...,1
2,Originator of the heavens and the earth. When ...,1
3,"Indeed, in the creation of the heavens and ear...",0


**Reset the index of the dataframe**

In [None]:
# Renumber the rows 0..n-1. Reassigning (instead of inplace=True) and dropping
# the old index keeps this cell idempotent: the in-place form inserted a new
# 'index' / 'level_0' column on every re-run and eventually raised ValueError.
mesages = mesages.reset_index(drop=True)

**Library import**

In [None]:
import re
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

**Text preprocessing**

In [None]:
# Approach adapted from a YouTube tutorial on Naive Bayes and bag of words:
#   https://www.youtube.com/watch?v=8Mlc4-3tgzc
# Accompanying notebook:
#   https://github.com/Suji04/NormalizedNerd/blob/master/Introduction%20to%20NLP/Bag%20of%20Words.ipynb
import nltk

# word_tokenize needs the Punkt tokenizer data and clean_text needs the
# stop-word list. Recent NLTK releases load the tokenizer from 'punkt_tab'
# rather than 'punkt', so both are fetched to work across versions.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

def clean_text(df, column="Sura without replacements"):
    """Normalise a text column into space-joined, stemmed tokens.

    For every row the text is lower-cased, URLs and punctuation are removed,
    the remainder is tokenised with ``word_tokenize``, non-alphabetic tokens
    and English stop words (except "not") are dropped, and the surviving
    tokens are Porter-stemmed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text to clean.
    column : str, optional
        Name of the text column. Defaults to "Sura without replacements".

    Returns
    -------
    list of str
        One cleaned string per input row, in the original row order. Missing
        cells yield an empty string so the result stays aligned with the
        labels.
    """
    # Everything below is loop-invariant, so build it once rather than per row.
    # Raw string: same pattern as before, without invalid-escape warnings.
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    punctuation_pattern = re.compile(r"[,.\"!@#$%^&*(){}?/;`~:<>+=-]")
    table = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words("english"))
    # Keep "not": it is treated as a meaningful token rather than a stop word.
    stop_words.discard("not")
    stemmer = PorterStemmer()

    # Missing values become empty documents instead of crashing on .lower().
    lines = df[column].fillna("").astype(str).tolist()

    all_reviews = []
    for text in lines:
        text = text.lower()
        # Remove links first, then the listed punctuation characters.
        text = url_pattern.sub('', text)
        text = punctuation_pattern.sub('', text)
        tokens = word_tokenize(text)
        # Strip any punctuation the tokenizer left attached to a token.
        stripped = [token.translate(table) for token in tokens]
        words = [word for word in stripped if word.isalpha()]
        # The stop-word check runs on the unstemmed word; survivors are stemmed.
        stems = [stemmer.stem(word) for word in words if word not in stop_words]
        all_reviews.append(' '.join(stems))
    return all_reviews

# Clean every row of the raw frame and preview the first ten results.
all_reviews = clean_text(df)
all_reviews[:10]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['said adam inform name inform name said not tell know unseen aspect heaven earth know reveal conceal',
 'not know allah belong dominion heaven earth not besid allah protector helper',
 'origin heaven earth decre matter say',
 'inde creation heaven earth altern night day great ship sail sea benefit peopl allah sent heaven rain give life therebi earth lifeless dispers therein everi kind move creatur direct wind cloud control heaven earth sign peopl use reason',
 'allah deiti except everliv sustain exist neither drowsi overtak sleep belong whatev heaven whatev earth interced except permiss know present encompass not thing knowledg except will kursi extend heaven earth preserv tire not high great',
 'let not greedili withhold allah given bounti ever think better rather wors neck encircl withheld day resurrect allah belong heritag heaven earth allah fulli acquaint',
 'allah belong dominion heaven earth allah thing compet',
 'inde creation heaven earth altern night day sign understand',
 'r

**Feature extraction using CountVectorizer**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts; min_df=3 ignores terms found in fewer than 3 documents.
CV = CountVectorizer(min_df=3)
CV.fit(all_reviews)
X = CV.transform(all_reviews).toarray()
y = mesages["Attributes of God"]

In [None]:
# (number of documents, vocabulary size)
np.shape(X)

(118, 108)

In [None]:
# Show rows labelled 0 through 7 (.loc slicing is label-based and includes both endpoints).
# Reference: https://www.shanelynn.ie/select-pandas-dataframe-rows-and-columns-using-iloc-loc-and-ix/
df.loc[0:7]

Unnamed: 0,Sura without replacements,Attributes of God
0,"He said, ""O Adam, inform them of their names.""...",1
1,Do you not know that to Allah belongs the domi...,1
2,Originator of the heavens and the earth. When ...,1
3,"Indeed, in the creation of the heavens and ear...",0
4,"Allah - there is no deity except Him, the Ever...",1
5,And let not those who [greedily] withhold what...,1
6,And to Allah belongs the dominion of the heave...,1
7,"Indeed, in the creation of the heavens and the...",0


In [None]:
# Full text of the row labelled 6, fetched with a single label-based lookup.
df.loc[6, 'Sura without replacements']

'And to Allah belongs the dominion of the heavens and the earth, and Allah is over all things competent.'

**Train test split**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=8,
)

In [None]:
# Alternative classifiers (disabled; only GaussianNB is fitted in this notebook).
# from sklearn.tree import DecisionTreeClassifier
# model = DecisionTreeClassifier(criterion="entropy", random_state=41)
# Explanation of random_state: https://www.youtube.com/watch?v=c249O4giblM
# Cross-validation: https://www.youtube.com/watch?v=fKz-SgScM3Q

#from sklearn.naive_bayes import MultinomialNB
#model = MultinomialNB()
#from sklearn.naive_bayes import BernoulliNB
#model = BernoulliNB()
#from sklearn import svm  # svm is a top-level sklearn module, not part of sklearn.naive_bayes
#model=svm.SVC()

**GaussianNB Algorithm**

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, precision_score

# fit() returns the estimator itself, so construction and training can be chained.
model = GaussianNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report each hold-out metric under the same label as before.
for label, metric in (
    ("SCORE:", accuracy_score),
    ("F1-SCORE:", f1_score),
    ("PRECISION:", precision_score),
):
    print(label, metric(y_test, y_pred))

SCORE: 0.7916666666666666
F1-SCORE: 0.8648648648648648
PRECISION: 0.8


**Predict manually**

In [None]:
# Predicted labels for the held-out rows.
model.predict(X=X_test)

array([1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1])

**K-fold cross-validation**

In [None]:
from sklearn.model_selection import cross_validate
cv_gaussianNB= cross_validate(model, X, y, cv=10)
print(cv_gaussianNB['test_score'])
print(cv_gaussianNB['test_score'].mean())

[0.58333333 0.91666667 0.66666667 0.58333333 0.66666667 0.41666667
 0.75       0.75       0.72727273 0.63636364]
0.6696969696969697


**Predict from unknown dataset**

In [None]:
emails = [
    "And to Allah belongs the dominion of the heavens and the earth, and Allah is over all things competent.",

    "Allah - there is no deity except Him, the Ever-Living, the Sustainer of [all] existence. Neither drowsiness overtakes Him nor sleep. To Him belongs whatever is in the heavens and whatever is on the earth. Who is it that can intercede with Him except by His permission? He knows what is [presently] before them and what will be after them, and they encompass not a thing of His knowledge except for what He wills. His Kursi extends over the heavens and the earth, and their preservation tires Him not. And He is the Most High, the Most Great.",

    "Thiss is a text text for cross validiation and bias param check",
]

# The vectorizer's vocabulary was learned from clean_text output (lower-cased,
# stop words removed, stemmed). New text must go through the same cleaning,
# otherwise unstemmed words such as "belongs" or "heavens" never match the
# vocabulary and the model sees mostly-empty feature vectors.
emails_clean = clean_text(pd.DataFrame({"Sura without replacements": emails}))
emails_count = CV.transform(emails_clean).toarray()
model.predict(emails_count)

array([0, 1, 0])