# Chatbot Message Classification Project

## Table of Content
<ul>
    <li><a href="#sec1">Installing the nescessery Libraries.</a></li>
    <li><a href="#sec2">Data Wrangling.</a></li>
    <li><a href="#sec3">Pre-processing.</a></li>
    <li><a href="#sec4">Deep Learning Model.</a></li>
    <li><a href="#sec5">Model Traning.</a></li>
    <li><a href="#sec6">Pre-processing for the test data.</a></li>
    <li><a href="#sec7">Evaluate on the splitted test data.</a></li>
</ul>

<a id='sec1'></a>
## Installing the nescessery Libraries.

In [1]:
pip install pyarabic

Collecting pyarabic
  Downloading PyArabic-0.6.14-py3-none-any.whl (126 kB)
[?25l[K     |██▋                             | 10 kB 26.8 MB/s eta 0:00:01[K     |█████▏                          | 20 kB 12.3 MB/s eta 0:00:01[K     |███████▉                        | 30 kB 9.7 MB/s eta 0:00:01[K     |██████████▍                     | 40 kB 8.7 MB/s eta 0:00:01[K     |█████████████                   | 51 kB 5.0 MB/s eta 0:00:01[K     |███████████████▋                | 61 kB 5.2 MB/s eta 0:00:01[K     |██████████████████▏             | 71 kB 5.3 MB/s eta 0:00:01[K     |████████████████████▊           | 81 kB 5.9 MB/s eta 0:00:01[K     |███████████████████████▍        | 92 kB 4.6 MB/s eta 0:00:01[K     |██████████████████████████      | 102 kB 5.0 MB/s eta 0:00:01[K     |████████████████████████████▌   | 112 kB 5.0 MB/s eta 0:00:01[K     |███████████████████████████████▏| 122 kB 5.0 MB/s eta 0:00:01[K     |████████████████████████████████| 126 kB 5.0 MB/s 
Installing c

In [2]:
pip install qalsadi

Collecting qalsadi
  Downloading qalsadi-0.4.4-py3-none-any.whl (257 kB)
[?25l[K     |█▎                              | 10 kB 21.7 MB/s eta 0:00:01[K     |██▌                             | 20 kB 14.7 MB/s eta 0:00:01[K     |███▉                            | 30 kB 10.7 MB/s eta 0:00:01[K     |█████                           | 40 kB 9.2 MB/s eta 0:00:01[K     |██████▍                         | 51 kB 5.1 MB/s eta 0:00:01[K     |███████▋                        | 61 kB 5.3 MB/s eta 0:00:01[K     |█████████                       | 71 kB 5.6 MB/s eta 0:00:01[K     |██████████▏                     | 81 kB 6.2 MB/s eta 0:00:01[K     |███████████▌                    | 92 kB 4.8 MB/s eta 0:00:01[K     |████████████▊                   | 102 kB 4.8 MB/s eta 0:00:01[K     |██████████████                  | 112 kB 4.8 MB/s eta 0:00:01[K     |███████████████▎                | 122 kB 4.8 MB/s eta 0:00:01[K     |████████████████▋               | 133 kB 4.8 MB/s eta 0:00:01[K

<a id='sec2'></a>
## Data Wrangling

In [3]:
import pyarabic.araby as araby

import random
import json
import pickle
import numpy as np
import os

import nltk
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input, Bidirectional, GRU
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.models import load_model

In [4]:
import pandas as pd
import qalsadi.lemmatizer

lemmer = qalsadi.lemmatizer.Lemmatizer()

In [6]:
df = pd.read_csv(r"train_ara.csv")
df.head()

Unnamed: 0,text,intent
0,كم عدد مستشفيات العزل فى مصر وما هى اماكنها,business location
1,ومخاصمك,nothing identified
2,متي ينتهي كورونا؟,the evolution of the virus
3,انا اسف,nothing identified
4,كام عدد الوفيات النهارده,infected cases


In [8]:
path = r"list.txt"
stop_words = []
with open(path, "r", encoding="utf-8", errors="ignore") as myfile:
    stop_words = myfile.readlines()
stopWords = [word.strip() for word in stop_words]

<a id='sec3'></a>
## Pre-processing

In [9]:
words = []
classes = []
documents = []
ignore_letters = ['!', '?', ',', '.']

for i in range (df['text'].count()):
    word = araby.tokenize(df['text'][i])
    words.extend(word)
    documents.append((word, df['intent'][i]))
    if df['intent'][i] not in classes:
        classes.append(df['intent'][i])

In [10]:
words = [lemmer.lemmatize(w.lower()) for w in words if w not in ignore_letters and w not in stopWords]
words = sorted(list(set(words)))

classes = sorted(list(set(classes)))

In [11]:
training = []
output_empty = [0] * len(classes)

for doc in documents:
    bag = []
    word_patterns = doc[0]
    word_patterns = [lemmer.lemmatize(word.lower()) for word in word_patterns]
    for word in words:
        bag.append(1) if word in word_patterns else bag.append(0)

        output_row = list(output_empty)
        output_row[classes.index(doc[1])] = 1
        training.append([bag, output_row])

random.shuffle(training)
training = np.array(training)

train_x = list(training[:, 0])
train_y = list(training[:, 1])

  app.launch_new_instance()


In [12]:
X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size = 0.2, random_state = 42, shuffle=True)

<a id='sec4'></a>
## Deep Learning Model

In [13]:
model = Sequential()
model.add(Dense(128, input_shape=(len(train_x[0]),), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(train_y[0]), activation='softmax'))

sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

  super(SGD, self).__init__(name, **kwargs)


In [14]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               125440    
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 40)                2600      
                                                                 
Total params: 136,296
Trainable params: 136,296
Non-trainable params: 0
_________________________________________________________________


<a id='sec5'></a>
## Model Training 

In [15]:
model.fit(np.array(X_train), np.array(y_train), epochs=3, batch_size=5, verbose=2)

Epoch 1/3
123903/123903 - 263s - loss: 0.1467 - accuracy: 0.9607 - 263s/epoch - 2ms/step
Epoch 2/3
123903/123903 - 236s - loss: 0.1875 - accuracy: 0.9638 - 236s/epoch - 2ms/step
Epoch 3/3
123903/123903 - 251s - loss: 0.2750 - accuracy: 0.9506 - 251s/epoch - 2ms/step


<keras.callbacks.History at 0x7f6dd4347890>

<a id='sec6'></a>
## Pre-processing for the test data

In [16]:
def _clean_up_sentence(sentence):
    sentence_words = araby.tokenize(sentence)
    sentence_words = [lemmer.lemmatize(word.lower()) for word in sentence_words]
    return sentence_words

In [17]:
def _bag_of_words(sentence, words):
    sentence_words = _clean_up_sentence(sentence)
    bag = [0] * len(words)
    for s in sentence_words:
        for i, word in enumerate(words):
            if word == s:
                bag[i] = 1
    return np.array(bag)

In [18]:
def _predict_class(sentence):
    p = _bag_of_words(sentence, words)
    res = model.predict(np.array([p]))[0]
    ERROR_THRESHOLD = 0.1
    results = [[i, r] for i, r in enumerate(res) if r > ERROR_THRESHOLD]
    
    results.sort(key=lambda x: x[1], reverse=True)
    return_list = ""
    for r in results:
        return_list = classes[r[0]]
    return return_list

### importing and predecting on the test data

In [19]:
dfTest = pd.read_csv(r"test.csv")
output = []
for i in range(dfTest['text'].count()):
    x = _predict_class(dfTest['text'][i])
    output.append(x)
output[:10]

['infected cases',
 'infected cases',
 'yes',
 'treatment',
 'infected cases',
 'infected cases',
 'infected cases',
 'infected cases',
 'the evolution of the virus',
 'infected cases']

<a id='sec7'></a>
## Evaluate on the splitted test data

In [20]:
model.evaluate(np.array(X_test), np.array(y_test))



[0.04943576455116272, 0.9862343072891235]