In [1]:
import pandas as pd
import nltk
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import re
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to C:\Users\Windows
[nltk_data]     Home\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package punkt to C:\Users\Windows
[nltk_data]     Home\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Windows
[nltk_data]     Home\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Windows
[nltk_data]     Home\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
dataset = pd.read_csv("../dataset/dataset10K.csv")

In [None]:
#Show data table
#movie_reviews.head()
display(dataset)

Unnamed: 0,Question,Answer
0,"""Why does my sheep chew on random objects like...",Sheep sometimes chew on odd things due to bore...
1,"Why does my bird pluck its feathers, and what ...","Feather plucking can happen due to stress, bor..."
2,"""Why does my pig sometimes sneeze a lot, and s...","Pigs can sneeze due to dust, allergies, or mil..."
3,"""Deer seems less active and is eating less—cou...","It could be the weather, especially if it's ho..."
4,"""Why does Donkey sometimes chew on rocks, and ...","Chewing on rocks, called pica, can happen due ..."
...,...,...
10006,"""Does the combination of drooping ears, listle...","Those symptoms could point to several issues, ..."
10007,"""Considering the symptoms of death, epistaxis,...",These symptoms are very serious and could poin...
10008,"""Can a sudden loss of appetite and altered beh...","Yes, a sudden loss of appetite and behavior ch..."
10009,"""At what point should I be concerned about the...",If your buffalo is showing signs like severe l...


In [None]:
#Count Question column [value_counts()]
dataset['Question'].value_counts()


Question
"Why does my bird pluck its feathers, and how can I help it stop?"                                                                                  2
"Why does my bird pluck its feathers, and how can I help stop it?"                                                                                  2
"Should I be concerned about potential respiratory infections or environmental irritants causing these symptoms in my duck?"                        2
"Pig’s been scratching a lot lately—could it be allergies or something in his food? What should I check for first?"                                 1
"Why does my sheep chew on everything, and is it normal or should I worry about its diet?"                                                          1
                                                                                                                                                   ..
"Are there specific dietary changes or diagnostic tests you recommend to address my bird's 

In [None]:
#Show Question raw data in row 1
dataset['Question'].iloc[0]


'"Why does my sheep chew on random objects like rocks or fences? Is that normal or something to worry about?"'

In [7]:
#Removes HTML tags: replaces anything between opening and closing <> with empty space
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

In [8]:
# Example usage:
text_with_html = dataset["Question"][0]
text_without_html = remove_tags(text_with_html)
text_without_html

'"Why does my sheep chew on random objects like rocks or fences? Is that normal or something to worry about?"'

In [None]:
def preprocess_text(sen):
    sentence = sen.lower()

    # Remove html tags
    sentence = remove_tags(sentence)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Remove multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    # Remove Stopwords
    pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
    sentence = pattern.sub('', sentence)

    # Tokenization
    tokens = word_tokenize(sentence)

    # Stemming
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in tokens]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in stemmed_words]

    cleaned_sentence = ' '.join(lemmatized_words)  # Joining the words

    return cleaned_sentence  # Return a list of tokens

In [32]:
# Create a copy of the dataset
dataset_processed = dataset.copy()

# Apply preprocess_text to 'Question' column
dataset_processed.loc[:10011, 'Question'] = dataset_processed.loc[:10011, 'Question'].apply(preprocess_text)

# Apply a different preprocessing function to 'Answer' column if needed
dataset_processed.loc[:10011, 'Answer'] = dataset_processed.loc[:10011, 'Answer'].apply(preprocess_text)

# Display the first 50 rows after preprocessing
print(dataset_processed[['Question', 'Answer']].head(50))


                                             Question  \
0   sheep chew random object like rock fenc normal...   
1                             bird pluck feather help   
2                         pig sometim sneez lot worri   
3   deer seem less activ eat less could weather wo...   
4   donkey sometim chew rock could hurt teeth stomach   
5   pig scratch lot late could allergi someth food...   
6                sheep chew everyth normal worri diet   
7   monkey seem less play eat less could mood chan...   
8                          hors chew wood harm health   
9   pig seem less activ eat much late could someth...   
10  hi doc eleph seem less play eat much could str...   
11  hi notic buffalo seem less energet eat much co...   
12                     duck sometim sneez eat concern   
13  sheep sometim scratch ear lot could someth ser...   
14                       dog lion lick paw much worri   
15      buffalo seem restless night help feel comfort   
16                            f

In [None]:
dataset_processed['Answer'][1]

'duck might sneez eat food particl irrit nose usual big deal keep eye frequent sneez symptom like discharg happen lot consid softer smaller food piec'

In [None]:
dataset_processed['Question'][1]

'duck sometim sneez eat concern'

In [35]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(dataset_processed['Question'])
sequences = tokenizer.texts_to_sequences(dataset_processed['Question'])
padded_sequences = pad_sequences(sequences, padding='post')
print("Padded Sequences:", padded_sequences)

Padded Sequences: [[ 12 149 449 ...   0   0   0]
 [ 32 495 108 ...   0   0   0]
 [ 24  83  85 ...   0   0   0]
 ...
 [124  26  37 ...   0   0   0]
 [369  97 363 ...   0   0   0]
 [196 232 327 ...   0   0   0]]


In [36]:
# see tokenizer.word_index
tokenizer = Tokenizer()
tokenizer.fit_on_texts(dataset_processed)
print(tokenizer.word_index)

{'question': 1, 'answer': 2}


In [37]:
display(dataset_processed)

Unnamed: 0,Question,Answer
0,sheep chew random object like rock fenc normal...,sheep sometim chew odd thing due boredom miner...
1,bird pluck feather help,feather pluck happen due stress boredom health...
2,pig sometim sneez lot worri,pig sneez due dust allergi mild irrit environ ...
3,deer seem less activ eat less could weather wo...,could weather especi hot cold deer often eat l...
4,donkey sometim chew rock could hurt teeth stomach,chew rock call pica happen due boredom miner d...
...,...,...
10006,combin droop ear listless drool weight loss wh...,symptom could point sever issu like bacteri in...
10007,consid symptom death epistaxi hemoptysi buffal...,symptom seriou could point condit like hemorrh...
10008,sudden loss appetit alter behavior wolf indic ...,ye sudden loss appetit behavior chang wolf cou...
10009,point concern sever buffalo pain specif sign i...,buffalo show sign like sever limp refus eat co...


In [38]:
from sklearn.model_selection import train_test_split

# ใช้ 'Question' เป็นอินพุตและ 'Answer' เป็นเป้าหมาย
X = dataset_processed['Question']
y = dataset_processed['Answer']

# แบ่งข้อมูลเป็นชุดเทรนและทดสอบ
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# แสดงขนาดของชุดข้อมูลที่แบ่ง
print("Training data size:", len(X_train))
print("Test data size:", len(X_test))


Training data size: 8008
Test data size: 2003


In [39]:
X_train

9090    would recommend specif diagnost test determin ...
8294    could suggest simpl step keep eleph environ sa...
9941    connect sheep ammonia like odor letharg behavi...
1095    cat lethargi fever someth seriou could pas bug...
3131    could buffalo feel sluggish use heat someth el...
                              ...                        
5734    dog lose weight worm diabet mayb someth el go ...
5191         keep fowl comfort prevent skin thicken itchi
5390    could caus buffalo lose weight babi buffalo su...
860     deer seem less play eat less could someth wron...
7270    could deer lack energi troubl urin relat get e...
Name: Question, Length: 8008, dtype: object

In [40]:
y_train

9090    sheep abort fever blood test check infect like...
8294    sure keep eleph space clean free obstacl preve...
9941    ye ammonia like odor lethargi could point urin...
1095    lethargi fever cat sign mild infect someth ser...
3131    buffalo feel sluggish heat prefer cooler clima...
                              ...                        
5734    weight loss dog due worm diabet issu like poor...
5191    make sure fowl clean dri coop good ventil prov...
5390    buffalo might lose weight due poor nutrit para...
860     could stress chang environ even mild ill keep ...
7270    possibl lack exercis lead low energi troubl ur...
Name: Answer, Length: 8008, dtype: object

In [41]:
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(X_train)

X_train = word_tokenizer.texts_to_sequences(X_train)
X_test = word_tokenizer.texts_to_sequences(X_test)

In [42]:
# Adding 1 to store dimensions for words for which no pretrained word embeddings exist

vocab_length = len(word_tokenizer.word_index) + 1

vocab_length

1705

In [43]:
# Padding all reviews to fixed length 100

maxlen = 100

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [49]:
# Load GloVe word embeddings and create an Embeddings Dictionary

from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()
glove_file = open('../dataset/glove.6B.100d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary [word] = vector_dimensions
glove_file.close()

In [50]:
# Create Embedding Matrix having 100 columns
# Containing 100-dimensional GloVe word embeddings for all words in our corpus.

embedding_matrix = zeros((vocab_length, 100))
for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [51]:
embedding_matrix.shape

(1705, 100)

In [52]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.05869   ,  0.40272999,  0.38633999, ..., -0.35973999,
         0.43718001,  0.10121   ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.055423  , -0.35898   ,  0.14016999, ...,  0.24855   ,
         0.58578998,  1.05110002],
       [ 0.58832002, -0.098767  ,  0.17133   , ...,  0.23624   ,
        -0.76871002, -0.41764   ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])