In [67]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, GlobalAveragePooling1D



In [68]:

# Load the politeness training set from the Kaggle input directory and
# preview the first rows.
df = pd.read_csv("/kaggle/input/politeness-train/politeness_train.csv")
print(df.head())

   utterance_id  conversation_id  \
0          3208             2099   
1          4458             4622   
2          4888             5599   
3          3316             2295   
4          4005             3674   

                                                text  polite  
0  Is it for a particular CPU (e.g. a Z80 or the ...       1  
1  What is an eastern style pot?  Is this a cup-l...       1  
2  I didn't know that you had to use an email add...       0  
3  I don't understand.  That call stack is all ab...       1  
4  The last sentence makes me think that there is...       0  


In [44]:
# Check the class balance of the binary `polite` label.
df.polite.value_counts()

polite
1    2457
0    2457
Name: count, dtype: int64

In [69]:
# Count the non-null values in each column to check for missing data.
df.count()


utterance_id       4914
conversation_id    4914
text               4914
polite             4914
dtype: int64

In [72]:

import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

def clean_text(text, stop_words=None):
    """Normalise a raw utterance into a space-separated string of tokens.

    The text is lower-cased and split on single spaces, and leading/trailing
    punctuation is stripped from every token. A token is then discarded if it
    contains a digit, is a stop word, or is shorter than two characters.
    No lemmatisation is performed.

    Args:
        text: Raw utterance string.
        stop_words: Optional iterable of words to discard. When omitted,
            NLTK's English list (``stopwords.words('english')``) is used.

    Returns:
        The surviving tokens joined by single spaces ("" if none survive).
    """
    if stop_words is None:
        # Build the default stop-word set once and reuse it on later calls
        # instead of re-creating the list for every row.
        if not hasattr(clean_text, "_default_stop_words"):
            clean_text._default_stop_words = frozenset(stopwords.words('english'))
        stop_words = clean_text._default_stop_words
    else:
        stop_words = frozenset(stop_words)

    # lower-case, split on single spaces, strip surrounding punctuation
    tokens = (word.strip(string.punctuation) for word in text.lower().split(" "))
    kept = [
        tok for tok in tokens
        if len(tok) > 1                          # drops empty and one-letter tokens
        and not any(ch.isdigit() for ch in tok)  # drops tokens containing digits
        and tok not in stop_words                # drops stop words
    ]
    return " ".join(kept)

# Replace every raw utterance with its cleaned form.
df["text"] = df["text"].apply(clean_text)

In [73]:
df.head()

Unnamed: 0,utterance_id,conversation_id,text,polite
0,3208,2099,particular cpu e.g cell processor's spu abstra...,1
1,4458,4622,eastern style pot cup-like device lid strain l...,1
2,4888,5599,know use email address instead username change,0
3,3316,2295,understand call stack debug help library happe...,1
4,4005,3674,last sentence makes think bigger picture reall...,0


In [74]:
# Build a word -> integer-id vocabulary from the cleaned text and encode each
# utterance as a sequence of those ids.
# NOTE(review): the vocabulary is fitted on the whole frame, i.e. before the
# train/test split performed in a later cell — confirm this is intended.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['text'])
sequences = tokenizer.texts_to_sequences(df['text'])
# Pad or truncate every sequence to length 100; the Embedding layer below is
# declared with the same input_length.
X = pad_sequences(sequences, maxlen=100)


In [75]:
# Binary target taken from the `polite` column.
y = df['polite'].values

# Split the data into training and testing sets
# (20% held out; random_state is fixed so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [79]:
# Parse the GloVe text file into a {word: vector} lookup.
# Each line is split on whitespace: the first field is the word, the
# remaining fields are its coefficients, stored as a float32 array.
embeddings_index = {}
with open('/kaggle/input/glove6b100dtxt/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [80]:
# Create the embedding matrix: row i holds the pre-trained GloVe vector of the
# word whose tokenizer index is i.
word_index = tokenizer.word_index
# The matrix must cover every index that can appear in X. The tokenizer was
# created without a vocabulary limit, so that is len(word_index) plus one row
# for index 0 (used for padding). The size must not depend on how many
# vectors the GloVe file happens to contain.
num_words = len(word_index) + 1
embedding_dim = 100  # must match the dimensionality of the loaded GloVe vectors
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words missing from GloVe keep their all-zero row.
        embedding_matrix[i] = embedding_vector

In [86]:
# Build the neural network model:
# frozen pre-trained embeddings -> average over the sequence axis ->
# 50-unit ReLU layer -> single sigmoid unit for the binary label.
model = Sequential()
# trainable=False keeps the pre-trained vectors fixed during training.
model.add(Embedding(num_words, embedding_dim, weights=[embedding_matrix], input_length=100, trainable=False))
model.add(GlobalAveragePooling1D())
model.add(Dense(50, activation='relu'))
model.add(Dense(1, activation='sigmoid'))


In [92]:
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE(review): the held-out test split doubles as validation data here.
# NOTE(review): the model is built in a separate cell, so re-running only this
# cell presumably continues training from the current weights — confirm the
# logged metrics come from a fresh Restart & Run All.
model.fit(X_train, y_train, epochs=10, verbose=2, validation_data=(X_test, y_test))

Epoch 1/10
123/123 - 1s - loss: 0.6089 - accuracy: 0.6688 - val_loss: 0.6427 - val_accuracy: 0.6287 - 1s/epoch - 11ms/step
Epoch 2/10
123/123 - 0s - loss: 0.6062 - accuracy: 0.6678 - val_loss: 0.6392 - val_accuracy: 0.6419 - 375ms/epoch - 3ms/step
Epoch 3/10
123/123 - 0s - loss: 0.6041 - accuracy: 0.6698 - val_loss: 0.6401 - val_accuracy: 0.6368 - 379ms/epoch - 3ms/step
Epoch 4/10
123/123 - 0s - loss: 0.6026 - accuracy: 0.6762 - val_loss: 0.6406 - val_accuracy: 0.6277 - 381ms/epoch - 3ms/step
Epoch 5/10
123/123 - 0s - loss: 0.6031 - accuracy: 0.6726 - val_loss: 0.6423 - val_accuracy: 0.6358 - 442ms/epoch - 4ms/step
Epoch 6/10
123/123 - 0s - loss: 0.6010 - accuracy: 0.6734 - val_loss: 0.6428 - val_accuracy: 0.6277 - 389ms/epoch - 3ms/step
Epoch 7/10
123/123 - 0s - loss: 0.6008 - accuracy: 0.6759 - val_loss: 0.6400 - val_accuracy: 0.6307 - 366ms/epoch - 3ms/step
Epoch 8/10
123/123 - 0s - loss: 0.5989 - accuracy: 0.6800 - val_loss: 0.6389 - val_accuracy: 0.6378 - 369ms/epoch - 3ms/step
Ep

<keras.src.callbacks.History at 0x7b324674e560>

In [93]:
# Predict on the held-out split and threshold the sigmoid output at 0.5 to
# obtain hard 0/1 labels.
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Per-class precision / recall / F1 on the held-out split.
report = classification_report(y_test, y_pred)

print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.59      0.61       482
           1       0.63      0.68      0.65       501

    accuracy                           0.63       983
   macro avg       0.63      0.63      0.63       983
weighted avg       0.63      0.63      0.63       983



**SUMMARY**
*Accuracy is 63%, which is lower than that of the logistic regression classifier.*

* GloVe embeddings are used because they capture global word co-occurrence statistics, which gives a better representation of word semantics.
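* Texts are fed to the network as padded word-index sequences, which preserve the sequential information present in the text. Note, however, that `GlobalAveragePooling1D` averages the word embeddings across the sequence, so this particular model does not exploit word order; a recurrent or convolutional layer would be needed for the network to learn from the order of words in sentences.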
* The number of epochs is set to 10 because the dataset is not very large, so a small number of epochs is sufficient.
* `verbose` is set to 2 so that training prints one line per epoch, avoiding excessive logging that might clutter the output.
