<a href="https://colab.research.google.com/github/Luicazen/NLPTweets/blob/NN/nn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files
uploaded = files.upload()
import pandas as pd
import numpy as np

Saving train.csv to train.csv
Saving test.csv to test.csv


In [2]:
import pandas as pd

# Load the train.csv file into a DataFrame
df_train = pd.read_csv('train.csv')

# Display the first 5 rows of the DataFrame
display(df_train.head())

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
# Display basic information about the DataFrame
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [4]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 1. Initialize a Tokenizer object
# Set num_words to define the maximum number of words to keep (e.g., 10,000 common words)
# Set oov_token for out-of-vocabulary words
numWords = 10000
otherWords = "<unk>"
tokenizer = Tokenizer(num_words=numWords, oov_token=otherWords)

# 2. Fit the tokenizer on the 'text' column of df_train
tokenizer.fit_on_texts(df_train['text'])

# 3. Convert the 'text' column into sequences of integers
sequences = tokenizer.texts_to_sequences(df_train['text'])

# 4. Determine a suitable maximum sequence length (maxlen)
# Calculate the maximum length of sequences or choose a fixed value
# For simplicity, let's use a fixed maxlen for now, e.g., 100
# You could also calculate it: maxlen = max([len(x) for x in sequences])
# Or a percentile: np.percentile([len(x) for x in sequences], 90)
maxLen = max([len(x) for x in sequences])

# Pad these sequences to ensure they all have the same length
paddedSequences = pad_sequences(sequences, maxlen=maxLen, padding='post')

# 5. Store the word index mapping
wordIndex = tokenizer.word_index

print(f"Original text sample: {df_train['text'].iloc[0]}")
print(f"Tokenized sequence sample: {sequences[0]}")
print(f"Padded sequence sample: {paddedSequences[0]}")
print(f"Vocabulary size: {len(wordIndex)}")

Original text sample: Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
Tokenized sequence sample: [120, 4634, 25, 5, 869, 9, 22, 264, 139, 1620, 4635, 90, 41]
Padded sequence sample: [ 120 4634   25    5  869    9   22  264  139 1620 4635   90   41    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0]
Vocabulary size: 22701


In [5]:
from sklearn.model_selection import train_test_split

# Define features (X) and target (y)
X = paddedSequences
y = df_train['target']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_val shape: {y_val.shape}")

X_train shape: (6090, 33)
X_val shape: (1523, 33)
y_train shape: (6090,)
y_val shape: (1523,)


In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
from tensorflow.keras import Input

#Define the embedding dimension
#You can experiment with values like 16, 32, 64
embeddingDim = 16

#Create a Sequential model
model = Sequential([
    #Add an Input layer to explicitly define the input shape
    Input(shape=(maxLen,)),
    # 4. Add an Embedding layer
    # input_dim is the size of the vocabulary (num_words + 1 for 0-padding)
    # output_dim is the dimensionality of the dense embedding
    Embedding(input_dim=numWords + 1, output_dim=embeddingDim),

    # 5. Add a GlobalAveragePooling1D layer
    # This layer pools the sequence dimension, effectively taking the average of word embeddings
    GlobalAveragePooling1D(),

    # 6. Add a Dense layer with 'relu' activation
    Dense(16, activation='relu'),

    # 7. Add the final Dense layer with 1 neuron and 'sigmoid' activation for binary classification
    Dense(1, activation='sigmoid')
])

# Display the model summary to verify its architecture
model.summary()

In [7]:
epochs = 10

# 1. Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 2. Train the model
history = model.fit(
    X_train,
    y_train,
    epochs=epochs,
    validation_data=(X_val, y_val),
    verbose=2 # Set to 1 for progress bar, 0 for silent
)

print("Model training complete.")

Epoch 1/10
191/191 - 2s - 13ms/step - accuracy: 0.5867 - loss: 0.6627 - val_accuracy: 0.6671 - val_loss: 0.6322
Epoch 2/10
191/191 - 1s - 5ms/step - accuracy: 0.7300 - loss: 0.5760 - val_accuracy: 0.7479 - val_loss: 0.5467
Epoch 3/10
191/191 - 2s - 8ms/step - accuracy: 0.8087 - loss: 0.4658 - val_accuracy: 0.7991 - val_loss: 0.4826
Epoch 4/10
191/191 - 3s - 16ms/step - accuracy: 0.8529 - loss: 0.3806 - val_accuracy: 0.7997 - val_loss: 0.4625
Epoch 5/10
191/191 - 3s - 14ms/step - accuracy: 0.8778 - loss: 0.3225 - val_accuracy: 0.8135 - val_loss: 0.4607
Epoch 6/10
191/191 - 1s - 5ms/step - accuracy: 0.8949 - loss: 0.2797 - val_accuracy: 0.8129 - val_loss: 0.4602
Epoch 7/10
191/191 - 1s - 5ms/step - accuracy: 0.9122 - loss: 0.2460 - val_accuracy: 0.7833 - val_loss: 0.4838
Epoch 8/10
191/191 - 1s - 5ms/step - accuracy: 0.9218 - loss: 0.2183 - val_accuracy: 0.7892 - val_loss: 0.4915
Epoch 9/10
191/191 - 1s - 5ms/step - accuracy: 0.9307 - loss: 0.1964 - val_accuracy: 0.7991 - val_loss: 0.501

## Evaluate Model

### Subtask:
Assess the performance of the trained neural network on the validation set using the `model.evaluate()` method to get the loss and accuracy.


In [8]:
from sklearn.metrics import f1_score
import numpy as np

# Make predictions on the validation set
y_pred_proba = model.predict(X_val)

# Convert probabilities to binary predictions using a threshold (e.g., 0.5)
y_pred = (y_pred_proba > 0.5).astype(int)

# Calculate F1-score
f1 = f1_score(y_val, y_pred)

print(f"Model F1-score on validation set: {f1:.4f}")

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Model F1-score on validation set: 0.7600


#Run the Test file and create the submission.csv.

In [9]:

# Load the train.csv file into a DataFrame
df_test = pd.read_csv('test.csv')

# Display the first 5 rows of the DataFrame
display(df_test.head())

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [10]:
# Display basic information about the DataFrame
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB
