In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import feature_extraction
from sklearn.metrics import f1_score
from keras.models import Sequential
from keras.layers import Dense

## Explore dataset

In [35]:
# Importing dataset
train = pd.read_csv('nlp-getting-started/train.csv')
test = pd.read_csv('nlp-getting-started/test.csv')
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [46]:
print("Example of tweet that is not real disaster --> ["+train['text'][0]+"]")
print("Example of tweet that is real disaster --> ["+train['text'][1]+"]")

Example of tweet that is not real disaster --> [Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all]
Example of tweet that is real disaster --> [Forest fire near La Ronge Sask. Canada]


## Preprocessing

In [47]:
count_vectorizer = feature_extraction.text.CountVectorizer()
## let's get counts for the first 5 tweets in the data
example_train_vectors = count_vectorizer.fit_transform(train["text"][0:5])


In [48]:
print(example_train_vectors[0].todense())
print(example_train_vectors[0].todense().shape)

[[0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]
(1, 54)


In [49]:
x_train = count_vectorizer.fit_transform(train["text"])
y_train = train["target"]

x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

x_test = count_vectorizer.transform(test["text"])

In [50]:
x_train.shape

(6090, 21637)

## Neural Network

In [51]:
#n_features = x_train.shape[1] = 21637
model = Sequential()
model.add(Dense(28, activation='relu', input_shape=(x_train.shape[1],)))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()


Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_15 (Dense)            (None, 28)                605864    
                                                                 
 dense_16 (Dense)            (None, 64)                1856      
                                                                 
 dense_17 (Dense)            (None, 1)                 65        
                                                                 
Total params: 607785 (2.32 MB)
Trainable params: 607785 (2.32 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## Train

In [52]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train.toarray(), y_train, epochs=4, batch_size=16, validation_data=(x_val.toarray(), y_val))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.src.callbacks.History at 0x287558460>

In [53]:
# Calculate f1-score on validation set after each epoch
y_val_pred = model.predict(x_val.toarray())
y_val_pred = (y_val_pred > 0.5).astype(int)  # Convert probabilities to binary output [0,1]
f1 = f1_score(y_val, y_val_pred)

print("F1-score on validation:", f1)

F1-score en conjunto de validaciÃ³n: 0.7267080745341615


In [54]:
# Predict test data
y_test_pred = model.predict(x_test.toarray())
y_test_pred = (y_test_pred > 0.5).astype(int)  # Convert probabilities to binary output [0,1]



## Save the results

In [55]:
# Load the existing CSV file
submission = pd.read_csv("nlp-getting-started/sample_submission.csv")

# Replace the "target" column with the new predictions
submission["target"] = y_test_pred

# Save the modified DataFrame to a new CSV file
submission.to_csv("nlp-getting-started/submission.csv", index=False)