In [299]:
import pandas as pd
import numpy as np
import string

Let's take a look at the data.

In [300]:
# Load the training data, using the "id" column as the row index.
df = pd.read_csv("data/train.csv", index_col = ["id"])
# Preview the first rows.
df.head()

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1


Here we can look at it in a bit more detail.

In [301]:
# Overview of the training frame: dtypes and non-null counts first, then the
# value distribution of each column of interest.
# Only the last expression of a cell is rendered automatically, so each table
# is passed to display() explicitly; otherwise the keyword and location counts
# would be computed and silently discarded.
df.info()
display(df["keyword"].value_counts())
display(df["location"].value_counts())
display(df["target"].value_counts())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7613 entries, 1 to 10873
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   keyword   7552 non-null   object
 1   location  5080 non-null   object
 2   text      7613 non-null   object
 3   target    7613 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 297.4+ KB


0    4342
1    3271
Name: target, dtype: int64

In [302]:
# Spaces inside keywords are stored as "%20" in the raw CSV; turn them back
# into real spaces (this is why multi-word keywords such as
# "airplane accident" contain a space afterwards).
# astype(str) also converts missing keywords into the literal string "nan",
# which the later keyword-filling step relies on.
# The result is assigned back to the column instead of calling
# replace(inplace=True) on the df["keyword"] selection: an in-place edit of a
# selected column is chained assignment and is not guaranteed to reach df.
df["keyword"] = df["keyword"].astype(str).str.replace("%20", " ", regex=False)

We can see that, although both keyword and location have null values, there are not so many that the columns become unusable. What could we do about it?

As a first idea, keywords are simply the words in the text that give the category to the disaster. Let's create a function to try to fill these missing values.

But let's first define X and y. We will try to predict target (1 for a real disaster, 0 otherwise) from location, keywords and text.

In [303]:
# Separate the label from the features: y_train is the "target" column,
# X_train is every other column of the training frame.
y_train = df["target"]
X_train = df.drop(columns=["target"])

Just to give it a try, I asked the GPT-3 chat to help me with the preprocessing. With very minor tweaks I was able to use the function right off the bat.

In [304]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Stop-word set and lemmatizer shared by every call of preprocess_text_open_AI.
# They are built on first use instead of once per call: the function is applied
# to every row of several columns, and rebuilding them each time is pure overhead.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the (stop-word set, lemmatizer) pair, creating it on first use."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE["stop_words"] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE["lemmatizer"] = WordNetLemmatizer()
    return _PREPROCESS_CACHE["stop_words"], _PREPROCESS_CACHE["lemmatizer"]


def preprocess_text_open_AI(text):
    """
    Normalise a piece of text into a space-separated string of lemmas.

    Steps: lowercase, drop every character that is not a letter, digit or
    whitespace, tokenize with NLTK, remove English stop words, lemmatize each
    remaining token with WordNet, and join the tokens with single spaces.

    Parameters:
    text (str): The raw text. Must be a string; a missing value (float NaN)
        raises AttributeError on ``.lower()``.

    Returns:
    str: The cleaned text ("" when no token survives).

    Notes:
    URLs are not removed; only their punctuation is stripped, so a link
    survives as one run-together token. The NLTK tokenizer, stop-word and
    WordNet data must be available.
    """
    stop_words, lemmatizer = _preprocess_resources()

    # Lowercase first, then strip punctuation and special characters.
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Tokenize, drop stop words, and lemmatize what is left.
    tokens = [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(text)
        if token not in stop_words
    ]

    # The result is returned as one string (not a list) so it can be fed
    # straight into a text vectorizer or split on spaces again.
    lemmatized = " ".join(tokens)
    lemmatized = lemmatized.strip(" ")

    return lemmatized



In [305]:
# Clean both free-text columns with the same preprocessing function.
# Note: the "nan" placeholder in keyword is a plain string and passes through
# the cleaning unchanged apart from normalisation.
X_train["keyword"] = X_train["keyword"].map(preprocess_text_open_AI)
X_train["text"] = X_train["text"].map(preprocess_text_open_AI)

Let's write some code that:
1) Gets all the distinct keyword values in a list.
2) For each row whose keyword is "nan", looks for a word of the row's text that appears in that keyword list and, if one is found, uses it as the keyword.

In [306]:
# Distinct keyword values after cleaning (includes the "nan" placeholder
# that stands for a missing keyword).
values = pd.unique(X_train["keyword"])
values

array(['nan', 'ablaze', 'accident', 'aftershock', 'airplane accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew', 'blight', 'blizzard', 'blood',
       'bloody', 'blown', 'body bag', 'body bagging', 'bomb', 'bombed',
       'bombing', 'bridge collapse', 'building burning', 'building fire',
       'burned', 'burning', 'burning building', 'bush fire', 'casualty',
       'catastrophe', 'catastrophic', 'chemical emergency', 'cliff fall',
       'collapse', 'collapsed', 'collide', 'collided', 'collision',
       'crash', 'crashed', 'crush', 'crushed', 'curfew', 'cyclone',
       'damage', 'danger', 'dead', 'death', 'debris', 'deluge', 'deluged',
       'demolish', 'demolished', 'demolition', 'derail', 'derailed',
       'derailment', 'desolate', 'desolation', 'destroy', 'destroyed',
       'des

In [307]:
# Fill in missing keywords ("nan") from the tweet text.
#
# The values are written back with .loc on X_train itself. Assigning to the
# row objects yielded by DataFrame.iterrows() is not reliable: depending on
# the dtypes each row is a copy, and the change never reaches the frame.
#
# The "nan" placeholder is excluded from the candidates so that a tweet that
# happens to contain the word "nan" cannot overwrite a real match, and the
# candidates are held in a set for constant-time membership tests.
known_keywords = set(values) - {"nan"}


def _last_known_keyword(text):
    """Return the last word of `text` that is a known keyword, or "nan" if none is."""
    matches = [word for word in text.split(" ") if word in known_keywords]
    # When several words match, the last one wins (same rule as the original loop).
    return matches[-1] if matches else "nan"


missing_keyword = X_train["keyword"] == "nan"
if missing_keyword.any():
    # to_numpy() makes the assignment positional within the masked rows, so it
    # does not depend on index alignment.
    X_train.loc[missing_keyword, "keyword"] = (
        X_train.loc[missing_keyword, "text"].map(_last_known_keyword).to_numpy()
    )
       
                

In [308]:
# The location column is not used as a feature from here on; remove it.
X_train = X_train.drop(columns=["location"])

In [309]:
# Inspect the cleaned features (keyword and text).
X_train

Unnamed: 0_level_0,keyword,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,earthquake,deed reason earthquake may allah forgive u
4,fire,forest fire near la ronge sask canada
5,evacuation,resident asked shelter place notified officer ...
6,evacuation,13000 people receive wildfire evacuation order...
7,wildfire,got sent photo ruby alaska smoke wildfire pour...
...,...,...
10869,collapse,two giant crane holding bridge collapse nearby...
10870,fire,ariaahrary thetawniest control wild fire calif...
10871,volcano,m194 0104 utc5km volcano hawaii httptcozdtoyd8ebj
10872,injury,police investigating ebike collided car little...


In [310]:
# Inspect the labels, indexed by tweet id.
y_train

id
1        1
4        1
5        1
6        1
7        1
        ..
10869    1
10870    1
10871    1
10872    1
10873    1
Name: target, Length: 7613, dtype: int64

In [311]:
from sklearn.feature_extraction.text import CountVectorizer



# Bag-of-words encoding of the cleaned tweets.
vectorizer = CountVectorizer()

# Learn the vocabulary and turn every tweet into a row of token counts.
X = vectorizer.fit_transform(X_train["text"])

# Mapping from each token to its column index in X.
vocab = vectorizer.vocabulary_

# Summarise the vocabulary instead of printing the whole mapping, which is far
# too large to read and buries the rest of the notebook.
print("Vocabulary size:", len(vocab))
print("First terms:", sorted(vocab)[:10])



In [312]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split



# Hold out 20% of the vectorised tweets for validation; the fixed random_state
# keeps the split reproducible.
# The training split gets its own name (X_train_vec) so that the X_train
# DataFrame used by the preprocessing cells is not overwritten with a sparse
# matrix, which would make those cells fail when re-run.
X_train_vec, X_test, y_train_b, y_test = train_test_split(
    X, y_train, test_size=0.2, random_state=42
)

# Logistic regression classifier on the token counts.
model = LogisticRegression(solver='lbfgs')

# Fit the model on the training split.
model.fit(X_train_vec, y_train_b)

In [313]:
# Predict labels for the held-out validation split.
y_pred = model.predict(X_test)

In [314]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Compare the model's predictions on the held-out split with the true labels.
y_true = y_test
y_pred = model.predict(X_test)

# Accuracy, precision and recall of the predictions.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)

# Report the three metrics, one per line.
print(f"Accuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}")


Accuracy: 0.7944845699277742
Precision: 0.7916666666666666
Recall: 0.7026194144838213


In [315]:
# Load the unlabeled test set. "id" is kept as a regular column (not the
# index) because it is needed later to label the predictions.
df = pd.read_csv("data/test.csv")

# Keep only the columns needed for prediction: id and text.
X_unseen = df.drop(columns=["location", "keyword"])

In [316]:
# Inspect the raw test tweets.
X_unseen

Unnamed: 0,id,text
0,0,Just happened a terrible car crash
1,2,"Heard about #earthquake is different cities, s..."
2,3,"there is a forest fire at spot pond, geese are..."
3,9,Apocalypse lighting. #Spokane #wildfires
4,11,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...
3258,10861,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,Storm in RI worse than last hurricane. My city...
3260,10868,Green Line derailment in Chicago http://t.co/U...
3261,10874,MEG issues Hazardous Weather Outlook (HWO) htt...


In [317]:
# Clean the test tweets with the same preprocessing used for training.
# NOTE: this rebinds X_unseen from a DataFrame to a Series of cleaned strings,
# so the cell cannot be re-run on its own (the "text" lookup would then fail).
X_unseen = X_unseen["text"].apply(preprocess_text_open_AI)

In [318]:
# Encode the cleaned test tweets with the vocabulary already learned from the
# training text (transform only, the vectorizer is not re-fitted).
X_unseen= vectorizer.transform(X_unseen)

In [319]:
# Predict a label for every test tweet.
predictions = model.predict(X_unseen)

In [320]:
# Wrap the predicted labels in a DataFrame (single column named 0) and attach
# the tweet ids from the test frame; both share the same default row index.
predictions = pd.DataFrame(predictions).assign(id=df["id"])

In [321]:
# Use the tweet id as the row index of the predictions.
predictions = predictions.set_index("id")

In [322]:
# The prediction column is still named 0 (the default when the frame was built
# from a bare array); give it the name used in the submission file.
predictions= predictions.rename(columns={0: 'target'})

In [323]:
# Inspect the final predictions, indexed by tweet id.
predictions

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,1
2,1
3,1
9,1
11,1
...,...
10861,1
10865,1
10868,1
10874,1


In [324]:
import csv

def create_submission_csv(predictions, file_name):
    """
    Creates a submission CSV file for a Kaggle competition.

    The file has an ``id,target`` header followed by one row per prediction.

    Parameters:
    predictions (pandas.DataFrame): Frame whose index holds the sample ids and
        whose "target" column holds the predicted labels. Other columns are
        ignored.
    file_name (str): The name of the CSV file to be created (an existing file
        is overwritten).

    Returns:
    None

    Raises:
    KeyError: If ``predictions`` has no "target" column.
    """
    with open(file_name, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)

        # Write the header row
        writer.writerow(['id', 'target'])

        # Write one (id, target) pair per row. Only the "target" column is
        # iterated: DataFrame.iterrows() builds a Series per row, which upcasts
        # integer labels to float (written as "1.0") as soon as the frame also
        # contains a float column.
        writer.writerows(predictions["target"].items())

In [325]:
# Write the predictions to submission.csv in the working directory.
create_submission_csv(predictions, 'submission.csv')