<a href="https://www.kaggle.com/code/laplacecherub/disaster-tweets?scriptVersionId=94195076" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import numpy as np
import pandas as pd
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import spacy
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
tweet_df = pd.read_csv('../input/nlp-getting-started/train.csv')

In [3]:
tweet_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
X, y = tweet_df.drop('target', axis=1), tweet_df.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [5]:
nlp = spacy.load('en_core_web_sm')
#to add to process_tweets(): try porter stemmer and not removing stopwords
def process_tweets(tweets):
    processed_tweets = []
    for tweet in tweets.text:
        processed_tweet = []
        words = nlp(tweet)
        for word in words:
            word = word.lemma_
            word = word.lower()
            #if word not in stopwords.words('english'):
            processed_tweet.append(word)
        processed_tweets.append(processed_tweet)
    return processed_tweets
    
def build_freqs(processed_tweets, labels):
    freq_dict = dict()
    for tweet, label in zip(processed_tweets, labels):
        for word in tweet:
            if (word, label) in freq_dict:
                freq_dict[(word, label)] += 1
            else:
                freq_dict[(word, label)] = 1
    return freq_dict

def get_features(freqs, processed_tweet):
    bias = 1
    disaster = 0
    non_disaster = 0
    num_words = len(processed_tweet)
    num_chars = 0
    num_hashtags = 0
    num_links = 0
    for word in set(processed_tweet): 
        disaster += freqs.get((word, 1), 0)
        non_disaster += freqs.get((word, 0), 0)
        num_chars += len(word)
        if word[0] == '#':
            num_hashtags += 1
        if word[:5] == 'http':
            num_links += 1
    return (bias, disaster, non_disaster, num_chars, num_words, num_hashtags, num_links, (disaster / (non_disaster + 1)))

In [6]:
# model = LogisticRegression()
model = XGBClassifier(random_state=42, n_estimators=350, max_depth=3, learning_rate=0.01, booster='dart')
processed_tweets = process_tweets(X_train)
freqs = build_freqs(processed_tweets, y_train)
features = pd.DataFrame([get_features(freqs, tweet) for tweet in processed_tweets])
model.fit(features, y_train)
test_processed_tweets = process_tweets(X_test)
test_features = pd.DataFrame([get_features(freqs, tweet) for tweet in test_processed_tweets])
preds = model.predict(test_features)
print('The F1 score was {}'.format(f1_score(preds, y_test))) # 0.5716417910447761 V7, 0.5993031358885018 keeping stopwords
# 0.6368932038834951 with word count and char count 0.6336123631680618 with number of hashtags 0.6336123631680618 with num links
# 0.6179084073820915 with disaster/non_disaster ratio



The F1 score was 0.6198347107438017


In [7]:
processed_tweets = process_tweets(X)
freqs = build_freqs(processed_tweets, y)
features = pd.DataFrame([get_features(freqs, tweet) for tweet in processed_tweets])
model.fit(features, y)
test = pd.read_csv('../input/nlp-getting-started/test.csv')
processed_tweets = process_tweets(test)
features = pd.DataFrame([get_features(freqs, tweet) for tweet in processed_tweets])
preds = model.predict(features)
submission = pd.DataFrame({'id': test.id, 'target': preds})
submission.to_csv('submission.csv', index=False)



