In [1]:
import numpy as np
import pandas as pd
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
# Walk the Kaggle input directory and print the full path of every file found,
# so the available data files are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
# Load the training CSV into a DataFrame.
tweet_df = pd.read_csv('../input/nlp-getting-started/train.csv')

In [3]:
# Preview the first rows of the training data.
tweet_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# X keeps every column except 'target'; y is the 'target' column (the labels).
X, y = tweet_df.drop('target', axis=1), tweet_df.target

In [5]:
def process_tweets(tweets):
    """Tokenize, lowercase, lemmatize and stopword-filter every tweet.

    Args:
        tweets: Object exposing a ``text`` attribute that iterates over raw
            tweet strings (for example a DataFrame with a ``text`` column).

    Returns:
        list[list[str]]: One entry per tweet, in input order. Each entry is
        the list of lowercased, lemmatized tokens of that tweet that are not
        English stopwords, in their original order.
    """
    lemmatizer = WordNetLemmatizer()
    # Fetch the stopword list once and keep it as a set. stopwords.words()
    # returns a list, so calling it per token repeats that work and makes
    # every membership test a linear scan.
    english_stopwords = set(stopwords.words('english'))
    processed_tweets = []
    for tweet in tweets.text:
        # Lowercase first, then lemmatize; the stopword check is applied to
        # the lemmatized form.
        lemmas = (
            lemmatizer.lemmatize(word.lower()) for word in word_tokenize(tweet)
        )
        processed_tweets.append(
            [lemma for lemma in lemmas if lemma not in english_stopwords]
        )
    return processed_tweets
    
def build_freqs(processed_tweets, labels):
    """Count occurrences of every (word, label) pair.

    Args:
        processed_tweets: Iterable of token lists, one per tweet.
        labels: Iterable of labels, paired positionally with
            ``processed_tweets`` (pairing stops at the shorter of the two).

    Returns:
        dict: Maps ``(word, label)`` to the number of times ``word`` appears
        in tweets carrying ``label``. Repeated words within one tweet are
        counted each time they occur.
    """
    counts = {}
    for tokens, label in zip(processed_tweets, labels):
        for token in tokens:
            key = (token, label)
            counts[key] = counts.get(key, 0) + 1
    return counts

def get_features(freqs, processed_tweet):
    """Turn one processed tweet into a (bias, disaster, non_disaster) tuple.

    Args:
        freqs: Mapping from ``(word, label)`` to a count, looked up with
            ``.get`` so missing pairs contribute 0.
        processed_tweet: Iterable of tokens for a single tweet.

    Returns:
        tuple: ``(1, disaster, non_disaster)`` where ``disaster`` is the sum
        of ``freqs[(word, 1)]`` and ``non_disaster`` the sum of
        ``freqs[(word, 0)]`` over all tokens. A token repeated in the tweet
        is counted once per occurrence (deduplicating via a set is a possible
        variation, not done here).
    """
    # Keyed by label; insertion order (1 then 0) fixes the lookup order.
    totals = {1: 0, 0: 0}
    for token in processed_tweet:
        for label in totals:
            totals[label] += freqs.get((token, label), 0)
    return (1, totals[1], totals[0])

In [6]:
# Alternative classifier, currently disabled:
# model = LogisticRegression()
# XGBoost classifier using the 'dart' booster with fixed hyperparameters and seed.
model = XGBClassifier(random_state=42, n_estimators=350, max_depth=3, learning_rate=0.01, booster='dart')
# Clean/tokenize the training tweets, then count (word, label) occurrences.
processed_tweets = process_tweets(X)
freqs = build_freqs(processed_tweets, y)
# One row per tweet: (bias, summed disaster counts, summed non-disaster counts).
features = pd.DataFrame([get_features(freqs, tweet) for tweet in processed_tweets])
# NOTE(review): freqs is built from the same labelled rows the model is fitted
# on, so each training row's own label contributes to its features — consider
# a held-out split to get an honest estimate of performance.
model.fit(features, y)





XGBClassifier(base_score=0.5, booster='dart', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.01, max_delta_step=0,
              max_depth=3, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=350, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [7]:
# Load the test set and run it through the same preprocessing as the training data.
test = pd.read_csv('../input/nlp-getting-started/test.csv')
# Note: processed_tweets and features are rebound here and now refer to the
# test data; freqs is still the table built from the training tweets.
processed_tweets = process_tweets(test)
features = pd.DataFrame([get_features(freqs, tweet) for tweet in processed_tweets])
preds = model.predict(features)
# Write the predictions as an (id, target) CSV without the index column.
submission = pd.DataFrame({'id': test.id, 'target': preds})
submission.to_csv('submission.csv', index=False)