In [25]:
import pandas as pd
import numpy as np
import nltk
import string
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.model_selection import cross_val_score

pd.set_option("display.max_colwidth", None) 

In [2]:
tweet_df = pd.read_csv('train.csv')
test_tweets_df = pd.read_csv('test.csv')

In [3]:
tweet_df.head(3)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1


In [30]:
tweet_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [4]:
tweet_df['keyword'].value_counts()

fatalities               45
armageddon               42
deluge                   42
body%20bags              41
harm                     41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: keyword, Length: 221, dtype: int64

In [9]:
X_train = tweet_df['text']
y_train = tweet_df['target']
X_test = test_tweets_df['text']

In [11]:
stopwords_list = stopwords.words('english') + list(string.punctuation)

In [12]:
stopwords_list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [13]:
def process_tweet(tweet):
    tokens = nltk.word_tokenize(tweet)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stopwords_list]
    return stopwords_removed

In [15]:
processed_data = list(map(process_tweet, X_train))
processed_data[0]

['deeds', 'reason', 'earthquake', 'may', 'allah', 'forgive', 'us']

In [16]:
total_vocab = set()
for tweet in processed_data:
    total_vocab.update(tweet)
len(total_vocab)

22882

In [17]:
articles_concat = []
for article in processed_data:
    articles_concat += article

In [19]:
freq_dist = FreqDist(articles_concat)
freq_dist.most_common(200)

[('http', 4307),
 ('...', 945),
 ("'s", 791),
 ("n't", 446),
 ('https', 409),
 ('like', 346),
 ('amp', 344),
 ("'m", 250),
 ('fire', 249),
 ('get', 228),
 ('new', 223),
 ('via', 218),
 ('people', 197),
 ('news', 197),
 ('one', 194),
 ('video', 165),
 ('2', 162),
 ('emergency', 155),
 ('disaster', 154),
 ('would', 142),
 ('police', 140),
 ("'re", 129),
 ('still', 128),
 ('got', 124),
 ('body', 124),
 ('us', 122),
 ('..', 120),
 ('burning', 120),
 ('back', 119),
 ('storm', 119),
 ('california', 117),
 ('crash', 117),
 ('time', 112),
 ('know', 112),
 ('man', 110),
 ('suicide', 110),
 ('buildings', 110),
 ('day', 108),
 ('rt', 107),
 ('see', 105),
 ('first', 105),
 ('world', 105),
 ('going', 103),
 ('bomb', 103),
 ('ca', 102),
 ('3', 102),
 ('love', 100),
 ('fires', 100),
 ('nuclear', 100),
 ('today', 99),
 ('attack', 99),
 ('two', 98),
 ('youtube', 98),
 ('dead', 96),
 ('killed', 96),
 ('go', 93),
 ('train', 93),
 ('gt', 92),
 ('full', 91),
 ('war', 90),
 ('car', 89),
 ('accident', 89),
 

In [20]:
vectorizer = TfidfVectorizer()

In [22]:
train_vec = vectorizer.fit_transform(X_train)

test_vec = vectorizer.transform(X_test)

In [23]:
train_vec.shape
# (num_tweets, num_unique_words)

(7613, 21637)

In [26]:
rf_classifier = RandomForestClassifier(n_estimators=100)

In [28]:
rf_classifier.fit(train_vec, y_train)

RandomForestClassifier()

In [29]:
cross_val_score(rf_classifier,
               train_vec, y_train,
                cv=5)

array([0.7235719 , 0.67301379, 0.67826658, 0.69053876, 0.75952694])