In [1]:
import pandas as pd
import numpy as np
import nltk
import string
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score, train_test_split

pd.set_option("display.max_colwidth", None) 

In [2]:
tweet_df = pd.read_csv('train.csv')
tweet_df = tweet_df[['text', 'target']]

In [3]:
tweet_df.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,"13,000 people receive #wildfires evacuation orders in California",1
4,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


In [4]:
tweet_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    7613 non-null   object
 1   target  7613 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 119.1+ KB


In [5]:
stopwords_list = stopwords.words('english') + list(string.punctuation)

In [6]:
stopwords_list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [7]:
def process_tweet(tweet):
    tokens = nltk.word_tokenize(tweet)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stopwords_list]
    nonlist = ' '.join(stopwords_removed)
    return nonlist

In [8]:
tweet_df['processed_data'] = tweet_df['text'].map(process_tweet)
# tweet_df.drop('text', axis=1, inplace=True)
tweet_df.head()

Unnamed: 0,text,target,processed_data
0,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,deeds reason earthquake may allah forgive us
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1,residents asked 'shelter place notified officers evacuation shelter place orders expected
3,"13,000 people receive #wildfires evacuation orders in California",1,"13,000 people receive wildfires evacuation orders california"
4,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1,got sent photo ruby alaska smoke wildfires pours school


In [9]:
# total_vocab = set()
# for tweet in tweet_df['processed_data']:
#     total_vocab.update(tweet)
# len(total_vocab)


In [10]:
# articles_concat = []
# for article in tweet_df['processed_data']:
#     articles_concat += article

In [11]:
# freq_dist = FreqDist(articles_concat) 
# freq_dist.most_common(200)

In [12]:
X = tweet_df['processed_data']
y = tweet_df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=107, test_size=.2)



In [13]:
vectorizer = TfidfVectorizer()

train_vec = vectorizer.fit_transform(X_train)

test_vec = vectorizer.transform(X_test)

In [14]:
train_vec.shape
# (num_tweets, num_unique_words)

(6090, 18472)

In [15]:
rf_classifier = RandomForestClassifier(n_estimators=100)

In [16]:
rf_classifier.fit(train_vec, y_train)

RandomForestClassifier()

In [17]:
cross_val_score(rf_classifier,
               train_vec, y_train,
                cv=5)

array([0.7955665 , 0.77175698, 0.78407225, 0.76929392, 0.7545156 ])

In [19]:
rf_classifier.score(test_vec, y_test)

0.7872619829284307