In [1]:
import numpy as np 
import pandas as pd 
import re
import string
import nltk
from nltk.corpus import stopwords

In [2]:
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn import preprocessing, decomposition, model_selection
from sklearn.model_selection import GridSearchCV,StratifiedKFold,RandomizedSearchCV
import re
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from string import punctuation

In [3]:
# Load the training set from 'train.csv' (path is relative to the working directory).
data = pd.read_csv('train.csv')

In [4]:
# Preview the first rows of the frame as loaded, before any text cleaning.
data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
# (rows, columns) of the loaded frame.
print(data.shape)

(7613, 5)


In [6]:
def clean_text(text):
    """Normalize a raw text string for bag-of-words style modelling.

    The steps run in this order: lowercase; drop ``[...]`` spans; drop
    http(s)/www links; drop ``<...>`` tags; drop ASCII punctuation; turn
    newlines into spaces; drop every word that contains a digit.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed or stripped, so a
        removed fragment can leave a run of spaces or a leading/trailing
        space behind.
    """
    # makes text lower case
    text = text.lower()
    # remove text in square brackets
    # (patterns are raw strings so that \[, \S, \w, \d reach the regex engine
    # without tripping Python's invalid-escape-sequence warning)
    text = re.sub(r'\[.*?\]', '', text)
    # remove links
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # remove html-like tags
    text = re.sub(r'<.*?>+', '', text)
    # remove punctuation
    text = re.sub('[%s]' % re.escape(punctuation), '', text)
    # replace newlines with a space: deleting them outright would glue the
    # last word of one line to the first word of the next
    text = re.sub(r'\n', ' ', text)
    # remove numbers (any word containing at least one digit)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [7]:
# Clean every row; note that the 'text' column is overwritten in place.
data['text'] = data['text'].apply(clean_text)

In [8]:
# Spot-check the first few cleaned texts.
data['text'].head()

0    our deeds are the reason of this earthquake ma...
1                forest fire near la ronge sask canada
2    all residents asked to shelter in place are be...
3     people receive wildfires evacuation orders in...
4    just got sent this photo from ruby alaska as s...
Name: text, dtype: object

In [9]:
# Regex tokenizer built on the pattern \w+ (runs of word characters).
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

In [10]:
# Split each cleaned text into tokens; 'text' now holds the tokenizer's output per row.
data['text'] = data['text'].apply(tokenizer.tokenize)

In [11]:
def remove_stopwords(text, stop_words=None):
    """Drop stop words from a sequence of tokens.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter (despite the name, this is a token list, not a
        raw string).
    stop_words : collection of str, optional
        Words to drop. When omitted, NLTK's English stop-word list is used.
        Pass a pre-built ``set`` to avoid reloading the list on every call.

    Returns
    -------
    list of str
        The tokens of ``text`` that are not in ``stop_words``, in their
        original order.
    """
    if stop_words is None:
        # Evaluate the stop-word list once per call and keep it in a set.
        # The previous version re-evaluated stopwords.words('english') for
        # every token and scanned the resulting list linearly.
        stop_words = set(stopwords.words('english'))
    return [word for word in text if word not in stop_words]

In [12]:
# Filter stop words out of each row's token list.
data['text'] = data['text'].apply(remove_stopwords)

In [13]:
def combine_text(list_of_text):
    """Join the given strings into one string, separated by single spaces."""
    return ' '.join(list_of_text)

In [14]:
# Re-join each row's tokens into a single space-separated string for the vectorizers.
data['text'] = data['text'].apply(combine_text)

In [15]:
def text_preprocessing(text):
    """Run the full cleaning pipeline on one raw text string.

    Applies ``clean_text``, tokenizes the result on the pattern ``\\w+``,
    drops NLTK English stop words, and joins the remaining tokens with
    single spaces.

    Parameters
    ----------
    text : str
        Raw text to preprocess.

    Returns
    -------
    str
        Space-separated tokens that survived cleaning and stop-word
        removal.
    """
    word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    # Evaluate the stop-word list once and keep it in a set; the previous
    # version re-evaluated stopwords.words('english') for every token.
    english_stopwords = set(stopwords.words('english'))
    tokens = word_tokenizer.tokenize(clean_text(text))
    # Local names deliberately avoid 'remove_stopwords', which is also the
    # name of a function defined in this notebook.
    kept_tokens = [token for token in tokens if token not in english_stopwords]
    return ' '.join(kept_tokens)

In [16]:
# Bag-of-words count features over the preprocessed text (default CountVectorizer settings).
# NOTE(review): the vectorizer is fitted on every row, and the resulting matrix is later
# passed to cross_val_score, so the vocabulary is learned from rows that also serve as
# validation folds. Consider fitting inside a Pipeline if unbiased CV scores matter.
count_vectorizer = CountVectorizer()
data_vectors = count_vectorizer.fit_transform(data['text'])

In [17]:
# TF-IDF features over unigrams and bigrams (ngram_range=(1, 2)).
# NOTE(review): fitted on every row before cross-validation, so the vocabulary and IDF
# weights are learned from rows that also serve as validation folds.
tfidf = TfidfVectorizer(ngram_range=(1, 2))
data_tfidf = tfidf.fit_transform(data['text'])

In [18]:
# Preview the frame again: 'text' now holds the cleaned, stop-word-free strings.
data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,deeds reason earthquake may allah forgive us,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,residents asked shelter place notified officer...,1
3,6,,,people receive wildfires evacuation orders cal...,1
4,7,,,got sent photo ruby alaska smoke wildfires pou...,1


# Logistic Regression

In [19]:
# Logistic regression with scikit-learn's default hyperparameters.
log_mod = LogisticRegression()

In [20]:
# 3-fold cross-validated F1 of logistic regression on the count features.
# NOTE(review): an integer cv gives unshuffled folds here; if train.csv is ordered
# (e.g. by keyword -- TODO confirm), per-fold scores may not be representative.
scores = model_selection.cross_val_score(log_mod, data_vectors, data["target"], cv=3, scoring="f1")
scores

array([0.60942761, 0.5511811 , 0.64553015])

In [21]:
# Fit logistic regression on the count features of all rows.
log_mod.fit(data_vectors, data["target"])

LogisticRegression()

In [22]:
# 3-fold cross-validated F1 of logistic regression on the TF-IDF (unigram + bigram) features.

scores = model_selection.cross_val_score(log_mod, data_tfidf, data["target"], cv=3, scoring="f1")
scores

array([0.39067055, 0.34097421, 0.48380701])

# Naive Bayes 

In [23]:
# Multinomial naive Bayes with scikit-learn's default hyperparameters.
naive = MultinomialNB()

In [24]:
# 5-fold cross-validated F1 of naive Bayes on the count features.
# NOTE(review): the logistic-regression cells use cv=3, so these scores are not
# measured on the same folds as those.
scores = model_selection.cross_val_score(naive, data_vectors, data["target"], cv=5, scoring="f1")
scores

array([0.63149079, 0.60675773, 0.68575519, 0.64341085, 0.72505092])

In [25]:
# Fit naive Bayes on the count features of all rows.
naive.fit(data_vectors, data["target"])

MultinomialNB()

In [26]:
# 5-fold cross-validated F1 of naive Bayes on the TF-IDF (unigram + bigram) features.

scores = model_selection.cross_val_score(naive, data_tfidf, data["target"], cv=5, scoring="f1")
scores

array([0.56211813, 0.55779817, 0.61106074, 0.58494208, 0.72250423])

In [27]:
# Fit naive Bayes on the TF-IDF features of all rows.
# NOTE(review): this is the same `naive` object that was fitted on the count features
# in an earlier cell, so after this cell it holds the TF-IDF fit instead.
naive.fit(data_tfidf, data["target"])

MultinomialNB()