In [1]:
# Import libraries 
# Standard libraries for data processing 
import pandas as pd 
import numpy as np 

# Data visualization
import seaborn as sns 
import matplotlib.pyplot as plt 
from PIL import Image 
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator 

# NLTK processing 
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import PorterStemmer

# Modeling 
import torch as pt
import tensorflow as tf 
from sklearn.model_selection import cross_val_score, train_test_split 
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Read the training and test CSV files into data frames
train, test = pd.read_csv('train.csv'), pd.read_csv('test.csv')

In [3]:
# Report the number of rows in the training set
print(f"Length of the data set: {len(train)}\n")

# Peek at six randomly chosen rows
train.sample(6)

Length of the data set: 7613



Unnamed: 0,id,keyword,location,text,target
5002,7135,military,,@UniversityofLaw For the people who died in Hu...,1
827,1203,blizzard,,Updated to Windows 10 now I get this error htt...,0
5924,8457,screamed,,I ran with earbuds in which I now realize mean...,0
2533,3637,desolation,,Hey girl you must be Toe Hobbit: Part Two: ghe...,0
5950,8500,screaming,tx,@camilacabello97 Internally and externally scr...,1
7081,10142,upheaval,maryland,A Look at State Actions a Year after Ferguson...,1


In [4]:
# Per-target count of the non-null entries in every other column
train.groupby(by='target').count()

Unnamed: 0_level_0,id,keyword,location,text
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,4342,4323,2884,4342
1,3271,3229,2196,3271


In [5]:
# Function that takes in a data frame and target, returning a keyword -> count dictionary
def kwDict(df, target):
    """Count how often each keyword occurs among rows with the given target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'target' column and a 'keyword' column.
    target : scalar
        Value compared against df['target'] to select the rows to count.

    Returns
    -------
    dict
        Maps each keyword seen in the selected rows to its number of
        occurrences, with keys in order of first appearance. Missing
        keywords are not filtered out, so NaN can appear as a key.
    """
    keywords = df.loc[df['target'] == target, 'keyword']
    kw_dict = {}
    # Single pass tally; the previous list.count() per element was O(n^2).
    for keyword in keywords:
        kw_dict[keyword] = kw_dict.get(keyword, 0) + 1
    return kw_dict

In [6]:
# Build the keyword -> count dictionaries for disaster (1) and non-disaster (0) tweets
kw1_dict, kw0_dict = kwDict(train, 1), kwDict(train, 0)

In [7]:
# Show the first ten keyword/count pairs: non-disaster first, then disaster
print(dict(list(kw0_dict.items())[:10]))
print(dict(list(kw1_dict.items())[:10]))

{nan: 19, 'ablaze': 23, 'accident': 11, 'aftershock': 34, 'airplane%20accident': 5, 'ambulance': 18, 'annihilated': 23, 'annihilation': 19, 'apocalypse': 23, 'armageddon': 37}
{nan: 42, 'ablaze': 13, 'accident': 24, 'airplane%20accident': 30, 'ambulance': 20, 'annihilated': 11, 'annihilation': 10, 'apocalypse': 9, 'armageddon': 5, 'army': 5}


In [None]:
# Next steps:
# 1. Lemmatize and stem keywords (maybe add this to the function)
# 2. Word cloud 
# 3. Check how well a model would function if we only used the keywords 
# 4. Location
# 5. Are certain keywords tied more closely to certain locations? 
# 6. Clean tweets: lowercase, then stem and lemmatize. 
# 7. Vectorize the tweets with CountVectorizer and TfidfVectorizer, then build a model on them to compare its results against models that use only keywords and locations. 
# 8. Hyperparameter model-tuning after a first successful model is found. 
# 9. Try PyTorch and TensorFlow neural networks. 