### Import the Important Libraries 

In [1]:
# !pip install rake_nltk

In [2]:
# !pip install gensim --upgrade

In [3]:
import pandas as pd 
import numpy as np 
from rake_nltk import Rake 
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity, linear_kernel
import gensim

In [4]:
df = pd.read_csv('2.processed_data/mega_data.csv')
fire_df = pd.read_csv('2.processed_data/fire_comb.csv')
blizz_df = pd.read_csv('2.processed_data/blizzard_comb.csv')
flood_df = pd.read_csv('2.processed_data/flood_comb.csv')
hurr_df = pd.read_csv('2.processed_data/hurricane_comb.csv')
earth_df = pd.read_csv('2.processed_data/earthquake_comb.csv')
torn_df = pd.read_csv('2.processed_data/tornado_comb.csv')

df.head(2)

Unnamed: 0,author,content,description,photo_url,pub_date,source,title,url
0,Lisa Rowan,"Its hurricane season, and weve got a weirdo st...","It’s hurricane season, and we’ve got a weirdo ...",https://i.kinja-img.com/gawker-media/image/upl...,2019-07-10,,Never Try To Drive Through a Flood,https://lifehacker.com/never-try-to-drive-thro...
1,"Yessenia Funes on Earther, shared by Virginia ...",Tropical Storm Barry still doesnt formally exi...,Tropical Storm Barry still doesn’t formally ex...,https://i.kinja-img.com/gawker-media/image/upl...,2019-07-11,,New Orleans Faces a Major Flood Threat [Updating],https://earther.gizmodo.com/new-orleans-faces-...


### This cell combines the text from the relevant columns in preparation for tokenization

In [5]:
df['combined_text'] = df['content'].map(str) + df['title'].map(str) + df['description'].map(str)
fire_df['combined_text'] = fire_df['content'].map(str) + fire_df['title'].map(str) + fire_df['description'].map(str)
blizz_df['combined_text'] = blizz_df['content'].map(str) + blizz_df['title'].map(str) + blizz_df['description'].map(str)
flood_df['combined_text'] = flood_df['content'].map(str) + flood_df['title'].map(str) + flood_df['description'].map(str)
hurr_df['combined_text'] = hurr_df['content'].map(str) + hurr_df['title'].map(str) + hurr_df['description'].map(str)
earth_df['combined_text'] = earth_df['content'].map(str) + earth_df['title'].map(str) + earth_df['description'].map(str)
torn_df['combined_text'] = torn_df['content'].map(str) + torn_df['title'].map(str) + torn_df['description'].map(str)

In [6]:
tokenizer = RegexpTokenizer(r'\w+')
def tokenize(x): 
    return tokenizer.tokenize(x)

df['tokens'] = df['combined_text'].map(tokenize)
fire_df['tokens'] = fire_df['combined_text'].map(tokenize)
blizz_df['tokens'] = blizz_df['combined_text'].map(tokenize)
flood_df['tokens'] = flood_df['combined_text'].map(tokenize)
hurr_df['tokens'] = hurr_df['combined_text'].map(tokenize)
earth_df['tokens'] = earth_df['combined_text'].map(tokenize)
torn_df['tokens'] = torn_df['combined_text'].map(tokenize)

In [7]:
def stemmer(x): 
    stemmer = PorterStemmer()
    return ' '.join([stemmer.stem(word) for word in x])
def lemmatize(x):
    lemmatizer = WordNetLemmatizer()
    return' '.join([lemmatizer.lemmatize(word) for word in x])

In [8]:
df['lems'] = df['tokens'].map(lemmatize)
df['stems'] = df['tokens'].map(stemmer)

fire_df['lems'] = fire_df['tokens'].map(lemmatize)
fire_df['stems'] = fire_df['tokens'].map(stemmer)

blizz_df['lems'] = blizz_df['tokens'].map(lemmatize)
blizz_df['stems'] = blizz_df['tokens'].map(stemmer)

flood_df['lems'] = flood_df['tokens'].map(lemmatize)
flood_df['stems'] = flood_df['tokens'].map(stemmer)

hurr_df['lems'] = hurr_df['tokens'].map(lemmatize)
hurr_df['stems'] = hurr_df['tokens'].map(stemmer)

earth_df['lems'] = earth_df['tokens'].map(lemmatize)
earth_df['stems'] = earth_df['tokens'].map(stemmer)

torn_df['lems'] = torn_df['tokens'].map(lemmatize)
torn_df['stems'] = torn_df['tokens'].map(stemmer)

In [9]:
df.head(1)

Unnamed: 0,author,content,description,photo_url,pub_date,source,title,url,combined_text,tokens,lems,stems
0,Lisa Rowan,"Its hurricane season, and weve got a weirdo st...","It’s hurricane season, and we’ve got a weirdo ...",https://i.kinja-img.com/gawker-media/image/upl...,2019-07-10,,Never Try To Drive Through a Flood,https://lifehacker.com/never-try-to-drive-thro...,"Its hurricane season, and weve got a weirdo st...","[Its, hurricane, season, and, weve, got, a, we...",Its hurricane season and weve got a weirdo sto...,it hurrican season and weve got a weirdo storm...


### Creating keywords
This code was adapted from a data science blog post written by Emma Grimaldi. There is an nltk package that allows you to extract keywords from a particular article. Keywords are the most relevant words that are associated with a particular piece of content. The code below extracts keywords from the combined_text column and puts them into a column called keywords. 

In [10]:
#received code from https://towardsdatascience.com/how-to-build-from-scratch-a-content-based-movie-recommender-with-natural-language-processing-25ad400eb243
# turn this into a mappable function

df['keywords'] = ""
fire_df['keywords'] = ""
blizz_df['keywords'] = ""
flood_df['keywords'] = ""
hurr_df['keywords'] = ""
earth_df['keywords'] = ""
torn_df['keywords'] = ""

for index,row in df.iterrows():
    comb_text = row['combined_text']
    r = Rake()
    r.extract_keywords_from_text(comb_text)
    key_words_dict = r.get_word_degrees()
    row['keywords'] = list(key_words_dict.keys())
    
for index,row in fire_df.iterrows():
    comb_text = row['combined_text']
    r = Rake()
    r.extract_keywords_from_text(comb_text)
    key_words_dict = r.get_word_degrees()
    row['keywords'] = list(key_words_dict.keys())
    
for index,row in blizz_df.iterrows():
    comb_text = row['combined_text']
    r = Rake()
    r.extract_keywords_from_text(comb_text)
    key_words_dict = r.get_word_degrees()
    row['keywords'] = list(key_words_dict.keys())
    
for index,row in flood_df.iterrows():
    comb_text = row['combined_text']
    r = Rake()
    r.extract_keywords_from_text(comb_text)
    key_words_dict = r.get_word_degrees()
    row['keywords'] = list(key_words_dict.keys())
    
for index,row in hurr_df.iterrows():
    comb_text = row['combined_text']
    r = Rake()
    r.extract_keywords_from_text(comb_text)
    key_words_dict = r.get_word_degrees()
    row['keywords'] = list(key_words_dict.keys())
    
for index,row in earth_df.iterrows():
    comb_text = row['combined_text']
    r = Rake()
    r.extract_keywords_from_text(comb_text)
    key_words_dict = r.get_word_degrees()
    row['keywords'] = list(key_words_dict.keys())
    
for index,row in torn_df.iterrows():
    comb_text = row['combined_text']
    r = Rake()
    r.extract_keywords_from_text(comb_text)
    key_words_dict = r.get_word_degrees()
    row['keywords'] = list(key_words_dict.keys())

### Word2Vec

Word2Vec is a two-layer neural net that processes text. It's input is some text and the output is a list of feature vectors that are similar to that original text. The purpose of using Word2Vec is to boost our search engine's capabilities

1. The user inputs keywords such as (california, wildfire)
2. The function then cleans the text string because the word2vec model can only handle lowercase strings
3. The word2vec algorithim finds the most similar words related to that search term and appends those words along with the original search term to a list.
4. The function then sees how many keywords in our dataframe match the keywords in the keywords in the search term list. 
5. The function outputs the title of the most related articles given a threshold for a specific number of search terms 

In [11]:
model = gensim.models.KeyedVectors.load_word2vec_format('lexvec.enwiki+newscrawl.300d.W.pos.vectors')

In [12]:
def art_test(df):
    search = input("Type disaster then location(ex: fire, california):")
    stripped = [x.strip() for x in search.split(' ')]
    result = [x.lower() for x in stripped]
    print(result)
    word2vec_list = []
    for x in model.most_similar(result):
        term,sim = x 
        word2vec_list.append(term)
    word2vec_list.extend(result)
    print(word2vec_list)
    article_list = []
    threshold = 7
    print(threshold)
    while len(article_list) < 5 and threshold > 3:
        for article in df.loc[df['keywords'].map(lambda x: sum(1 for w in word2vec_list if w in x)) == threshold, 'title']:
            article_list.append(article)
        threshold -= 1
    return article_list if len(article_list) > 0 else print('No Results!')

### Wildfires

In [13]:
wildfire_list = art_test(df)

Type disaster then location(ex: fire, california):wildfire
['wildfire']
['wildfires', 'blazes', 'blaze', 'bushfire', 'fires', 'bushfires', 'fire', 'conflagration', 'grassfire', 'flames', 'wildfire']
7


In [14]:
len(wildfire_list)

21

In [15]:
fire_df["final_list"] = fire_df["title"].map(lambda x: 1 if x in wildfire_list else 0)
fire_df = fire_df.loc[fire_df["final_list"] == 1]
fire_df.reset_index(inplace= True)
fire_df.drop(axis=1, columns="index")
fire_df.shape

(19, 15)

### Tornado

In [16]:
torn_list = art_test(df)

Type disaster then location(ex: fire, california):tornado
['tornado']
['tornadoes', 'twister', 'twisters', 'thunderstorm', 'storm', 'typhoon', 'tornados', 'cyclone', 'storms', 'thunderstorms', 'tornado']
7


In [17]:
len(torn_list)

6

In [18]:
torn_df["final_list"] = torn_df["title"].map(lambda x: 1 if x in torn_list else 0)
torn_df = torn_df.loc[torn_df["final_list"] == 1]
torn_df.reset_index(inplace= True)
torn_df.drop(axis=1, columns="index")
torn_df.shape

(8, 15)

### Flood

In [19]:
flood_list = art_test(df)

Type disaster then location(ex: fire, california):flood
['flood']
['flooding', 'floods', 'deluge', 'floodwaters', 'flooded', 'inundation', 'levees', 'floodwater', 'tsunami', 'levee', 'flood']
7


In [20]:
len(flood_list)

8

In [21]:
flood_df["final_list"] = flood_df["title"].map(lambda x: 1 if x in flood_list else 0)
flood_df = flood_df.loc[flood_df["final_list"] == 1]
flood_df.reset_index(inplace= True)
flood_df.drop(axis=1, columns="index")
flood_df.shape

(6, 15)

### Earthquakes

In [22]:
earth_list = art_test(df)

Type disaster then location(ex: fire, california):earthquake
['earthquake']
['quake', 'temblor', 'tsunami', 'aftershock', 'aftershocks', 'earthquakes', 'quakes', 'tsunamis', 'temblors', 'tremor', 'earthquake']
7


In [23]:
len(earth_list)

5

In [24]:
earth_df["final_list"] = earth_df["title"].map(lambda x: 1 if x in earth_list else 0)
earth_df = earth_df.loc[earth_df["final_list"] == 1]
earth_df.reset_index(inplace= True)
earth_df.drop(axis=1, columns="index")
earth_df.shape

(6, 15)

### Blizzard - No news articles for blizzards in this time of the year (Only the blizzard entertainment)

In [25]:
blizz_list = art_test(df)

Type disaster then location(ex: fire, california):blizzard
['blizzard']
['snowstorm', 'blizzards', 'snowstorms', 'whiteout', 'thunderstorm', 'snowfall', 'storms', 'rainstorm', 'torrential', 'storm', 'blizzard']
7
No Results!


In [26]:
try:
    len(blizz_list)
except:
    print("No articles")

No articles


In [27]:
try:
    blizz_df["final_list"] = blizz_df["title"].map(lambda x: 1 if x in blizz_list else 0)
    blizz_df = blizz_df.loc[blizz_df["final_list"] == 1]
    blizz_df.reset_index(inplace= True)
    blizz_df.drop(axis=1, columns="index")
    blizz_df.shape
except:
    print("No articles")

No articles


### Hurricanes

In [28]:
hurr_list = art_test(df)

Type disaster then location(ex: fire, california):hurricane
['hurricane']
['typhoon', 'cyclone', 'storm', 'superstorm', 'tropical', 'landfall', 'hurricanes', 'nhc', 'storms', 'katrina', 'hurricane']
7


In [29]:
len(hurr_list)

12

In [30]:
hurr_df["final_list"] = hurr_df["title"].map(lambda x: 1 if x in hurr_list else 0)
hurr_df = hurr_df.loc[hurr_df["final_list"] == 1]
hurr_df.reset_index(inplace= True)
hurr_df.drop(axis=1, columns="index")
hurr_df.shape

(8, 15)

### Removing Nans

In [31]:
def fill_author(prueba):
    for art in range(len(prueba)):
        if pd.isnull(prueba['url'][art]):
            prueba['author'][art] = "No author"
        else:
            if pd.isnull(prueba['author'][art]):
                if (str(prueba["url"][art]).split("/",3)[:3])[2].split(".")[0] == "www":
                    prueba['author'][art] = (str(prueba["url"][art]).split("/",3)[:3])[2].split(".")[1]
    #                 print(2)
                else:
                    prueba['author'][art] = (str(prueba["url"][art]).split("/",3)[:3])[2].split(".")[0]
    #                 print(1)
            else:
    #             print("yes")
                pass

In [32]:
fill_author(df)
fill_author(fire_df)
fill_author(blizz_df)
fill_author(earth_df)
fill_author(flood_df)
fill_author(hurr_df)
fill_author(torn_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


### Saving Final Data

In [33]:
fire_df.to_csv('3.final_data/fire_comb.csv', index = False)
blizz_df.to_csv('3.final_data/blizzard_comb.csv', index = False)
earth_df.to_csv('3.final_data/earthquake_comb.csv', index = False)
flood_df.to_csv('3.final_data/flood_comb.csv', index = False)
hurr_df.to_csv('3.final_data/hurricane_comb.csv', index = False)
torn_df.to_csv('3.final_data/tornado_comb.csv', index = False)
df.to_csv('3.final_data/mega_data.csv', index = False)