In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the competition's training and test splits from the Kaggle input directory.
train, test = (
    pd.read_csv('/kaggle/input/nlp-getting-started/train.csv'),
    pd.read_csv('/kaggle/input/nlp-getting-started/test.csv'),
)

In [4]:
# Preview the last 50 rows of the training data.
train.tail(50)

In [5]:
# Heatmap of the null mask of `train`: one column per feature, one row per tweet.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(train.isnull(), yticklabels=False, cbar=True, ax=ax)
ax.set_title("Missing values", fontsize=14)
# Tilt and resize the column names on the x axis so they stay readable.
plt.setp(ax.get_xticklabels(), rotation=40, fontsize=12)
plt.show()


In [6]:
# The heatmap shows the missing data: some keywords are missing near the head and tail of the training data, and a large share of the locations is missing as well.

In [7]:
# Number of missing values in each column of the test set.
test.isna().sum()

In [8]:
# Out of 3262 entries in the test set, 1105 do not have a location.

In [9]:
# Build the stop-word list for the tweets: NLTK's English stop words plus a few
# extra words chosen for this notebook. Words in this list are ignored when
# n-grams are generated.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

more_stop_words = ['some', 'play', 'game', 'soccer', "all", "due", "to", "on", "daily"]
stpwrd = stopwords.words('english') + more_stop_words
print(stpwrd)

In [10]:
from collections import defaultdict #Defaultdict is a container like dictionaries present in the module collections. 
def generate_ngrams(text, n_gram=1, stop_words=None):
    """Return the word n-grams of ``text`` as space-joined strings.

    The text is lower-cased and split on whitespace. Tokens found in
    ``stop_words`` are removed *before* the n-grams are formed, so an n-gram
    can join two words that were separated by a stop word in the original text.

    Parameters
    ----------
    text : str
        The text (e.g. one tweet) to tokenize.
    n_gram : int, default 1
        Number of consecutive tokens per n-gram.
    stop_words : iterable of str, optional
        Lower-case words to drop. When omitted, the notebook-level ``stpwrd``
        list is used, which keeps existing calls working unchanged.

    Returns
    -------
    list of str
        One string per n-gram, in order of appearance. Empty when fewer than
        ``n_gram`` tokens remain after stop-word removal.
    """
    if stop_words is None:
        stop_words = stpwrd
    # Set membership is O(1); scanning the stop-word list for every token is not.
    stop_words = frozenset(stop_words)
    # split() with no argument splits on any run of whitespace, so newlines and
    # tabs inside a tweet no longer end up glued into a single "word", and no
    # empty tokens are produced.
    tokens = [token for token in text.lower().split() if token not in stop_words]
    # Zip the token list against itself shifted by 0..n_gram-1 positions.
    ngrams = zip(*[tokens[i:] for i in range(n_gram)])
    return [' '.join(ngram) for ngram in ngrams]


# Boolean mask over the rows of `train`: True where the tweet is labelled as a disaster (target == 1).
DISASTER_TWEETS = train['target'] == 1
# Number of top trigrams shown per class in the bar plots.
N = 30


def _count_trigrams(tweets):
    """Count how often each trigram occurs across an iterable of tweet texts."""
    counts = defaultdict(int)
    for tweet in tweets:
        for trigram in generate_ngrams(tweet, n_gram=3):
            counts[trigram] += 1
    return counts


def _frequency_table(counts):
    """Build a DataFrame of (trigram, count) rows ordered by descending count.

    Column 0 holds the trigram and column 1 its count. The ascending sort is
    reversed (rather than sorted with reverse=True) so the order among equal
    counts is unchanged from the previous version of this cell.
    """
    return pd.DataFrame(sorted(counts.items(), key=lambda item: item[1])[::-1])


# Trigram counts, kept separately for disaster and non-disaster tweets.
# .loc selects rows and column in one step instead of chained indexing.
disaster_trigrams = _count_trigrams(train.loc[DISASTER_TWEETS, 'text'])
nondisaster_trigrams = _count_trigrams(train.loc[~DISASTER_TWEETS, 'text'])

df_disaster_trigrams = _frequency_table(disaster_trigrams)
df_nondisaster_trigrams = _frequency_table(nondisaster_trigrams)


In [11]:
# Side-by-side horizontal bar charts of the N most frequent trigrams per class.
fig, axes = plt.subplots(ncols=2, figsize=(15,30), dpi=100)

# One entry per panel: (frequency table, bar colour, panel title).
panels = [
    (df_disaster_trigrams, 'lightcoral', f'Top {N} most common trigrams in Disaster Tweets'),
    (df_nondisaster_trigrams, 'lightskyblue', f'Top {N} most common trigrams in Non-disaster Tweets'),
]

for ax, (table, bar_color, heading) in zip(axes, panels):
    # Column 0 holds the trigram text, column 1 its count.
    sns.barplot(y=table[0].values[:N], x=table[1].values[:N], ax=ax, color=bar_color)
    ax.spines['right'].set_visible(False)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.tick_params(axis='x', labelsize=13)
    ax.tick_params(axis='y', labelsize=11)
    ax.set_title(heading, fontsize=20)

plt.show()

# The plots above show the 30 most frequent trigrams in disaster and non-disaster tweets (after stop-word removal), and how often each one appears in the training data.


In [12]:
# geopy is a Python client for several popular geocoding web services
# (https://geopy.readthedocs.io/en/stable/). Nominatim is the free,
# OpenStreetMap-based geocoder used here.
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter  # throttles calls to the geocoding service
import folium  # folium lets us visualize Python-manipulated data on a map
from folium import plugins  # optional add-ons for folium maps (not used in this cell)

# The 30 most frequent values of the `location` column and how often each occurs.
top_locations = train['location'].value_counts()[:30]
new_data = pd.DataFrame({'location': top_locations.index, 'count': top_locations.values})

geolocator = Nominatim(user_agent = 'Lulu')
# Leave at least one second between consecutive geocoding requests.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds = 1)
lat = {}
long = {}

for place in new_data['location']:
    location = geocode(place)
    # The geocoder returns None when it cannot resolve a free-text location
    # (or when a request ultimately fails); skip those instead of crashing on
    # `None.latitude`.
    if location is None:
        continue
    lat[place] = location.latitude
    long[place] = location.longitude
# Locations that could not be geocoded get NaN coordinates here.
new_data['latitude'] = new_data['location'].map(lat)
new_data['longitude'] = new_data['location'].map(long)

# Named `tweet_map` rather than `map` so the builtin `map` is not shadowed
# for the rest of the notebook.
tweet_map = folium.Map(location = [10.0, 10.0], tiles = 'openstreetmap', zoom_start = 2)
title = '''<h1 align = "center" style = "font-size: 15px"><b>Top 30 Tweet Locations</b></h1>'''
# Only rows with known coordinates can be drawn.
for _, row in new_data.dropna(subset = ['latitude', 'longitude']).iterrows():
    if row['count'] > 0:
        # Marker radius is proportional to the number of tweets from that location.
        radius = float(row['count'] * 0.4)
        folium.CircleMarker([float(row['latitude']), float(row['longitude'])], radius = radius, color = 'red', fill = True).add_to(tweet_map)
tweet_map.get_root().html.add_child(folium.Element(title))
tweet_map


In [52]:
# Tweet texts that occur with more than one distinct `target` value,
# i.e. identical texts that were given conflicting labels.
duplicates = (
    train.groupby(['text'])
    .nunique()
    .sort_values(by='target', ascending=False)
    .loc[lambda per_text: per_text['target'] > 1, 'target']
)
duplicates


In [56]:
# Drop the `location` and `keyword` columns from the training data.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer
# accepts; errors='ignore' keeps this cell re-runnable once the columns are gone.
train = train.drop(columns=['location', 'keyword'], errors='ignore')
train