### Importing required Libraries.

In [0]:
import nltk
nltk.download('stopwords')

import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
from collections import  Counter
plt.style.use('ggplot')
stop=set(stopwords.words('english'))
import re
from nltk.tokenize import word_tokenize
import gensim
import string
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D
from keras.initializers import Constant
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Using TensorFlow backend.


## Loading the data and getting basic idea 

In [0]:
tweet= pd.read_csv('./train.csv')
test=pd.read_csv('./test.csv')
tweet.head(3)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1


In [0]:
print('There are {} rows and {} columns in train'.format(tweet.shape[0], tweet.shape[1]))
print('There are {} rows and {} columns in train'.format(test.shape[0], test.shape[1]))

There are 7613 rows and 5 columns in train
There are 3263 rows and 4 columns in train


## Data Cleaning
As we know,twitter tweets always have to be cleaned before we go onto modelling.So we will do some basic cleaning such as spelling correction,removing punctuations,removing html tags and emojis etc.So let's start.

In [0]:
#remove tweets shorter than 4
tweet = tweet.drop(tweet[tweet['text'].apply(lambda s : len(s.split()) < 4)].index)

print('There are {} rows after removing'.format(tweet.shape[0]))
print('There are {} rows after removing'.format(test.shape[0]))

There are 7496 rows after removing
There are 3263 rows after removing


In [0]:
df = pd.concat([tweet, test], sort=False)
df.shape

(10759, 5)

In [0]:
from spellchecker import SpellChecker

spell = SpellChecker()
def correct_spellings(text):
    corrected_text = []
    misspelled_words = spell.unknown(text.split())
    for word in text.split():
        if word in misspelled_words:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)

def clean(tweet):         
    # Special characters
    tweet = tweet.replace("\x89Û_", "")
    tweet = tweet.replace("\x89ÛÒ", "")
    tweet = tweet.replace("\x89ÛÓ", "")
    tweet = tweet.replace("\x89ÛÏWhen", "When")
    tweet = tweet.replace("\x89ÛÏ", "")
    tweet = tweet.replace("China\x89Ûªs", "China's")
    tweet = tweet.replace("let\x89Ûªs", "let's")
    tweet = tweet.replace("\x89Û÷", "")
    tweet = tweet.replace("\x89Ûª", "")
    tweet = tweet.replace("\x89Û\x9d", "")
    tweet = tweet.replace("å_", "")
    tweet = tweet.replace("\x89Û¢", "")
    tweet = tweet.replace("\x89Û¢åÊ", "")
    tweet = tweet.replace("fromåÊwounds", "from wounds")
    tweet = tweet.replace("åÊ", "")
    tweet = tweet.replace("åÈ", "")
    tweet = tweet.replace("JapÌ_n", "Japan")    
    tweet = tweet.replace("Ì©", "e")
    tweet = tweet.replace("å¨", "")
    tweet = tweet.replace("SuruÌ¤", "Suruc")
    tweet = tweet.replace("åÇ", "")
    tweet = tweet.replace("å£3million", "3 million")
    tweet = tweet.replace("åÀ", "")
    
    # Contractions
    tweet = tweet.replace("he's", "he is")
    tweet = tweet.replace("there's", "there is")
    tweet = tweet.replace("We're", "We are")
    tweet = tweet.replace("That's", "That is")
    tweet = tweet.replace("won't", "will not")
    tweet = tweet.replace("they're", "they are")
    tweet = tweet.replace("Can't", "Cannot")
    tweet = tweet.replace("wasn't", "was not")
    tweet = tweet.replace("don\x89Ûªt", "do not")
    tweet = tweet.replace("aren't", "are not")
    tweet = tweet.replace("isn't", "is not")
    tweet = tweet.replace("What's", "What is")
    tweet = tweet.replace("haven't", "have not")
    tweet = tweet.replace("hasn't", "has not")
    tweet = tweet.replace("There's", "There is")
    tweet = tweet.replace("He's", "He is")
    tweet = tweet.replace("It's", "It is")
    tweet = tweet.replace("You're", "You are")
    tweet = tweet.replace("I'M", "I am")
    tweet = tweet.replace("shouldn't", "should not")
    tweet = tweet.replace("wouldn't", "would not")
    tweet = tweet.replace("i'm", "I am")
    tweet = tweet.replace("I\x89Ûªm", "I am")
    tweet = tweet.replace("I'm", "I am")
    tweet = tweet.replace("Isn't", "is not")
    tweet = tweet.replace("Here's", "Here is")
    tweet = tweet.replace("you've", "you have")
    tweet = tweet.replace("you\x89Ûªve", "you have")
    tweet = tweet.replace("we're", "we are")
    tweet = tweet.replace("what's", "what is")
    tweet = tweet.replace("couldn't", "could not")
    tweet = tweet.replace("we've", "we have")
    tweet = tweet.replace("it\x89Ûªs", "it is")
    tweet = tweet.replace("doesn\x89Ûªt", "does not")
    tweet = tweet.replace("It\x89Ûªs", "It is")
    tweet = tweet.replace("Here\x89Ûªs", "Here is")
    tweet = tweet.replace("who's", "who is")
    tweet = tweet.replace("I\x89Ûªve", "I have")
    tweet = tweet.replace("y'all", "you all")
    tweet = tweet.replace("can\x89Ûªt", "cannot")
    tweet = tweet.replace("would've", "would have")
    tweet = tweet.replace("it'll", "it will")
    tweet = tweet.replace("we'll", "we will")
    tweet = tweet.replace("wouldn\x89Ûªt", "would not")
    tweet = tweet.replace("We've", "We have")
    tweet = tweet.replace("he'll", "he will")
    tweet = tweet.replace("Y'all", "You all")
    tweet = tweet.replace("Weren't", "Were not")
    tweet = tweet.replace("Didn't", "Did not")
    tweet = tweet.replace("they'll", "they will")
    tweet = tweet.replace("they'd", "they would")
    tweet = tweet.replace("DON'T", "DO NOT")
    tweet = tweet.replace("That\x89Ûªs", "That is")
    tweet = tweet.replace("they've", "they have")
    tweet = tweet.replace("i'd", "I would")
    tweet = tweet.replace("should've", "should have")
    tweet = tweet.replace("You\x89Ûªre", "You are")
    tweet = tweet.replace("where's", "where is")
    tweet = tweet.replace("Don\x89Ûªt", "Do not")
    tweet = tweet.replace("we'd", "we would")
    tweet = tweet.replace("i'll", "I will")
    tweet = tweet.replace("weren't", "were not")
    tweet = tweet.replace("They're", "They are")
    tweet = tweet.replace("Can\x89Ûªt", "Cannot")
    tweet = tweet.replace("you\x89Ûªll", "you will")
    tweet = tweet.replace("I\x89Ûªd", "I would")
    tweet = tweet.replace("let's", "let us")
    tweet = tweet.replace("it's", "it is")
    tweet = tweet.replace("can't", "cannot")
    tweet = tweet.replace("don't", "do not")
    tweet = tweet.replace("you're", "you are")
    tweet = tweet.replace("i've", "I have")
    tweet = tweet.replace("that's", "that is")
    tweet = tweet.replace("i'll", "I will")
    tweet = tweet.replace("doesn't", "does not")
    tweet = tweet.replace("i'd", "I would")
    tweet = tweet.replace("didn't", "did not")
    tweet = tweet.replace("ain't", "am not")
    tweet = tweet.replace("you'll", "you will")
    tweet = tweet.replace("I've", "I have")
    tweet = tweet.replace("Don't", "do not")
    tweet = tweet.replace("I'll", "I will")
    tweet = tweet.replace("I'd", "I would")
    tweet = tweet.replace("Let's", "Let us")
    tweet = tweet.replace("you'd", "You would")
    tweet = tweet.replace("It's", "It is")
    tweet = tweet.replace("Ain't", "am not")
    tweet = tweet.replace("Haven't", "Have not")
    tweet = tweet.replace("Could've", "Could have")
    tweet = tweet.replace("youve", "you have")  
    tweet = tweet.replace("donå«t", "do not")
            
    # Character entity references
    tweet = tweet.replace("&gt;", ">")
    tweet = tweet.replace("&lt;", "<")
    tweet = tweet.replace("&amp;", "&")
    
    # Typos, slang and informal abbreviations
    tweet = tweet.replace("w/e", "whatever")
    tweet = tweet.replace("w/", "with")
    tweet = tweet.replace("USAgov", "USA government")
    tweet = tweet.replace("recentlu", "recently")
    tweet = tweet.replace("Ph0tos", "Photos")
    tweet = tweet.replace("amirite", "am I right")
    tweet = tweet.replace("exp0sed", "exposed")
    tweet = tweet.replace("<3", "love")
    tweet = tweet.replace("amageddon", "armageddon")
    tweet = tweet.replace("Trfc", "Traffic")
    tweet = tweet.replace("8/5/2015", "2015-08-05")
    tweet = tweet.replace("WindStorm", "Wind Storm")
    tweet = tweet.replace("8/6/2015", "2015-08-06")
    tweet = tweet.replace("10:38PM", "10:38 PM")
    tweet = tweet.replace("10:30pm", "10:30 PM")
    tweet = tweet.replace("16yr", "16 year")
    tweet = tweet.replace("lmao", "laughing my ass off")   
    tweet = tweet.replace("TRAUMATISED", "traumatized")
    
    # Hashtags and usernames
    tweet = tweet.replace("IranDeal", "Iran Deal")
    tweet = tweet.replace("ArianaGrande", "Ariana Grande")
    tweet = tweet.replace("camilacabello97", "camila cabello") 
    tweet = tweet.replace("RondaRousey", "Ronda Rousey")     
    tweet = tweet.replace("MTVHottest", "MTV Hottest")
    tweet = tweet.replace("TrapMusic", "Trap Music")
    tweet = tweet.replace("ProphetMuhammad", "Prophet Muhammad")
    tweet = tweet.replace("PantherAttack", "Panther Attack")
    tweet = tweet.replace("StrategicPatience", "Strategic Patience")
    tweet = tweet.replace("socialnews", "social news")
    tweet = tweet.replace("NASAHurricane", "NASA Hurricane")
    tweet = tweet.replace("onlinecommunities", "online communities")
    tweet = tweet.replace("humanconsumption", "human consumption")
    tweet = tweet.replace("Typhoon-Devastated", "Typhoon Devastated")
    tweet = tweet.replace("Meat-Loving", "Meat Loving")
    tweet = tweet.replace("facialabuse", "facial abuse")
    tweet = tweet.replace("LakeCounty", "Lake County")
    tweet = tweet.replace("BeingAuthor", "Being Author")
    tweet = tweet.replace("withheavenly", "with heavenly")
    tweet = tweet.replace("thankU", "thank you")
    tweet = tweet.replace("iTunesMusic", "iTunes Music")
    tweet = tweet.replace("OffensiveContent", "Offensive Content")
    tweet = tweet.replace("WorstSummerJob", "Worst Summer Job")
    tweet = tweet.replace("HarryBeCareful", "Harry Be Careful")
    tweet = tweet.replace("NASASolarSystem", "NASA Solar System")
    tweet = tweet.replace("animalrescue", "animal rescue")
    tweet = tweet.replace("KurtSchlichter", "Kurt Schlichter")
    tweet = tweet.replace("aRmageddon", "armageddon")
    tweet = tweet.replace("Throwingknifes", "Throwing knives")
    tweet = tweet.replace("GodsLove", "God's Love")
    tweet = tweet.replace("bookboost", "book boost")
    tweet = tweet.replace("ibooklove", "I book love")
    tweet = tweet.replace("NestleIndia", "Nestle India")
    tweet = tweet.replace("realDonaldTrump", "Donald Trump")
    tweet = tweet.replace("DavidVonderhaar", "David Vonderhaar")
    tweet = tweet.replace("CecilTheLion", "Cecil The Lion")
    tweet = tweet.replace("weathernetwork", "weather network")
    tweet = tweet.replace("withBioterrorism&use", "with Bioterrorism & use")
    tweet = tweet.replace("Hostage&2", "Hostage & 2")
    tweet = tweet.replace("GOPDebate", "GOP Debate")
    tweet = tweet.replace("RickPerry", "Rick Perry")
    tweet = tweet.replace("frontpage", "front page")
    tweet = tweet.replace("NewsInTweets", "News In Tweets")
    tweet = tweet.replace("ViralSpell", "Viral Spell")
    tweet = tweet.replace("til_now", "until now")
    tweet = tweet.replace("volcanoinRussia", "volcano in Russia")
    tweet = tweet.replace("ZippedNews", "Zipped News")
    tweet = tweet.replace("MicheleBachman", "Michele Bachman")
    tweet = tweet.replace("53inch", "53 inch")
    tweet = tweet.replace("KerrickTrial", "Kerrick Trial")
    tweet = tweet.replace("abstorm", "Alberta Storm")
    tweet = tweet.replace("Beyhive", "Beyonce hive")
    tweet = tweet.replace("IDFire", "Idaho Fire")
    tweet = tweet.replace("DETECTADO", "Detected")
    tweet = tweet.replace("RockyFire", "Rocky Fire")
    tweet = tweet.replace("Listen/Buy", "Listen / Buy")
    tweet = tweet.replace("NickCannon", "Nick Cannon")
    tweet = tweet.replace("FaroeIslands", "Faroe Islands")
    tweet = tweet.replace("yycstorm", "Calgary Storm")
    tweet = tweet.replace("IDPs:", "Internally Displaced People :")
    tweet = tweet.replace("ArtistsUnited", "Artists United")
    tweet = tweet.replace("ClaytonBryant", "Clayton Bryant")
    tweet = tweet.replace("jimmyfallon", "jimmy fallon")
    tweet = tweet.replace("justinbieber", "justin bieber")  
    tweet = tweet.replace("UTC2015", "UTC 2015")
    tweet = tweet.replace("Time2015", "Time 2015")
    tweet = tweet.replace("djicemoon", "dj icemoon")
    tweet = tweet.replace("LivingSafely", "Living Safely")
    tweet = tweet.replace("FIFA16", "Fifa 2016")
    tweet = tweet.replace("thisiswhywecanthavenicethings", "this is why we cannot have nice things")
    tweet = tweet.replace("bbcnews", "bbc news")
    tweet = tweet.replace("UndergroundRailraod", "Underground Railraod")
    tweet = tweet.replace("c4news", "c4 news")
    tweet = tweet.replace("OBLITERATION", "obliteration")
    tweet = tweet.replace("MUDSLIDE", "mudslide")
    tweet = tweet.replace("NoSurrender", "No Surrender")
    tweet = tweet.replace("NotExplained", "Not Explained")
    tweet = tweet.replace("greatbritishbakeoff", "great british bake off")
    tweet = tweet.replace("LondonFire", "London Fire")
    tweet = tweet.replace("KOTAWeather", "KOTA Weather")
    tweet = tweet.replace("LuchaUnderground", "Lucha Underground")
    tweet = tweet.replace("KOIN6News", "KOIN 6 News")
    tweet = tweet.replace("LiveOnK2", "Live On K2")
    tweet = tweet.replace("9NewsGoldCoast", "9 News Gold Coast")
    tweet = tweet.replace("nikeplus", "nike plus")
    tweet = tweet.replace("david_cameron", "David Cameron")
    tweet = tweet.replace("peterjukes", "Peter Jukes")
    tweet = tweet.replace("JamesMelville", "James Melville")
    tweet = tweet.replace("megynkelly", "Megyn Kelly")
    tweet = tweet.replace("cnewslive", "C News Live")
    tweet = tweet.replace("JamaicaObserver", "Jamaica Observer")
    tweet = tweet.replace("TweetLikeItsSeptember11th2001", "Tweet like it is september 11th 2001")
    tweet = tweet.replace("cbplawyers", "cbp lawyers")
    tweet = tweet.replace("fewmoretweets", "few more tweets")
    tweet = tweet.replace("BlackLivesMatter", "Black Lives Matter")
    tweet = tweet.replace("cjoyner", "Chris Joyner")
    tweet = tweet.replace("ENGvAUS", "England vs Australia")
    tweet = tweet.replace("ScottWalker", "Scott Walker")
    tweet = tweet.replace("MikeParrActor", "Michael Parr")
    tweet = tweet.replace("4PlayThursdays", "Foreplay Thursdays")
    tweet = tweet.replace("TGF2015", "Tontitown Grape Festival")
    tweet = tweet.replace("realmandyrain", "Mandy Rain")
    tweet = tweet.replace("GraysonDolan", "Grayson Dolan")
    tweet = tweet.replace("ApolloBrown", "Apollo Brown")
    tweet = tweet.replace("saddlebrooke", "Saddlebrooke")
    tweet = tweet.replace("TontitownGrape", "Tontitown Grape")
    tweet = tweet.replace("AbbsWinston", "Abbs Winston")
    tweet = tweet.replace("ShaunKing", "Shaun King")
    tweet = tweet.replace("MeekMill", "Meek Mill")
    tweet = tweet.replace("TornadoGiveaway", "Tornado Giveaway")
    tweet = tweet.replace("GRupdates", "GR updates")
    tweet = tweet.replace("SouthDowns", "South Downs")
    tweet = tweet.replace("braininjury", "brain injury")
    tweet = tweet.replace("auspol", "Australian politics")
    tweet = tweet.replace("PlannedParenthood", "Planned Parenthood")
    tweet = tweet.replace("calgaryweather", "Calgary Weather")
    tweet = tweet.replace("weallheartonedirection", "we all heart one direction")
    tweet = tweet.replace("edsheeran", "Ed Sheeran")
    tweet = tweet.replace("TrueHeroes", "True Heroes")
    tweet = tweet.replace("S3XLEAK", "sex leak")
    tweet = tweet.replace("ComplexMag", "Complex Magazine")
    tweet = tweet.replace("TheAdvocateMag", "The Advocate Magazine")
    tweet = tweet.replace("CityofCalgary", "City of Calgary")
    tweet = tweet.replace("EbolaOutbreak", "Ebola Outbreak")
    tweet = tweet.replace("SummerFate", "Summer Fate")
    tweet = tweet.replace("RAmag", "Royal Academy Magazine")
    tweet = tweet.replace("offers2go", "offers to go")
    tweet = tweet.replace("foodscare", "food scare")
    tweet = tweet.replace("MNPDNashville", "Metropolitan Nashville Police Department")
    tweet = tweet.replace("TfLBusAlerts", "TfL Bus Alerts")
    tweet = tweet.replace("GamerGate", "Gamer Gate")
    tweet = tweet.replace("IHHen", "Humanitarian Relief")
    tweet = tweet.replace("spinningbot", "spinning bot")
    tweet = tweet.replace("ModiMinistry", "Modi Ministry")
    tweet = tweet.replace("TAXIWAYS", "taxi ways")
    tweet = tweet.replace("Calum5SOS", "Calum Hood")
    tweet = tweet.replace("po_st", "po.st")
    tweet = tweet.replace("scoopit", "scoop.it")
    tweet = tweet.replace("UltimaLucha", "Ultima Lucha")
    tweet = tweet.replace("JonathanFerrell", "Jonathan Ferrell")
    tweet = tweet.replace("aria_ahrary", "Aria Ahrary")
    tweet = tweet.replace("rapidcity", "Rapid City")
    tweet = tweet.replace("OutBid", "outbid")
    tweet = tweet.replace("lavenderpoetrycafe", "lavender poetry cafe")
    tweet = tweet.replace("EudryLantiqua", "Eudry Lantiqua")
    tweet = tweet.replace("15PM", "15 PM")
    tweet = tweet.replace("OriginalFunko", "Funko")
    tweet = tweet.replace("rightwaystan", "Richard Tan")
    tweet = tweet.replace("CindyNoonan", "Cindy Noonan")
    tweet = tweet.replace("RT_America", "RT America")
    tweet = tweet.replace("narendramodi", "Narendra Modi")
    tweet = tweet.replace("BakeOffFriends", "Bake Off Friends")
    tweet = tweet.replace("TeamHendrick", "Hendrick Motorsports")
    tweet = tweet.replace("alexbelloli", "Alex Belloli")
    tweet = tweet.replace("itsjustinstuart", "Justin Stuart")
    tweet = tweet.replace("gunsense", "gun sense")
    tweet = tweet.replace("DebateQuestionsWeWantToHear", "debate questions we want to hear")
    tweet = tweet.replace("RoyalCarribean", "Royal Carribean")
    tweet = tweet.replace("samanthaturne19", "Samantha Turner")
    tweet = tweet.replace("JonVoyage", "Jon Stewart")
    tweet = tweet.replace("renew911health", "renew 911 health")
    tweet = tweet.replace("SuryaRay", "Surya Ray")
    tweet = tweet.replace("pattonoswalt", "Patton Oswalt")
    tweet = tweet.replace("minhazmerchant", "Minhaz Merchant")
    tweet = tweet.replace("TLVFaces", "Israel Diaspora Coalition")
    tweet = tweet.replace("pmarca", "Marc Andreessen")
    tweet = tweet.replace("pdx911", "Portland Police")
    tweet = tweet.replace("jamaicaplain", "Jamaica Plain")
    tweet = tweet.replace("Japton", "Arkansas")
    tweet = tweet.replace("RouteComplex", "Route Complex")
    tweet = tweet.replace("INSubcontinent", "Indian Subcontinent")
    tweet = tweet.replace("NJTurnpike", "New Jersey Turnpike")
    tweet = tweet.replace("Politifiact", "PolitiFact")
    tweet = tweet.replace("Hiroshima70", "Hiroshima")
    tweet = tweet.replace("GMMBC", "Greater Mt Moriah Baptist Church")
    tweet = tweet.replace("versethe", "verse the")
    tweet = tweet.replace("TubeStrike", "Tube Strike")
    tweet = tweet.replace("MissionHills", "Mission Hills")
    tweet = tweet.replace("ProtectDenaliWolves", "Protect Denali Wolves")
    tweet = tweet.replace("NANKANA", "Nankana")
    tweet = tweet.replace("SAHIB", "Sahib")
    tweet = tweet.replace("PAKPATTAN", "Pakpattan")
    tweet = tweet.replace("Newz_Sacramento", "News Sacramento")
    tweet = tweet.replace("gofundme", "go fund me")
    tweet = tweet.replace("pmharper", "Stephen Harper")
    tweet = tweet.replace("IvanBerroa", "Ivan Berroa")
    tweet = tweet.replace("LosDelSonido", "Los Del Sonido")
    tweet = tweet.replace("bancodeseries", "banco de series")
    tweet = tweet.replace("timkaine", "Tim Kaine")
    tweet = tweet.replace("IdentityTheft", "Identity Theft")
    tweet = tweet.replace("AllLivesMatter", "All Lives Matter")
    tweet = tweet.replace("mishacollins", "Misha Collins")
    tweet = tweet.replace("BillNeelyNBC", "Bill Neely")
    tweet = tweet.replace("BeClearOnCancer", "be clear on cancer")
    tweet = tweet.replace("Kowing", "Knowing")
    tweet = tweet.replace("ScreamQueens", "Scream Queens")
    tweet = tweet.replace("AskCharley", "Ask Charley")
    tweet = tweet.replace("BlizzHeroes", "Heroes of the Storm")
    tweet = tweet.replace("BradleyBrad47", "Bradley Brad")
    tweet = tweet.replace("HannaPH", "Typhoon Hanna")
    tweet = tweet.replace("meinlcymbals", "MEINL Cymbals")
    tweet = tweet.replace("Ptbo", "Peterborough")
    tweet = tweet.replace("cnnbrk", "CNN Breaking News")
    tweet = tweet.replace("IndianNews", "Indian News")
    tweet = tweet.replace("savebees", "save bees")
    tweet = tweet.replace("GreenHarvard", "Green Harvard")
    tweet = tweet.replace("StandwithPP", "Stand with planned parenthood")
    tweet = tweet.replace("hermancranston", "Herman Cranston")
    tweet = tweet.replace("WMUR9", "WMUR-TV")
    tweet = tweet.replace("RockBottomRadFM", "Rock Bottom Radio")
    tweet = tweet.replace("ameenshaikh3", "Ameen Shaikh")
    tweet = tweet.replace("ProSyn", "Project Syndicate")
    tweet = tweet.replace("Daesh", "ISIS")
    tweet = tweet.replace("s2g", "swear to god")
    tweet = tweet.replace("listenlive", "listen live")
    tweet = tweet.replace("CDCgov", "Centers for Disease Control and Prevention")
    tweet = tweet.replace("FoxNew", "Fox News")
    tweet = tweet.replace("CBSBigBrother", "Big Brother")
    tweet = tweet.replace("JulieDiCaro", "Julie DiCaro")
    tweet = tweet.replace("theadvocatemag", "The Advocate Magazine")
    tweet = tweet.replace("RohnertParkDPS", "Rohnert Park Police Department")
    tweet = tweet.replace("THISIZBWRIGHT", "Bonnie Wright")
    tweet = tweet.replace("Popularmmos", "Popular MMOs")
    tweet = tweet.replace("WildHorses", "Wild Horses")
    tweet = tweet.replace("FantasticFour", "Fantastic Four")
    tweet = tweet.replace("HORNDALE", "Horndale")
    tweet = tweet.replace("PINER", "Piner")
    tweet = tweet.replace("BathAndNorthEastSomerset", "Bath and North East Somerset")
    tweet = tweet.replace("thatswhatfriendsarefor", "that is what friends are for")
    tweet = tweet.replace("residualincome", "residual income")
    tweet = tweet.replace("YahooNewsDigest", "Yahoo News Digest")
    tweet = tweet.replace("MalaysiaAirlines", "Malaysia Airlines")
    tweet = tweet.replace("AmazonDeals", "Amazon Deals")
    tweet = tweet.replace("MissCharleyWebb", "Charley Webb")
    tweet = tweet.replace("shoalstraffic", "shoals traffic")
    tweet = tweet.replace("GeorgeFoster72", "George Foster")
    tweet = tweet.replace("pop2015", "pop 2015")
    tweet = tweet.replace("_PokemonCards_", "Pokemon Cards")
    tweet = tweet.replace("DianneG", "Dianne Gallagher")
    tweet = tweet.replace("KashmirConflict", "Kashmir Conflict")
    tweet = tweet.replace("BritishBakeOff", "British Bake Off")
    tweet = tweet.replace("FreeKashmir", "Free Kashmir")
    tweet = tweet.replace("mattmosley", "Matt Mosley")
    tweet = tweet.replace("BishopFred", "Bishop Fred")
    tweet = tweet.replace("EndConflict", "End Conflict")
    tweet = tweet.replace("EndOccupation", "End Occupation")
    tweet = tweet.replace("UNHEALED", "unhealed")
    tweet = tweet.replace("CharlesDagnall", "Charles Dagnall")
    tweet = tweet.replace("Latestnews", "Latest news")
    tweet = tweet.replace("KindleCountdown", "Kindle Countdown")
    tweet = tweet.replace("NoMoreHandouts", "No More Handouts")
    tweet = tweet.replace("datingtips", "dating tips")
    tweet = tweet.replace("charlesadler", "Charles Adler")
    tweet = tweet.replace("twia", "Texas Windstorm Insurance Association")
    tweet = tweet.replace("txlege", "Texas Legislature")
    tweet = tweet.replace("WindstormInsurer", "Windstorm Insurer")
    tweet = tweet.replace("Newss", "News")
    tweet = tweet.replace("hempoil", "hemp oil")
    tweet = tweet.replace("CommoditiesAre", "Commodities are")
    tweet = tweet.replace("tubestrike", "tube strike")
    tweet = tweet.replace("JoeNBC", "Joe Scarborough")
    tweet = tweet.replace("LiteraryCakes", "Literary Cakes")
    tweet = tweet.replace("TI5", "The International 5")
    tweet = tweet.replace("thehill", "the hill")
    tweet = tweet.replace("3others", "3 others")
    tweet = tweet.replace("stighefootball", "Sam Tighe")
    tweet = tweet.replace("whatstheimportantvideo", "what is the important video")
    tweet = tweet.replace("ClaudioMeloni", "Claudio Meloni")
    tweet = tweet.replace("DukeSkywalker", "Duke Skywalker")
    tweet = tweet.replace("carsonmwr", "Fort Carson")
    tweet = tweet.replace("offdishduty", "off dish duty")
    tweet = tweet.replace("andword", "and word")
    tweet = tweet.replace("rhodeisland", "Rhode Island")
    tweet = tweet.replace("easternoregon", "Eastern Oregon")
    tweet = tweet.replace("WAwildfire", "Washington Wildfire")
    tweet = tweet.replace("fingerrockfire", "Finger Rock Fire")
    tweet = tweet.replace("57am", "57 am")
    tweet = tweet.replace("fingerrockfire", "Finger Rock Fire")
    tweet = tweet.replace("JacobHoggard", "Jacob Hoggard")
    tweet = tweet.replace("newnewnew", "new new new")
    tweet = tweet.replace("under50", "under 50")
    tweet = tweet.replace("getitbeforeitsgone", "get it before it is gone")
    tweet = tweet.replace("freshoutofthebox", "fresh out of the box")
    tweet = tweet.replace("amwriting", "am writing")
    tweet = tweet.replace("Bokoharm", "Boko Haram")
    tweet = tweet.replace("Nowlike", "Now like")
    tweet = tweet.replace("seasonfrom", "season from")
    tweet = tweet.replace("epicente", "epicenter")
    tweet = tweet.replace("epicenterr", "epicenter")
    tweet = tweet.replace("sicklife", "sick life")
    tweet = tweet.replace("yycweather", "Calgary Weather")
    tweet = tweet.replace("calgarysun", "Calgary Sun")
    tweet = tweet.replace("approachng", "approaching")
    tweet = tweet.replace("evng", "evening")
    tweet = tweet.replace("Sumthng", "something")
    tweet = tweet.replace("EllenPompeo", "Ellen Pompeo")
    tweet = tweet.replace("shondarhimes", "Shonda Rhimes")
    tweet = tweet.replace("ABCNetwork", "ABC Network")
    tweet = tweet.replace("SushmaSwaraj", "Sushma Swaraj")
    tweet = tweet.replace("pray4japan", "Pray for Japan")
    tweet = tweet.replace("hope4japan", "Hope for Japan")
    tweet = tweet.replace("Illusionimagess", "Illusion images")
    tweet = tweet.replace("SummerUnderTheStars", "Summer Under The Stars")
    tweet = tweet.replace("ShallWeDance", "Shall We Dance")
    tweet = tweet.replace("TCMParty", "TCM Party")
    tweet = tweet.replace("marijuananews", "marijuana news")
    tweet = tweet.replace("onbeingwithKristaTippett", "on being with Krista Tippett")
    tweet = tweet.replace("Beingtweets", "Being tweets")
    tweet = tweet.replace("newauthors", "new authors")
    tweet = tweet.replace("remedyyyy", "remedy")
    tweet = tweet.replace("44PM", "44 PM")
    tweet = tweet.replace("HeadlinesApp", "Headlines App")
    tweet = tweet.replace("40PM", "40 PM")
    tweet = tweet.replace("myswc", "Severe Weather Center")
    tweet = tweet.replace("ithats", "that is")
    tweet = tweet.replace("icouldsitinthismomentforever", "I could sit in this moment forever")
    tweet = tweet.replace("FatLoss", "Fat Loss")
    tweet = tweet.replace("02PM", "02 PM")
    tweet = tweet.replace("MetroFmTalk", "Metro Fm Talk")
    tweet = tweet.replace("Bstrd", "bastard")
    tweet = tweet.replace("bldy", "bloody")
    tweet = tweet.replace("MetrofmTalk", "Metro Fm Talk")
    tweet = tweet.replace("terrorismturn", "terrorism turn")
    tweet = tweet.replace("BBCNewsAsia", "BBC News Asia")
    tweet = tweet.replace("BehindTheScenes", "Behind The Scenes")
    tweet = tweet.replace("GeorgeTakei", "George Takei")
    tweet = tweet.replace("WomensWeeklyMag", "Womens Weekly Magazine")
    tweet = tweet.replace("SurvivorsGuidetoEarth", "Survivors Guide to Earth")
    tweet = tweet.replace("incubusband", "incubus band")
    tweet = tweet.replace("Babypicturethis", "Baby picture this")
    tweet = tweet.replace("BombEffects", "Bomb Effects")
    tweet = tweet.replace("win10", "Windows 10")
    tweet = tweet.replace("idkidk", "I do not know I do not know")
    tweet = tweet.replace("TheWalkingDead", "The Walking Dead")
    tweet = tweet.replace("amyschumer", "Amy Schumer")
    tweet = tweet.replace("crewlist", "crew list")
    tweet = tweet.replace("Erdogans", "Erdogan")
    tweet = tweet.replace("BBCLive", "BBC Live")
    tweet = tweet.replace("TonyAbbottMHR", "Tony Abbott")
    tweet = tweet.replace("paulmyerscough", "Paul Myerscough")
    tweet = tweet.replace("georgegallagher", "George Gallagher")
    tweet = tweet.replace("JimmieJohnson", "Jimmie Johnson")
    tweet = tweet.replace("pctool", "pc tool")
    tweet = tweet.replace("DoingHashtagsRight", "Doing Hashtags Right")
    tweet = tweet.replace("ThrowbackThursday", "Throwback Thursday")
    tweet = tweet.replace("SnowBackSunday", "Snowback Sunday")
    tweet = tweet.replace("LakeEffect", "Lake Effect")
    tweet = tweet.replace("RTphotographyUK", "Richard Thomas Photography UK")
    tweet = tweet.replace("BigBang_CBS", "Big Bang CBS")
    tweet = tweet.replace("writerslife", "writers life")
    tweet = tweet.replace("NaturalBirth", "Natural Birth")
    tweet = tweet.replace("UnusualWords", "Unusual Words")
    tweet = tweet.replace("wizkhalifa", "Wiz Khalifa")
    tweet = tweet.replace("acreativedc", "a creative DC")
    tweet = tweet.replace("vscodc", "vsco DC")
    tweet = tweet.replace("VSCOcam", "vsco camera")
    tweet = tweet.replace("TheBEACHDC", "The beach DC")
    tweet = tweet.replace("buildingmuseum", "building museum")
    tweet = tweet.replace("WorldOil", "World Oil")
    tweet = tweet.replace("redwedding", "red wedding")
    tweet = tweet.replace("AmazingRaceCanada", "Amazing Race Canada")
    tweet = tweet.replace("WakeUpAmerica", "Wake Up America")
    tweet = tweet.replace("\\Allahuakbar\\", "Allahu Akbar")
    tweet = tweet.replace("bleased", "blessed")
    tweet = tweet.replace("nigeriantribune", "Nigerian Tribune")
    tweet = tweet.replace("HIDEO_KOJIMA_EN", "Hideo Kojima")
    tweet = tweet.replace("FusionFestival", "Fusion Festival")
    tweet = tweet.replace("50Mixed", "50 Mixed")
    tweet = tweet.replace("NoAgenda", "No Agenda")
    tweet = tweet.replace("WhiteGenocide", "White Genocide")
    tweet = tweet.replace("dirtylying", "dirty lying")
    tweet = tweet.replace("SyrianRefugees", "Syrian Refugees")
    tweet = tweet.replace("changetheworld", "change the world")
    tweet = tweet.replace("Ebolacase", "Ebola case")
    tweet = tweet.replace("mcgtech", "mcg technologies")
    tweet = tweet.replace("withweapons", "with weapons")
    tweet = tweet.replace("advancedwarfare", "advanced warfare")
    tweet = tweet.replace("letsFootball", "let us Football")
    tweet = tweet.replace("LateNiteMix", "late night mix")
    tweet = tweet.replace("PhilCollinsFeed", "Phil Collins")
    tweet = tweet.replace("RudyHavenstein", "Rudy Havenstein")
    tweet = tweet.replace("22PM", "22 PM")
    tweet = tweet.replace("54am", "54 AM")
    tweet = tweet.replace("38am", "38 AM")
    tweet = tweet.replace("OldFolkExplainStuff", "Old Folk Explain Stuff")
    tweet = tweet.replace("BlacklivesMatter", "Black Lives Matter")
    tweet = tweet.replace("InsaneLimits", "Insane Limits")
    tweet = tweet.replace("youcantsitwithus", "you cannot sit with us")
    tweet = tweet.replace("2k15", "2015")
    tweet = tweet.replace("TheIran", "Iran")
    tweet = tweet.replace("JimmyFallon", "Jimmy Fallon")
    tweet = tweet.replace("AlbertBrooks", "Albert Brooks")
    tweet = tweet.replace("defense_news", "defense news")
    tweet = tweet.replace("nuclearrcSA", "Nuclear Risk Control Self Assessment")
    tweet = tweet.replace("Auspol", "Australia Politics")
    tweet = tweet.replace("NuclearPower", "Nuclear Power")
    tweet = tweet.replace("WhiteTerrorism", "White Terrorism")
    tweet = tweet.replace("truthfrequencyradio", "Truth Frequency Radio")
    tweet = tweet.replace("ErasureIsNotEquality", "Erasure is not equality")
    tweet = tweet.replace("ProBonoNews", "Pro Bono News")
    tweet = tweet.replace("JakartaPost", "Jakarta Post")
    tweet = tweet.replace("toopainful", "too painful")
    tweet = tweet.replace("melindahaunton", "Melinda Haunton")
    tweet = tweet.replace("NoNukes", "No Nukes")
    tweet = tweet.replace("curryspcworld", "Currys PC World")
    tweet = tweet.replace("ineedcake", "I need cake")
    tweet = tweet.replace("blackforestgateau", "black forest gateau")
    tweet = tweet.replace("BBCOne", "BBC One")
    tweet = tweet.replace("AlexxPage", "Alex Page")
    tweet = tweet.replace("jonathanserrie", "Jonathan Serrie")
    tweet = tweet.replace("SocialJerkBlog", "Social Jerk Blog")
    tweet = tweet.replace("ChelseaVPeretti", "Chelsea Peretti")
    tweet = tweet.replace("irongiant", "iron giant")
    tweet = tweet.replace("RonFunches", "Ron Funches")
    tweet = tweet.replace("TimCook", "Tim Cook")
    tweet = tweet.replace("sebastianstanisaliveandwell", "Sebastian Stan is alive and well")
    tweet = tweet.replace("Madsummer", "Mad summer")
    tweet = tweet.replace("NowYouKnow", "Now you know")
    tweet = tweet.replace("concertphotography", "concert photography")
    tweet = tweet.replace("TomLandry", "Tom Landry")
    tweet = tweet.replace("showgirldayoff", "show girl day off")
    tweet = tweet.replace("Yougslavia", "Yugoslavia")
    tweet = tweet.replace("QuantumDataInformatics", "Quantum Data Informatics")
    tweet = tweet.replace("FromTheDesk", "From The Desk")
    tweet = tweet.replace("TheaterTrial", "Theater Trial")
    tweet = tweet.replace("CatoInstitute", "Cato Institute")
    tweet = tweet.replace("EmekaGift", "Emeka Gift")
    tweet = tweet.replace("LetsBe_Rational", "Let us be rational")
    tweet = tweet.replace("Cynicalreality", "Cynical reality")
    tweet = tweet.replace("FredOlsenCruise", "Fred Olsen Cruise")
    tweet = tweet.replace("NotSorry", "not sorry")
    tweet = tweet.replace("UseYourWords", "use your words")
    tweet = tweet.replace("WordoftheDay", "word of the day")
    tweet = tweet.replace("Dictionarycom", "Dictionary.com")
    tweet = tweet.replace("TheBrooklynLife", "The Brooklyn Life")
    tweet = tweet.replace("jokethey", "joke they")
    tweet = tweet.replace("nflweek1picks", "NFL week 1 picks")
    tweet = tweet.replace("uiseful", "useful")
    tweet = tweet.replace("JusticeDotOrg", "The American Association for Justice")
    tweet = tweet.replace("autoaccidents", "auto accidents")
    tweet = tweet.replace("SteveGursten", "Steve Gursten")
    tweet = tweet.replace("MichiganAutoLaw", "Michigan Auto Law")
    tweet = tweet.replace("birdgang", "bird gang")
    tweet = tweet.replace("nflnetwork", "NFL Network")
    tweet = tweet.replace("NYDNSports", "NY Daily News Sports")
    tweet = tweet.replace("RVacchianoNYDN", "Ralph Vacchiano NY Daily News")
    tweet = tweet.replace("EdmontonEsks", "Edmonton Eskimos")
    tweet = tweet.replace("david_brelsford", "David Brelsford")
    tweet = tweet.replace("TOI_India", "The Times of India")
    tweet = tweet.replace("hegot", "he got")
    tweet = tweet.replace("SkinsOn9", "Skins on 9")
    tweet = tweet.replace("sothathappened", "so that happened")
    tweet = tweet.replace("LCOutOfDoors", "LC Out Of Doors")
    tweet = tweet.replace("NationFirst", "Nation First")
    tweet = tweet.replace("IndiaToday", "India Today")
    tweet = tweet.replace("HLPS", "helps")
    tweet = tweet.replace("HOSTAGESTHROSW", "hostages throw")
    tweet = tweet.replace("SNCTIONS", "sanctions")
    tweet = tweet.replace("BidTime", "Bid Time")
    tweet = tweet.replace("crunchysensible", "crunchy sensible")
    tweet = tweet.replace("RandomActsOfRomance", "Random acts of romance")
    tweet = tweet.replace("MomentsAtHill", "Moments at hill")
    tweet = tweet.replace("eatshit", "eat shit")
    tweet = tweet.replace("liveleakfun", "live leak fun")
    tweet = tweet.replace("SahelNews", "Sahel News")
    tweet = tweet.replace("abc7newsbayarea", "ABC 7 News Bay Area")
    tweet = tweet.replace("facilitiesmanagement", "facilities management")
    tweet = tweet.replace("facilitydude", "facility dude")
    tweet = tweet.replace("CampLogistics", "Camp logistics")
    tweet = tweet.replace("alaskapublic", "Alaska public")
    tweet = tweet.replace("MarketResearch", "Market Research")
    tweet = tweet.replace("AccuracyEsports", "Accuracy Esports")
    tweet = tweet.replace("TheBodyShopAust", "The Body Shop Australia")
    tweet = tweet.replace("yychail", "Calgary hail")
    tweet = tweet.replace("yyctraffic", "Calgary traffic")
    tweet = tweet.replace("eliotschool", "eliot school")
    tweet = tweet.replace("TheBrokenCity", "The Broken City")
    tweet = tweet.replace("OldsFireDept", "Olds Fire Department")
    tweet = tweet.replace("RiverComplex", "River Complex")
    tweet = tweet.replace("fieldworksmells", "field work smells")
    tweet = tweet.replace("IranElection", "Iran Election")
    tweet = tweet.replace("glowng", "glowing")
    tweet = tweet.replace("kindlng", "kindling")
    tweet = tweet.replace("riggd", "rigged")
    tweet = tweet.replace("slownewsday", "slow news day")
    tweet = tweet.replace("MyanmarFlood", "Myanmar Flood")
    tweet = tweet.replace("abc7chicago", "ABC 7 Chicago")
    tweet = tweet.replace("copolitics", "Colorado Politics")
    tweet = tweet.replace("AdilGhumro", "Adil Ghumro")
    tweet = tweet.replace("netbots", "net bots")
    tweet = tweet.replace("byebyeroad", "bye bye road")
    tweet = tweet.replace("massiveflooding", "massive flooding")
    tweet = tweet.replace("EndofUS", "End of United States")
    tweet = tweet.replace("35PM", "35 PM")
    tweet = tweet.replace("greektheatrela", "Greek Theatre Los Angeles")
    tweet = tweet.replace("76mins", "76 minutes")
    tweet = tweet.replace("publicsafetyfirst", "public safety first")
    tweet = tweet.replace("livesmatter", "lives matter")
    tweet = tweet.replace("myhometown", "my hometown")
    tweet = tweet.replace("tankerfire", "tanker fire")
    tweet = tweet.replace("MEMORIALDAY", "memorial day")
    tweet = tweet.replace("MEMORIAL_DAY", "memorial day")
    tweet = tweet.replace("instaxbooty", "instagram booty")
    tweet = tweet.replace("Jerusalem_Post", "Jerusalem Post")
    tweet = tweet.replace("WayneRooney_INA", "Wayne Rooney")
    tweet = tweet.replace("VirtualReality", "Virtual Reality")
    tweet = tweet.replace("OculusRift", "Oculus Rift")
    tweet = tweet.replace("OwenJones84", "Owen Jones")
    tweet = tweet.replace("jeremycorbyn", "Jeremy Corbyn")
    tweet = tweet.replace("paulrogers002", "Paul Rogers")
    tweet = tweet.replace("mortalkombatx", "Mortal Kombat X")
    tweet = tweet.replace("mortalkombat", "Mortal Kombat")
    tweet = tweet.replace("FilipeCoelho92", "Filipe Coelho")
    tweet = tweet.replace("OnlyQuakeNews", "Only Quake News")
    tweet = tweet.replace("kostumes", "costumes")
    tweet = tweet.replace("YEEESSSS", "yes")
    tweet = tweet.replace("ToshikazuKatayama", "Toshikazu Katayama")
    tweet = tweet.replace("IntlDevelopment", "Intl Development")
    tweet = tweet.replace("ExtremeWeather", "Extreme Weather")
    tweet = tweet.replace("WereNotGruberVoters", "We are not gruber voters")
    tweet = tweet.replace("NewsThousands", "News Thousands")
    tweet = tweet.replace("EdmundAdamus", "Edmund Adamus")
    tweet = tweet.replace("EyewitnessWV", "Eye witness WV")
    tweet = tweet.replace("PhiladelphiaMuseu", "Philadelphia Museum")
    tweet = tweet.replace("DublinComicCon", "Dublin Comic Con")
    tweet = tweet.replace("NicholasBrendon", "Nicholas Brendon")
    tweet = tweet.replace("Alltheway80s", "All the way 80s")
    tweet = tweet.replace("FromTheField", "From the field")
    tweet = tweet.replace("NorthIowa", "North Iowa")
    tweet = tweet.replace("WillowFire", "Willow Fire")
    tweet = tweet.replace("MadRiverComplex", "Mad River Complex")
    tweet = tweet.replace("feelingmanly", "feeling manly")
    tweet = tweet.replace("stillnotoverit", "still not over it")
    tweet = tweet.replace("FortitudeValley", "Fortitude Valley")
    tweet = tweet.replace("CoastpowerlineTramTr", "Coast powerline")
    tweet = tweet.replace("ServicesGold", "Services Gold")
    tweet = tweet.replace("NewsbrokenEmergency", "News broken emergency")
    tweet = tweet.replace("Evaucation", "evacuation")
    tweet = tweet.replace("leaveevacuateexitbe", "leave evacuate exit be")
    tweet = tweet.replace("P_EOPLE", "PEOPLE")
    tweet = tweet.replace("Tubestrike", "tube strike")
    tweet = tweet.replace("CLASS_SICK", "CLASS SICK")
    tweet = tweet.replace("localplumber", "local plumber")
    tweet = tweet.replace("awesomejobsiri", "awesome job siri")
    tweet = tweet.replace("PayForItHow", "Pay for it how")
    tweet = tweet.replace("ThisIsAfrica", "This is Africa")
    tweet = tweet.replace("crimeairnetwork", "crime air network")
    tweet = tweet.replace("KimAcheson", "Kim Acheson")
    tweet = tweet.replace("cityofcalgary", "City of Calgary")
    tweet = tweet.replace("prosyndicate", "pro syndicate")
    tweet = tweet.replace("660NEWS", "660 NEWS")
    tweet = tweet.replace("BusInsMagazine", "Business Insurance Magazine")
    tweet = tweet.replace("wfocus", "focus")
    tweet = tweet.replace("ShastaDam", "Shasta Dam")
    tweet = tweet.replace("go2MarkFranco", "Mark Franco")
    tweet = tweet.replace("StephGHinojosa", "Steph Hinojosa")
    tweet = tweet.replace("Nashgrier", "Nash Grier")
    tweet = tweet.replace("NashNewVideo", "Nash new video")
    tweet = tweet.replace("IWouldntGetElectedBecause", "I would not get elected because")
    tweet = tweet.replace("SHGames", "Sledgehammer Games")
    tweet = tweet.replace("bedhair", "bed hair")
    tweet = tweet.replace("JoelHeyman", "Joel Heyman")
    tweet = tweet.replace("viaYouTube", "via YouTube")

    # Acronyms
    tweet = tweet.replace("MH370", "Malaysia Airlines Flight 370")
    tweet = tweet.replace("mÌ¼sica", "music")
    tweet = tweet.replace("okwx", "Oklahoma City Weather")
    tweet = tweet.replace("arwx", "Arkansas Weather")    
    tweet = tweet.replace("gawx", "Georgia Weather")  
    tweet = tweet.replace("scwx", "South Carolina Weather")  
    tweet = tweet.replace("cawx", "California Weather")
    tweet = tweet.replace("tnwx", "Tennessee Weather")
    tweet = tweet.replace("azwx", "Arizona Weather")  
    tweet = tweet.replace("alwx", "Alabama Weather")
    tweet = tweet.replace("wordpressdotcom", "wordpress")    
    tweet = tweet.replace("usNWSgov", "United States National Weather Service")
    tweet = tweet.replace("Suruc", "Sanliurfa")

    # Grouping same words without embeddings
    tweet = tweet.replace("Bestnaijamade", "bestnaijamade")
    tweet = tweet.replace("SOUDELOR", "Soudelor") 

    # ... and ..
    tweet = tweet.replace('...', ' ... ')
    if '...' not in tweet:
        tweet = tweet.replace('..', ' ... ')  

    # Remove html tag
    html = re.compile(r'<.*?>')
    tweet = html.sub(r'', tweet) 

    # URLs
    url = re.compile(r'https?://\S+|www\.\S+')
    tweet = url.sub(r'', tweet)

    # Remove emojis
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    tweet = emoji_pattern.sub(r'', tweet)

    # Removing punctuations
    table = str.maketrans('', '', string.punctuation)
    tweet = tweet.translate(table)  

    # spell correction:
    # tweet = correct_spellings(tweet)
    return tweet.rstrip()

df['text'] = df['text'].apply(lambda x : clean(x))

## GloVe for Vectorization

Here we will use GloVe pretrained corpus model to represent our words.It is available in 3 varieties :50D ,100D and 200 Dimentional.We will try 100 D here.

In [0]:
def create_corpus(df):
    corpus=[]
    for tweet in tqdm(df['text']):
        words=[word.lower() for word in word_tokenize(tweet) if((word.isalpha()==1) & (word not in stop))]
        corpus.append(words)
    return corpus

In [0]:
import nltk
nltk.download('punkt')
corpus = create_corpus(df)

  6%|▌         | 613/10759 [00:00<00:01, 6118.00it/s]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


100%|██████████| 10759/10759 [00:01<00:00, 6578.63it/s]


In [0]:
embedding_dict={}
with open('./glove.6B.100d.txt','r') as f:
    for line in f:
        values=line.split()
        word=values[0]
        vectors=np.asarray(values[1:],'float32')
        embedding_dict[word]=vectors
f.close()

In [0]:
MAX_LEN=50
tokenizer_obj=Tokenizer()
tokenizer_obj.fit_on_texts(corpus)
sequences=tokenizer_obj.texts_to_sequences(corpus)

tweet_pad=pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')

In [0]:
word_index=tokenizer_obj.word_index
print('Number of unique words:',len(word_index))

Number of unique words: 20025


In [0]:
num_words=len(word_index)+1
embedding_matrix=np.zeros((num_words,100))

for word,i in tqdm(word_index.items()):
    if i > num_words:
        continue
    
    emb_vec=embedding_dict.get(word)
    if emb_vec is not None:
        embedding_matrix[i]=emb_vec
            

100%|██████████| 20025/20025 [00:00<00:00, 474860.42it/s]


## Baseline Model

In [0]:
model=Sequential()

embedding=Embedding(num_words,100,embeddings_initializer=Constant(embedding_matrix),
                   input_length=MAX_LEN,trainable=False)

model.add(embedding)
model.add(SpatialDropout1D(0.2))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))


optimzer=Adam(lr=1e-5)

model.compile(loss='binary_crossentropy',optimizer=optimzer,metrics=['accuracy'])

In [0]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 50, 100)           2002600   
_________________________________________________________________
spatial_dropout1d_3 (Spatial (None, 50, 100)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 64)                42240     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 2,044,905
Trainable params: 42,305
Non-trainable params: 2,002,600
_________________________________________________________________


In [0]:
train=tweet_pad[:tweet.shape[0]]
test=tweet_pad[tweet.shape[0]:]

In [0]:
X_train,X_test,y_train,y_test=train_test_split(train,tweet['target'].values,test_size=0.2)
print('Shape of train',X_train.shape)
print("Shape of Validation ",X_test.shape)

Shape of train (5996, 50)
Shape of Validation  (1500, 50)


In [0]:
history=model.fit(X_train,y_train,batch_size=4,epochs=15,validation_data=(X_test,y_test),verbose=2)

Train on 5996 samples, validate on 1500 samples
Epoch 1/15
 - 56s - loss: 0.6915 - acc: 0.5664 - val_loss: 0.6896 - val_acc: 0.5640
Epoch 2/15
 - 55s - loss: 0.6776 - acc: 0.5745 - val_loss: 0.6090 - val_acc: 0.6987
Epoch 3/15
 - 54s - loss: 0.5906 - acc: 0.7071 - val_loss: 0.5220 - val_acc: 0.7740
Epoch 4/15
 - 54s - loss: 0.5686 - acc: 0.7250 - val_loss: 0.5087 - val_acc: 0.7820
Epoch 5/15
 - 54s - loss: 0.5616 - acc: 0.7365 - val_loss: 0.4982 - val_acc: 0.7787
Epoch 6/15
 - 55s - loss: 0.5527 - acc: 0.7468 - val_loss: 0.4955 - val_acc: 0.7873
Epoch 7/15
 - 55s - loss: 0.5472 - acc: 0.7513 - val_loss: 0.4887 - val_acc: 0.7913
Epoch 8/15
 - 55s - loss: 0.5468 - acc: 0.7485 - val_loss: 0.4882 - val_acc: 0.7873
Epoch 9/15
 - 55s - loss: 0.5468 - acc: 0.7535 - val_loss: 0.4855 - val_acc: 0.7880
Epoch 10/15
 - 54s - loss: 0.5364 - acc: 0.7590 - val_loss: 0.4816 - val_acc: 0.7933
Epoch 11/15
 - 55s - loss: 0.5333 - acc: 0.7637 - val_loss: 0.4792 - val_acc: 0.7907
Epoch 12/15
 - 55s - loss:

## Making our submission

In [0]:
sample_sub=pd.read_csv('./sample_submission.csv')

In [0]:
y_pre=model.predict(test)
y_pre=np.round(y_pre).astype(int).reshape(3263)
sub=pd.DataFrame({'id':sample_sub['id'].values.tolist(),'target':y_pre})
sub.to_csv('submission.csv',index=False)


In [0]:
sub.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1
