In [2]:
import string
from collections import defaultdict
import operator
import gc
import pickle

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import matplotlib.pyplot as plt
import seaborn as sns

import re
import nltk
nltk.download('stopwords')
stemmer = nltk.SnowballStemmer("english")
from nltk.corpus import stopwords
stopword=set(stopwords.words('english'))
from wordcloud import WordCloud, STOPWORDS
from nltk.tokenize import word_tokenize

import xgboost as xgb

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfTransformer

import keras
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding,SpatialDropout1D
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.models import Sequential

import warnings

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/javier/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Index

1. [Importing Data](#Data)
2. [Data Analysis](../Analysis/train_data_analysis.ipynb)
3. [Embeddings and Text Cleaning](#Embeddings-and-Text-Cleaning)
4. [Preprocessing](#Preprocessing)
5. [Models](#Models)
6. [Real Test](#Real-Test)

# Data

In [3]:
part_paths = [
    '../data/kaggle/sentiment/training.1600000.processed.noemoticon.csv.part1',
    '../data/kaggle/sentiment/training.1600000.processed.noemoticon.csv.part2',
    '../data/kaggle/sentiment/training.1600000.processed.noemoticon.csv.part3',
    '../data/kaggle/sentiment/training.1600000.processed.noemoticon.csv.part4',
    '../data/kaggle/sentiment/training.1600000.processed.noemoticon.csv.part5',
    '../data/kaggle/sentiment/training.1600000.processed.noemoticon.csv.part6',
    '../data/kaggle/sentiment/training.1600000.processed.noemoticon.csv.part7',
    '../data/kaggle/sentiment/training.1600000.processed.noemoticon.csv.part8',
    '../data/kaggle/sentiment/training.1600000.processed.noemoticon.csv.part9',
    '../data/kaggle/sentiment/training.1600000.processed.noemoticon.csv.part10',
    '../data/kaggle/sentiment/training.1600000.processed.noemoticon.csv.part11',
    '../data/kaggle/sentiment/training.1600000.processed.noemoticon.csv.part12',
]

In [4]:
df_list = [pd.read_csv(part_path, encoding='utf-8', lineterminator='\n', on_bad_lines='skip') for part_path in part_paths]
data = pd.concat(df_list, ignore_index=True)

In [5]:
data

Unnamed: 0,target,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


# Text Cleaning

The `clean(text)` function takes a text as input and performs a series of transformations to clean and normalize the text. Here is a brief explanation of what each step does:

* Converts the text to lowercase.
* Removes special characters and replaces specific characters with others.
* Replaces common contractions with their full forms.
* Replaces character entity references such as ">" and "<".
* Corrects typos, slang, and informal abbreviations.
* Replaces hashtags and social media usernames with their full forms or descriptions.
* Cleans and normalizes words related to natural disasters, current events, and other specific words.
* Performs additional corrections and normalizations in the text.
* Removes URLs starting with "http://" or "https://" followed by alphanumeric characters.
* Replaces each punctuation and special character with a space and the punctuation/special character itself.
* Replaces specific acronyms with their expanded forms or related terms.
* Removes Tags, Links, and Punctuation.
* Does Stopword Removal and Stemming

In [6]:
def clean(text): 
    text = str(text).lower()

    # Special characters
    text = re.sub(r"\x89Û_", "", text)
    text = re.sub(r"\x89ÛÒ", "", text)
    text = re.sub(r"\x89ÛÓ", "", text)
    text = re.sub(r"\x89ÛÏWhen", "When", text)
    text = re.sub(r"\x89ÛÏ", "", text)
    text = re.sub(r"China\x89Ûªs", "China's", text)
    text = re.sub(r"let\x89Ûªs", "let's", text)
    text = re.sub(r"\x89Û÷", "", text)
    text = re.sub(r"\x89Ûª", "", text)
    text = re.sub(r"\x89Û\x9d", "", text)
    text = re.sub(r"å_", "", text)
    text = re.sub(r"\x89Û¢", "", text)
    text = re.sub(r"\x89Û¢åÊ", "", text)
    text = re.sub(r"fromåÊwounds", "from wounds", text)
    text = re.sub(r"åÊ", "", text)
    text = re.sub(r"åÈ", "", text)
    text = re.sub(r"JapÌ_n", "Japan", text)    
    text = re.sub(r"Ì©", "e", text)
    text = re.sub(r"å¨", "", text)
    text = re.sub(r"SuruÌ¤", "Suruc", text)
    text = re.sub(r"åÇ", "", text)
    text = re.sub(r"å£3million", "3 million", text)
    text = re.sub(r"åÀ", "", text)
    
    # Contractions
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"there's", "there is", text)
    text = re.sub(r"We're", "We are", text)
    text = re.sub(r"That's", "That is", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"they're", "they are", text)
    text = re.sub(r"Can't", "Cannot", text)
    text = re.sub(r"wasn't", "was not", text)
    text = re.sub(r"don\x89Ûªt", "do not", text)
    text = re.sub(r"aren't", "are not", text)
    text = re.sub(r"isn't", "is not", text)
    text = re.sub(r"What's", "What is", text)
    text = re.sub(r"haven't", "have not", text)
    text = re.sub(r"hasn't", "has not", text)
    text = re.sub(r"There's", "There is", text)
    text = re.sub(r"He's", "He is", text)
    text = re.sub(r"It's", "It is", text)
    text = re.sub(r"You're", "You are", text)
    text = re.sub(r"I'M", "I am", text)
    text = re.sub(r"shouldn't", "should not", text)
    text = re.sub(r"wouldn't", "would not", text)
    text = re.sub(r"i'm", "I am", text)
    text = re.sub(r"I\x89Ûªm", "I am", text)
    text = re.sub(r"I'm", "I am", text)
    text = re.sub(r"Isn't", "is not", text)
    text = re.sub(r"Here's", "Here is", text)
    text = re.sub(r"you've", "you have", text)
    text = re.sub(r"you\x89Ûªve", "you have", text)
    text = re.sub(r"we're", "we are", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"couldn't", "could not", text)
    text = re.sub(r"we've", "we have", text)
    text = re.sub(r"it\x89Ûªs", "it is", text)
    text = re.sub(r"doesn\x89Ûªt", "does not", text)
    text = re.sub(r"It\x89Ûªs", "It is", text)
    text = re.sub(r"Here\x89Ûªs", "Here is", text)
    text = re.sub(r"who's", "who is", text)
    text = re.sub(r"I\x89Ûªve", "I have", text)
    text = re.sub(r"y'all", "you all", text)
    text = re.sub(r"can\x89Ûªt", "cannot", text)
    text = re.sub(r"would've", "would have", text)
    text = re.sub(r"it'll", "it will", text)
    text = re.sub(r"we'll", "we will", text)
    text = re.sub(r"wouldn\x89Ûªt", "would not", text)
    text = re.sub(r"We've", "We have", text)
    text = re.sub(r"he'll", "he will", text)
    text = re.sub(r"Y'all", "You all", text)
    text = re.sub(r"Weren't", "Were not", text)
    text = re.sub(r"Didn't", "Did not", text)
    text = re.sub(r"they'll", "they will", text)
    text = re.sub(r"they'd", "they would", text)
    text = re.sub(r"DON'T", "DO NOT", text)
    text = re.sub(r"That\x89Ûªs", "That is", text)
    text = re.sub(r"they've", "they have", text)
    text = re.sub(r"i'd", "I would", text)
    text = re.sub(r"should've", "should have", text)
    text = re.sub(r"You\x89Ûªre", "You are", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"Don\x89Ûªt", "Do not", text)
    text = re.sub(r"we'd", "we would", text)
    text = re.sub(r"i'll", "I will", text)
    text = re.sub(r"weren't", "were not", text)
    text = re.sub(r"They're", "They are", text)
    text = re.sub(r"Can\x89Ûªt", "Cannot", text)
    text = re.sub(r"you\x89Ûªll", "you will", text)
    text = re.sub(r"I\x89Ûªd", "I would", text)
    text = re.sub(r"let's", "let us", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"don't", "do not", text)
    text = re.sub(r"you're", "you are", text)
    text = re.sub(r"i've", "I have", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"i'll", "I will", text)
    text = re.sub(r"doesn't", "does not", text)
    text = re.sub(r"i'd", "I would", text)
    text = re.sub(r"didn't", "did not", text)
    text = re.sub(r"ain't", "am not", text)
    text = re.sub(r"you'll", "you will", text)
    text = re.sub(r"I've", "I have", text)
    text = re.sub(r"Don't", "do not", text)
    text = re.sub(r"I'll", "I will", text)
    text = re.sub(r"I'd", "I would", text)
    text = re.sub(r"Let's", "Let us", text)
    text = re.sub(r"you'd", "You would", text)
    text = re.sub(r"It's", "It is", text)
    text = re.sub(r"Ain't", "am not", text)
    text = re.sub(r"Haven't", "Have not", text)
    text = re.sub(r"Could've", "Could have", text)
    text = re.sub(r"youve", "you have", text)  
    text = re.sub(r"donå«t", "do not", text)   
            
    # Character entity references
    text = re.sub(r"&gt;", ">", text)
    text = re.sub(r"&lt;", "<", text)
    text = re.sub(r"&amp;", "&", text)
    
    # Typos, slang and informal abbreviations
    text = re.sub(r"w/e", "whatever", text)
    text = re.sub(r"w/", "with", text)
    text = re.sub(r"USAgov", "USA government", text)
    text = re.sub(r"recentlu", "recently", text)
    text = re.sub(r"Ph0tos", "Photos", text)
    text = re.sub(r"amirite", "am I right", text)
    text = re.sub(r"exp0sed", "exposed", text)
    text = re.sub(r"<3", "love", text)
    text = re.sub(r"amageddon", "armageddon", text)
    text = re.sub(r"Trfc", "Traffic", text)
    text = re.sub(r"8/5/2015", "2015-08-05", text)
    text = re.sub(r"WindStorm", "Wind Storm", text)
    text = re.sub(r"8/6/2015", "2015-08-06", text)
    text = re.sub(r"10:38PM", "10:38 PM", text)
    text = re.sub(r"10:30pm", "10:30 PM", text)
    text = re.sub(r"16yr", "16 year", text)
    text = re.sub(r"lmao", "laughing my ass off", text)   
    text = re.sub(r"TRAUMATISED", "traumatized", text)
    
    # Hashtags and usernames
    text = re.sub(r"IranDeal", "Iran Deal", text)
    text = re.sub(r"ArianaGrande", "Ariana Grande", text)
    text = re.sub(r"camilacabello97", "camila cabello", text) 
    text = re.sub(r"RondaRousey", "Ronda Rousey", text)     
    text = re.sub(r"MTVHottest", "MTV Hottest", text)
    text = re.sub(r"TrapMusic", "Trap Music", text)
    text = re.sub(r"ProphetMuhammad", "Prophet Muhammad", text)
    text = re.sub(r"PantherAttack", "Panther Attack", text)
    text = re.sub(r"StrategicPatience", "Strategic Patience", text)
    text = re.sub(r"socialnews", "social news", text)
    text = re.sub(r"NASAHurricane", "NASA Hurricane", text)
    text = re.sub(r"onlinecommunities", "online communities", text)
    text = re.sub(r"humanconsumption", "human consumption", text)
    text = re.sub(r"Typhoon-Devastated", "Typhoon Devastated", text)
    text = re.sub(r"Meat-Loving", "Meat Loving", text)
    text = re.sub(r"facialabuse", "facial abuse", text)
    text = re.sub(r"LakeCounty", "Lake County", text)
    text = re.sub(r"BeingAuthor", "Being Author", text)
    text = re.sub(r"withheavenly", "with heavenly", text)
    text = re.sub(r"thankU", "thank you", text)
    text = re.sub(r"iTunesMusic", "iTunes Music", text)
    text = re.sub(r"OffensiveContent", "Offensive Content", text)
    text = re.sub(r"WorstSummerJob", "Worst Summer Job", text)
    text = re.sub(r"HarryBeCareful", "Harry Be Careful", text)
    text = re.sub(r"NASASolarSystem", "NASA Solar System", text)
    text = re.sub(r"animalrescue", "animal rescue", text)
    text = re.sub(r"KurtSchlichter", "Kurt Schlichter", text)
    text = re.sub(r"aRmageddon", "armageddon", text)
    text = re.sub(r"Throwingknifes", "Throwing knives", text)
    text = re.sub(r"GodsLove", "God's Love", text)
    text = re.sub(r"bookboost", "book boost", text)
    text = re.sub(r"ibooklove", "I book love", text)
    text = re.sub(r"NestleIndia", "Nestle India", text)
    text = re.sub(r"realDonaldTrump", "Donald Trump", text)
    text = re.sub(r"DavidVonderhaar", "David Vonderhaar", text)
    text = re.sub(r"CecilTheLion", "Cecil The Lion", text)
    text = re.sub(r"weathernetwork", "weather network", text)
    text = re.sub(r"withBioterrorism&use", "with Bioterrorism & use", text)
    text = re.sub(r"Hostage&2", "Hostage & 2", text)
    text = re.sub(r"GOPDebate", "GOP Debate", text)
    text = re.sub(r"RickPerry", "Rick Perry", text)
    text = re.sub(r"frontpage", "front page", text)
    text = re.sub(r"NewsIntexts", "News In texts", text)
    text = re.sub(r"ViralSpell", "Viral Spell", text)
    text = re.sub(r"til_now", "until now", text)
    text = re.sub(r"volcanoinRussia", "volcano in Russia", text)
    text = re.sub(r"ZippedNews", "Zipped News", text)
    text = re.sub(r"MicheleBachman", "Michele Bachman", text)
    text = re.sub(r"53inch", "53 inch", text)
    text = re.sub(r"KerrickTrial", "Kerrick Trial", text)
    text = re.sub(r"abstorm", "Alberta Storm", text)
    text = re.sub(r"Beyhive", "Beyonce hive", text)
    text = re.sub(r"IDFire", "Idaho Fire", text)
    text = re.sub(r"DETECTADO", "Detected", text)
    text = re.sub(r"RockyFire", "Rocky Fire", text)
    text = re.sub(r"Listen/Buy", "Listen / Buy", text)
    text = re.sub(r"NickCannon", "Nick Cannon", text)
    text = re.sub(r"FaroeIslands", "Faroe Islands", text)
    text = re.sub(r"yycstorm", "Calgary Storm", text)
    text = re.sub(r"IDPs:", "Internally Displaced People :", text)
    text = re.sub(r"ArtistsUnited", "Artists United", text)
    text = re.sub(r"ClaytonBryant", "Clayton Bryant", text)
    text = re.sub(r"jimmyfallon", "jimmy fallon", text)
    text = re.sub(r"justinbieber", "justin bieber", text)  
    text = re.sub(r"UTC2015", "UTC 2015", text)
    text = re.sub(r"Time2015", "Time 2015", text)
    text = re.sub(r"djicemoon", "dj icemoon", text)
    text = re.sub(r"LivingSafely", "Living Safely", text)
    text = re.sub(r"FIFA16", "Fifa 2016", text)
    text = re.sub(r"thisiswhywecanthavenicethings", "this is why we cannot have nice things", text)
    text = re.sub(r"bbcnews", "bbc news", text)
    text = re.sub(r"UndergroundRailraod", "Underground Railraod", text)
    text = re.sub(r"c4news", "c4 news", text)
    text = re.sub(r"OBLITERATION", "obliteration", text)
    text = re.sub(r"MUDSLIDE", "mudslide", text)
    text = re.sub(r"NoSurrender", "No Surrender", text)
    text = re.sub(r"NotExplained", "Not Explained", text)
    text = re.sub(r"greatbritishbakeoff", "great british bake off", text)
    text = re.sub(r"LondonFire", "London Fire", text)
    text = re.sub(r"KOTAWeather", "KOTA Weather", text)
    text = re.sub(r"LuchaUnderground", "Lucha Underground", text)
    text = re.sub(r"KOIN6News", "KOIN 6 News", text)
    text = re.sub(r"LiveOnK2", "Live On K2", text)
    text = re.sub(r"9NewsGoldCoast", "9 News Gold Coast", text)
    text = re.sub(r"nikeplus", "nike plus", text)
    text = re.sub(r"david_cameron", "David Cameron", text)
    text = re.sub(r"peterjukes", "Peter Jukes", text)
    text = re.sub(r"JamesMelville", "James Melville", text)
    text = re.sub(r"megynkelly", "Megyn Kelly", text)
    text = re.sub(r"cnewslive", "C News Live", text)
    text = re.sub(r"JamaicaObserver", "Jamaica Observer", text)
    text = re.sub(r"textLikeItsSeptember11th2001", "text like it is september 11th 2001", text)
    text = re.sub(r"cbplawyers", "cbp lawyers", text)
    text = re.sub(r"fewmoretexts", "few more texts", text)
    text = re.sub(r"BlackLivesMatter", "Black Lives Matter", text)
    text = re.sub(r"cjoyner", "Chris Joyner", text)
    text = re.sub(r"ENGvAUS", "England vs Australia", text)
    text = re.sub(r"ScottWalker", "Scott Walker", text)
    text = re.sub(r"MikeParrActor", "Michael Parr", text)
    text = re.sub(r"4PlayThursdays", "Foreplay Thursdays", text)
    text = re.sub(r"TGF2015", "Tontitown Grape Festival", text)
    text = re.sub(r"realmandyrain", "Mandy Rain", text)
    text = re.sub(r"GraysonDolan", "Grayson Dolan", text)
    text = re.sub(r"ApolloBrown", "Apollo Brown", text)
    text = re.sub(r"saddlebrooke", "Saddlebrooke", text)
    text = re.sub(r"TontitownGrape", "Tontitown Grape", text)
    text = re.sub(r"AbbsWinston", "Abbs Winston", text)
    text = re.sub(r"ShaunKing", "Shaun King", text)
    text = re.sub(r"MeekMill", "Meek Mill", text)
    text = re.sub(r"TornadoGiveaway", "Tornado Giveaway", text)
    text = re.sub(r"GRupdates", "GR updates", text)
    text = re.sub(r"SouthDowns", "South Downs", text)
    text = re.sub(r"braininjury", "brain injury", text)
    text = re.sub(r"auspol", "Australian politics", text)
    text = re.sub(r"PlannedParenthood", "Planned Parenthood", text)
    text = re.sub(r"calgaryweather", "Calgary Weather", text)
    text = re.sub(r"weallheartonedirection", "we all heart one direction", text)
    text = re.sub(r"edsheeran", "Ed Sheeran", text)
    text = re.sub(r"TrueHeroes", "True Heroes", text)
    text = re.sub(r"S3XLEAK", "sex leak", text)
    text = re.sub(r"ComplexMag", "Complex Magazine", text)
    text = re.sub(r"TheAdvocateMag", "The Advocate Magazine", text)
    text = re.sub(r"CityofCalgary", "City of Calgary", text)
    text = re.sub(r"EbolaOutbreak", "Ebola Outbreak", text)
    text = re.sub(r"SummerFate", "Summer Fate", text)
    text = re.sub(r"RAmag", "Royal Academy Magazine", text)
    text = re.sub(r"offers2go", "offers to go", text)
    text = re.sub(r"foodscare", "food scare", text)
    text = re.sub(r"MNPDNashville", "Metropolitan Nashville Police Department", text)
    text = re.sub(r"TfLBusAlerts", "TfL Bus Alerts", text)
    text = re.sub(r"GamerGate", "Gamer Gate", text)
    text = re.sub(r"IHHen", "Humanitarian Relief", text)
    text = re.sub(r"spinningbot", "spinning bot", text)
    text = re.sub(r"ModiMinistry", "Modi Ministry", text)
    text = re.sub(r"TAXIWAYS", "taxi ways", text)
    text = re.sub(r"Calum5SOS", "Calum Hood", text)
    text = re.sub(r"po_st", "po.st", text)
    text = re.sub(r"scoopit", "scoop.it", text)
    text = re.sub(r"UltimaLucha", "Ultima Lucha", text)
    text = re.sub(r"JonathanFerrell", "Jonathan Ferrell", text)
    text = re.sub(r"aria_ahrary", "Aria Ahrary", text)
    text = re.sub(r"rapidcity", "Rapid City", text)
    text = re.sub(r"OutBid", "outbid", text)
    text = re.sub(r"lavenderpoetrycafe", "lavender poetry cafe", text)
    text = re.sub(r"EudryLantiqua", "Eudry Lantiqua", text)
    text = re.sub(r"15PM", "15 PM", text)
    text = re.sub(r"OriginalFunko", "Funko", text)
    text = re.sub(r"rightwaystan", "Richard Tan", text)
    text = re.sub(r"CindyNoonan", "Cindy Noonan", text)
    text = re.sub(r"RT_America", "RT America", text)
    text = re.sub(r"narendramodi", "Narendra Modi", text)
    text = re.sub(r"BakeOffFriends", "Bake Off Friends", text)
    text = re.sub(r"TeamHendrick", "Hendrick Motorsports", text)
    text = re.sub(r"alexbelloli", "Alex Belloli", text)
    text = re.sub(r"itsjustinstuart", "Justin Stuart", text)
    text = re.sub(r"gunsense", "gun sense", text)
    text = re.sub(r"DebateQuestionsWeWantToHear", "debate questions we want to hear", text)
    text = re.sub(r"RoyalCarribean", "Royal Carribean", text)
    text = re.sub(r"samanthaturne19", "Samantha Turner", text)
    text = re.sub(r"JonVoyage", "Jon Stewart", text)
    text = re.sub(r"renew911health", "renew 911 health", text)
    text = re.sub(r"SuryaRay", "Surya Ray", text)
    text = re.sub(r"pattonoswalt", "Patton Oswalt", text)
    text = re.sub(r"minhazmerchant", "Minhaz Merchant", text)
    text = re.sub(r"TLVFaces", "Israel Diaspora Coalition", text)
    text = re.sub(r"pmarca", "Marc Andreessen", text)
    text = re.sub(r"pdx911", "Portland Police", text)
    text = re.sub(r"jamaicaplain", "Jamaica Plain", text)
    text = re.sub(r"Japton", "Arkansas", text)
    text = re.sub(r"RouteComplex", "Route Complex", text)
    text = re.sub(r"INSubcontinent", "Indian Subcontinent", text)
    text = re.sub(r"NJTurnpike", "New Jersey Turnpike", text)
    text = re.sub(r"Politifiact", "PolitiFact", text)
    text = re.sub(r"Hiroshima70", "Hiroshima", text)
    text = re.sub(r"GMMBC", "Greater Mt Moriah Baptist Church", text)
    text = re.sub(r"versethe", "verse the", text)
    text = re.sub(r"TubeStrike", "Tube Strike", text)
    text = re.sub(r"MissionHills", "Mission Hills", text)
    text = re.sub(r"ProtectDenaliWolves", "Protect Denali Wolves", text)
    text = re.sub(r"NANKANA", "Nankana", text)
    text = re.sub(r"SAHIB", "Sahib", text)
    text = re.sub(r"PAKPATTAN", "Pakpattan", text)
    text = re.sub(r"Newz_Sacramento", "News Sacramento", text)
    text = re.sub(r"gofundme", "go fund me", text)
    text = re.sub(r"pmharper", "Stephen Harper", text)
    text = re.sub(r"IvanBerroa", "Ivan Berroa", text)
    text = re.sub(r"LosDelSonido", "Los Del Sonido", text)
    text = re.sub(r"bancodeseries", "banco de series", text)
    text = re.sub(r"timkaine", "Tim Kaine", text)
    text = re.sub(r"IdentityTheft", "Identity Theft", text)
    text = re.sub(r"AllLivesMatter", "All Lives Matter", text)
    text = re.sub(r"mishacollins", "Misha Collins", text)
    text = re.sub(r"BillNeelyNBC", "Bill Neely", text)
    text = re.sub(r"BeClearOnCancer", "be clear on cancer", text)
    text = re.sub(r"Kowing", "Knowing", text)
    text = re.sub(r"ScreamQueens", "Scream Queens", text)
    text = re.sub(r"AskCharley", "Ask Charley", text)
    
    # Urls
    text = re.sub(r"https?:\/\/t.co\/[A-Za-z0-9]+", "", text)
        
    # Words with punctuations and special characters
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`"
    for p in punctuations:
        text = text.replace(p, f' {p} ')
        
    # ... and ..
    text = text.replace('...', ' ... ')
    if '...' not in text:
        text = text.replace('..', ' ... ')      
        
    # Acronyms
    text = re.sub(r"MH370", "Malaysia Airlines Flight 370", text)
    text = re.sub(r"mÌ¼sica", "music", text)
    text = re.sub(r"okwx", "Oklahoma City Weather", text)
    text = re.sub(r"arwx", "Arkansas Weather", text)    
    text = re.sub(r"gawx", "Georgia Weather", text)  
    text = re.sub(r"scwx", "South Carolina Weather", text)  
    text = re.sub(r"cawx", "California Weather", text)
    text = re.sub(r"tnwx", "Tennessee Weather", text)
    text = re.sub(r"azwx", "Arizona Weather", text)  
    text = re.sub(r"alwx", "Alabama Weather", text)
    text = re.sub(r"wordpressdotcom", "wordpress", text)    
    text = re.sub(r"usNWSgov", "United States National Weather Service", text)
    text = re.sub(r"Suruc", "Sanliurfa", text)   
    
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    
    return text

In [7]:
%%time

data['text'] = data['text'].apply(clean)

CPU times: user 3min 58s, sys: 1.54 s, total: 3min 59s
Wall time: 3min 59s


# Preprocessing

Assigns the 'text' column to the variable X_data and the 'target' column to the variable y_data.

In [8]:
data.loc[data['target'] == 4, 'target'] = 1

In [9]:
X_data = data['text']
y_data = data['target']

X_data

0            switchfoot http      twitpic  com      awww ...
1          upset cannot updat facebook text       might c...
2            kenichan dive mani time ball   manag save   ...
3                           whole bodi feel itchi like fire 
4            nationwideclass behav   i mad     cannot see   
                                 ...                        
1599995                        woke   school best feel ever 
1599996    thewdb  com    cool hear old walt interview   ...
1599997                      readi mojo makeov   ask detail 
1599998    happi  birthday boo alll time       tupac amar...
1599999    happi   charitytuesday   thenspcc   sparkschar...
Name: text, Length: 1600000, dtype: object

In [20]:
X_truncated, X_discarded, y_truncated, y_discarded = train_test_split(X_data, y_data, train_size=0.08, stratify=y_data, random_state=42)

Data is divided in train and test

In [21]:
x_train, x_test, y_train, y_test = train_test_split(X_data, y_data, random_state=42)
print(len(x_train), len(y_train))
print(len(x_test), len(y_test))

1200000 1200000
400000 400000


In [22]:
x_train_trunc, x_test_trunc, y_train_trunc, y_test_trunc = train_test_split(X_truncated, y_truncated, random_state=42)
print(len(x_train_trunc), len(y_train_trunc))
print(len(x_test_trunc), len(y_test_trunc))

96000 96000
32000 32000


## Count Vectorizer and Stop Words

The **CountVectorizer** operates on each individual text and performs the following steps:

* **Tokenization**: It splits each document into individual words or terms, which are also referred to as tokens. 
* **Counting**: It counts the occurrence of each token in each document and creates a matrix where columns represent unique tokens.
* **Vectorization**: It assigns a numerical value (count) to each token in each document, indicating how many times the token appears in that document.
* **Vocabulary** **Creation**: It builds a vocabulary of unique tokens based on the training data. Each token corresponds to a specific column in the matrix.
* **Transforming** **Test** **Data**: When applied to test data, the CountVectorizer uses the learned vocabulary from the training data and creates the matrix of token counts using the same columns as in the training matrix.

**Stop words** are a set of commonly used words in a language that are considered insignificant and are often removed during text preprocessing or natural language processing tasks. These words are filtered out because they typically do not carry much meaning or contribute significantly to the overall understanding of the text. Examples of stop words in English include "the," "is," "and," "a," "an," and so on.

In [13]:
count = CountVectorizer(stop_words='english', ngram_range=(1,5))
x_train_vectorizer=count.fit_transform(x_train)
x_test_vectorizer=count.transform(x_test)

In [14]:
count.vocabulary_

{'nathyeah': 9468685,
 'dame': 2891228,
 'nathyeah dame': 9468686,
 'boburnham': 1438620,
 'thesaurus': 13861185,
 'world': 15995103,
 'sale': 11942029,
 'end': 3829507,
 'boburnham thesaurus': 1438646,
 'thesaurus world': 13861202,
 'world sale': 16003462,
 'sale end': 11942671,
 'boburnham thesaurus world': 1438647,
 'thesaurus world sale': 13861203,
 'world sale end': 16003463,
 'boburnham thesaurus world sale': 1438648,
 'thesaurus world sale end': 13861204,
 'boburnham thesaurus world sale end': 1438649,
 'jojoalexand': 7057980,
 'ight': 6554051,
 'let': 7697576,
 'lil': 7890659,
 'white': 15677926,
 'boy': 1549032,
 'know': 7355440,
 'hahaha': 5632379,
 'jojoalexand ight': 7057981,
 'ight let': 6554101,
 'let lil': 7710510,
 'lil white': 7896275,
 'white boy': 15678337,
 'boy know': 1554319,
 'know hahaha': 7379516,
 'jojoalexand ight let': 7057982,
 'ight let lil': 6554102,
 'let lil white': 7710517,
 'lil white boy': 7896276,
 'white boy know': 15678345,
 'boy know hahaha': 155

In [23]:
count_trunc = CountVectorizer(stop_words='english', ngram_range=(1,5))
x_train_trunc_vectorizer=count_trunc.fit_transform(x_train_trunc)
x_test_trunc_vectorizer=count_trunc.transform(x_test_trunc)

# Models

* [**XGBoost**](##XGBoost)
* [**Keras**](##Keras)

## XGBoost

XGBoost (Extreme Gradient Boosting) is a powerful machine learning algorithm used for classification and regression tasks. It combines multiple weak models, typically decision trees, to create a stronger and more accurate model.

Despite the intention to employ a neural network for text classification, an XGBoost model has been trained to evaluate its performance and compare it with the results of the neural network.

XGBoost Training

In [None]:
xgb_model=xgb.XGBClassifier(
        learning_rate=0.1,
        max_depth=7,
        n_estimators=80,
        use_label_encoder=False,
        eval_metric='auc' )



### XGBoost with vectorizer data

In [None]:
xgb_model_vectorizer = xgb_model.fit(x_train_trunc_vectorizer, y_train_trunc)
xgb_predictions_vectorizer=xgb_model_vectorizer.predict(x_test_trunc_vectorizer)
print(confusion_matrix(y_test_trunc,xgb_predictions_vectorizer))
print (classification_report(y_test_trunc, xgb_predictions_vectorizer))

[[10688  9219]
 [ 2670 17423]]
              precision    recall  f1-score   support

           0       0.80      0.54      0.64     19907
           1       0.65      0.87      0.75     20093

    accuracy                           0.70     40000
   macro avg       0.73      0.70      0.69     40000
weighted avg       0.73      0.70      0.69     40000



## Keras

Text data is being processed using the Keras library.

In [24]:
max_words = 50000
max_len = 300
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(x_train_trunc)
sequences = tokenizer.texts_to_sequences(x_train_trunc)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

## Arquitecture

* **Embedding Layer:** This layer converts input words into dense vectors of fixed length. The vocabulary size is set to 50,000 words, and each word is represented as a 100-dimensional vector. The input size of this layer is a maximum length of 300 words.

* **Spatial Dropout1D Layer:** This layer applies dropout to prevent overfitting. It randomly drops out entire channels (feature maps) instead of individual neurons. In this case, 20% of the outputs from the Embedding layer are randomly set to 0.

* **LSTM Layer:** This layer utilizes LSTM (Long Short-Term Memory) units to model the sequence of words in the text. Each LSTM unit has 100 memory cells and can capture long-term patterns in sequential data. The LSTM layer also applies dropout with a rate of 20% on the recurrent connections to prevent overfitting.

* **Dense Layer:** This is the output layer of the model, consisting of a single neuron with a sigmoid activation function. It produces an output between 0 and 1, representing the probability of the text instance belonging to a particular class (e.g., positive or negative).

In [25]:
model = Sequential()
model.add(Embedding(max_words, 100, input_length=max_len))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.summary()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

2024-03-10 15:09:57.662229: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2024-03-10 15:09:57.662270: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-03-10 15:09:57.662285: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-03-10 15:09:57.662351: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-03-10 15:09:57.662374: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 100)          5000000   
                                                                 
 spatial_dropout1d (Spatial  (None, 300, 100)          0         
 Dropout1D)                                                      
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 5080501 (19.38 MB)
Trainable params: 5080501 (19.38 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________




## Training

**EarlyStopping:**
* monitor='val_accuracy': It monitors the validation accuracy during training.
* mode='max': It maximizes the monitored metric (validation accuracy in this case).
* patience=5: It specifies the number of epochs to wait before stopping the training process if the monitored metric doesn't improve.

<br>

**ModelCheckpoint:**
* filepath='./keras': It specifies the path and filename to save the model weights.
* save_weights_only=True: It indicates that only the weights of the best model will be saved, not the entire model.
* monitor='val_accuracy': It monitors the validation accuracy during training.
* mode='max': It maximizes the monitored metric (validation accuracy in this case).
* save_best_only=True: It saves only the weights of the best model based on the monitored metric.


In [26]:
stop = EarlyStopping(
    monitor='val_accuracy', 
    mode='max',
    patience=5
)

checkpoint= ModelCheckpoint(
    filepath='./keras',
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

In [27]:
history=model.fit(sequences_matrix,y_train_trunc,batch_size=1024,epochs=10,
            validation_split=0.2,callbacks=[stop,checkpoint])

Epoch 1/10


2024-03-10 15:10:03.855780: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2024-03-10 17:48:27.623543: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


## Evaluation

In [28]:
test_sequences = tokenizer.texts_to_sequences(x_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)

In [29]:
accr = model.evaluate(test_sequences_matrix,y_test)



In [30]:
lstm_prediction=model.predict(test_sequences_matrix)

2024-03-11 22:57:20.583270: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




In [1]:
res=[]
for prediction in lstm_prediction:
    if prediction[0]<0.5:
        res.append(0)
    else:
        res.append(1)

NameError: name 'lstm_prediction' is not defined

In [2]:
print(confusion_matrix(y_test,res))Ç

SyntaxError: invalid syntax (3125813826.py, line 1)

In [3]:
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

NameError: name 'pickle' is not defined

In [4]:
model.save("sentiment_model.h5")

NameError: name 'model' is not defined

# Real Test

In [None]:
load_model=keras.models.load_model("./sentiment_model.h5")
with open('tokenizer.pickle', 'rb') as handle:
    load_tokenizer = pickle.load(handle)

In [None]:
test = 'I hate you idiot'

test=[clean(test)]
print(test)
seq = load_tokenizer.texts_to_sequences(test)
padded = sequence.pad_sequences(seq, maxlen=300)
print(seq)
pred = load_model.predict(padded)
print("pred", pred)
if pred<0.3:
    print("positive")
else:
    print("negative")