# Trabalho de RnR - Gerando Tweets com LSTMs

## Detalhes

- **Dataset:** https://www.kaggle.com/kulgen/elon-musks-tweets
- **Aluno:** Plínio Larrubia Ferreira de Moura


## 1 Obtenção dos dados


### Importação das Bibliotecas


In [79]:
!pip install keras --upgrade
!pip install matplotlib --upgrade
!pip install numpy --upgrade
!pip install pandas --upgrade
!pip install seaborn --upgrade
!pip install tensorflow --upgrade




In [80]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sys


### Carregando Base de Dados


In [81]:
# Optional Google Colab setup, kept disabled: uncommenting these two lines
# would import `drive` from `google.colab` and mount it at '/content/drive'.
# from google.colab import drive
# drive.mount('/content/drive')


In [82]:
# Read the tweet CSV (path relative to the working directory), decoding the
# file as Latin-1.
elon_base = pd.read_csv(
    'data/data_elonmusk.csv',
    encoding='latin1',
)


## 2 Análise Exploratória


In [83]:
# Preview the first five rows of the dataset.
elon_base.head(n=5)


Unnamed: 0,row ID,Tweet,Time,Retweet from,User
0,Row0,@MeltingIce Assuming max acceleration of 2 to ...,2017-09-29 17:39:19,,elonmusk
1,Row1,RT @SpaceX: BFR is capable of transporting sat...,2017-09-29 10:44:54,SpaceX,elonmusk
2,Row2,@bigajm Yup :),2017-09-29 10:39:57,,elonmusk
3,Row3,Part 2 https://t.co/8Fvu57muhM,2017-09-29 09:56:12,,elonmusk
4,Row4,Fly to most places on Earth in under 30 mins a...,2017-09-29 09:19:21,,elonmusk


In [84]:
# Preview the last five rows (all columns) of the dataset.
elon_base.tail(5)


Unnamed: 0,row ID,Tweet,Time,Retweet from,User
3213,Row3213,"@YOUSRC Amos's article was fair, but his edito...",2012-11-20 08:52:03,,elonmusk
3214,Row3214,These articles in Space News describe why Aria...,2012-11-20 08:38:31,,elonmusk
3215,Row3215,Was misquoted by BBC as saying Europe's rocket...,2012-11-20 08:30:44,,elonmusk
3216,Row3216,Just returned from a trip to London and Oxford...,2012-11-19 08:59:46,,elonmusk
3217,Row3217,RT @Jon_Favreau: My Model S just arrived and I...,2012-11-16 17:59:47,Jon_Favreau,elonmusk


In [85]:
# Distinct values of the 'User' column together with how often each occurs.
np.unique(elon_base.loc[:, 'User'], return_counts=True)


(array(['elonmusk'], dtype=object), array([3218]))

In [86]:
# Print the column names, non-null counts and dtypes of the dataset.
elon_base.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3218 entries, 0 to 3217
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   row ID        3218 non-null   object
 1   Tweet         3218 non-null   object
 2   Time          3218 non-null   object
 3   Retweet from  525 non-null    object
 4   User          3218 non-null   object
dtypes: object(5)
memory usage: 125.8+ KB


In [87]:
# Summary statistics per column (count / unique / top / freq for these
# object-typed columns).
elon_base.describe()


Unnamed: 0,row ID,Tweet,Time,Retweet from,User
count,3218,3218,3218,525,3218
unique,3218,3216,3217,201,1
top,Row0,RT @SpaceX: Webcast of Falcon 9 launch is now ...,2013-07-24 10:49:13,SpaceX,elonmusk
freq,1,2,2,109,3218


In [88]:
# Distinct values of the 'Retweet from' column (missing values included).
retweet_values = pd.unique(elon_base['Retweet from'])
retweet_values


array([nan, 'SpaceX', 'mayemusk', 'MotorTrend', 'FastCompany', 'Gizmodo',
       'Tesla', 'karpathy', 'Hyperloop', 'Teslarati', 'cmleahey', 'NASA',
       'OpenAI', 'mkarolian', 'guardianeco', 'rj8w', 'TheOnion',
       'tanyaofmars', 'mcannonbrookes', 'BBCWorld', 'HuffPostAU',
       'justAfanDavid', 'cnntech', 'jovanik21', 'ElectrekCo',
       'businessinsider', 'newscientist', 'RobertIger', 'TheMarsSociety',
       'insideclimate', 'GeorgeTakei', 'waitbutwhy', 'creepypuppet',
       'wcax', 'business', 'MrJc2012', 'mashable', '350', 'XHNews',
       'realamberheard', 'AAAauto', 'ProfBrianCox', 'PokerVixen',
       'RH_Way', 'morrisonbrett', 'arctechinc', 'Herifin_teki',
       'RicardoTwumasi', 'Shkottt', 'danieltadros', 'bobbykansara',
       'ryanrossuk', 'verge', 'SteveCase', 'WIRED', 'techinsider',
       'neiltyson', 'tsrandall', 'BBC_TopGear', 'IridiumComm', 'thedrive',
       'TechCrunch', 'garrett_bauman', 'TheEconomist', 'NatGeo',
       'RealRonHoward', 'ggreenwald', 'jody

### Retweets da SpaceX


In [89]:
# Group by the column label itself (not a one-element list) so each group is
# keyed by a plain string such as 'SpaceX'. Grouping by a length-1 list makes
# recent pandas expect a length-1 tuple in `get_group`, which a scalar lookup
# like `get_group('SpaceX')` does not satisfy.
by_retweets = elon_base.groupby('Retweet from')


In [90]:
# Tweets belonging to the 'SpaceX' group, shown as a one-column DataFrame.
by_retweets['Tweet'].get_group(name='SpaceX').to_frame()


Unnamed: 0,Tweet
1,RT @SpaceX: BFR is capable of transporting sat...
5,RT @SpaceX: Supporting the creation of a perma...
10,"RT @SpaceX: Nine years ago today, Falcon 1 bec..."
44,RT @SpaceX: More photos from today?s Falcon 9 ...
110,RT @SpaceX: Successful deployment of FORMOSAT-...
...,...
2699,RT @SpaceX: [PHOTO] Falcon 9 and SES-8 liftoff...
2729,RT @SpaceX: Falcon 9 launch window opens tmrw ...
2733,RT @SpaceX: Falcon 9 targeted to launch SES-8 ...
2900,RT @SpaceX: [WATCH] SpaceX?s 5.2m satellite fa...


## 3 Preparação dos Dados


In [91]:
# Raw tweet texts taken from the 'Tweet' column, one entry per dataset row.
tweets_data = elon_base.loc[:, 'Tweet'].values
tweets_data

array(["@MeltingIce Assuming max acceleration of 2 to 3 g's, but in a comfortable direction. Will feel like a mild to moder? https://t.co/fpjmEgrHfC",
       'RT @SpaceX: BFR is capable of transporting satellites to orbit, crew and cargo to the @Space_Station and completing missions to the Moon an?',
       '@bigajm Yup :)', ...,
       "Was misquoted by BBC as saying Europe's rocket has no chance. Just said the [Franco-German] Ariane 5 has no chance, so go with Ariane 6.",
       'Just returned from a trip to London and Oxford, where I met with many interesting people. I really like Britain!',
       'RT @Jon_Favreau: My Model S just arrived and I went electric like Dylan! #FF @TeslaMotors @elonmusk'],
      dtype=object)

In [92]:
# Concatenate every tweet into a single corpus string, newline-separated.
dataset_text = '\n'.join(tweets_data.tolist())


In [93]:
# Vocabulary: every distinct character of the corpus, in sorted order.
characteres = sorted(set(dataset_text))
print('{} unique characters'.format(len(characteres)))


97 unique characters


In [94]:
# Display the full sorted vocabulary of characters.
characteres

['\n',
 ' ',
 '!',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 ']',
 '^',
 '_',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '|',
 '~',
 '\xa0',
 '°',
 'Ü',
 'ä',
 'é']

In [95]:
# Lookup table from each character to its position in the sorted vocabulary.
char2idx = dict(zip(characteres, range(len(characteres))))
char2idx

{'\n': 0,
 ' ': 1,
 '!': 2,
 '#': 3,
 '$': 4,
 '%': 5,
 '&': 6,
 "'": 7,
 '(': 8,
 ')': 9,
 '*': 10,
 '+': 11,
 ',': 12,
 '-': 13,
 '.': 14,
 '/': 15,
 '0': 16,
 '1': 17,
 '2': 18,
 '3': 19,
 '4': 20,
 '5': 21,
 '6': 22,
 '7': 23,
 '8': 24,
 '9': 25,
 ':': 26,
 ';': 27,
 '<': 28,
 '=': 29,
 '>': 30,
 '?': 31,
 '@': 32,
 'A': 33,
 'B': 34,
 'C': 35,
 'D': 36,
 'E': 37,
 'F': 38,
 'G': 39,
 'H': 40,
 'I': 41,
 'J': 42,
 'K': 43,
 'L': 44,
 'M': 45,
 'N': 46,
 'O': 47,
 'P': 48,
 'Q': 49,
 'R': 50,
 'S': 51,
 'T': 52,
 'U': 53,
 'V': 54,
 'W': 55,
 'X': 56,
 'Y': 57,
 'Z': 58,
 '[': 59,
 ']': 60,
 '^': 61,
 '_': 62,
 '`': 63,
 'a': 64,
 'b': 65,
 'c': 66,
 'd': 67,
 'e': 68,
 'f': 69,
 'g': 70,
 'h': 71,
 'i': 72,
 'j': 73,
 'k': 74,
 'l': 75,
 'm': 76,
 'n': 77,
 'o': 78,
 'p': 79,
 'q': 80,
 'r': 81,
 's': 82,
 't': 83,
 'u': 84,
 'v': 85,
 'w': 86,
 'x': 87,
 'y': 88,
 'z': 89,
 '|': 90,
 '~': 91,
 '\xa0': 92,
 '°': 93,
 'Ü': 94,
 'ä': 95,
 'é': 96}

In [96]:
# Inverse lookup: position i of this array holds the character with index i.
idx2char = np.asarray(characteres)
idx2char

array(['\n', ' ', '!', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',',
       '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F',
       'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S',
       'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '^', '_', '`', 'a',
       'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
       'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|',
       '~', '\xa0', '°', 'Ü', 'ä', 'é'], dtype='<U1')

In [97]:
# Sanity check: look one character up in both directions.
(char2idx['é'], idx2char[96])

(96, 'é')

In [98]:
# Encode the whole corpus as integers through the char -> index table.
text_as_int = np.array(list(map(char2idx.__getitem__, dataset_text)))
text_as_int.shape

(300787,)

In [99]:
# Show the first 15 characters of the corpus next to their integer encoding.
print(
    '{} characters mapped to int ---> {}'.format(
        repr(dataset_text[:15]),
        text_as_int[:15],
    )
)

'@MeltingIce Ass' characters mapped to int ---> [32 45 68 75 83 72 77 70 41 66 68  1 33 82 82]


## 4 Criação dos exemplos de treinamento e batches

## 5 Aplicação de Redes Neurais Recorrentes (LSTM)


## 6 Apresentação dos resultados
