# Tutorial

In [2]:
import sys
from pathlib import Path

# Make the `disaster_tweets` package importable without a machine-specific
# absolute path. The data paths used in this notebook are written as
# "../disaster_tweets/...", so the repository root is taken to be the parent
# of the directory the notebook is run from.
REPO_ROOT = Path.cwd().parent
if str(REPO_ROOT) not in sys.path:
    # Guarded so that re-running the cell does not stack duplicate entries.
    sys.path.insert(0, str(REPO_ROOT))

![architecture](./architecture.png)

# Database

In [3]:
import pandas as pd

# Read the two CSV splits; per the previews below, only the training
# frame carries a `target` column.
df_train = pd.read_csv("../disaster_tweets/data/train.csv")
df_test = pd.read_csv("../disaster_tweets/data/test.csv")

# Preview the first rows of each frame, train first and then test.
for frame in (df_train, df_test):
    display(frame.head())

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


# Processor

In [4]:
from disaster_tweets.data_loader.data_loaders import BasicDataPreprocessor

# Run the preprocessor over two sample sentences and print what it yields
# for each one (one list of tokens per sentence, as shown in the output).
processor = BasicDataPreprocessor()
sample_sentences = ["hello how are you?", "I am fine thank you!"]
for tokens in processor(sample_sentences):
    print(tokens)

['hello', 'how', 'are', 'you']
['i', 'am', 'fine', 'thank', 'you']


# Vocabulary

In [5]:
from disaster_tweets.data_loader.data_loaders import VocabBuilder

# Build the vocabulary by handing the training CSV path and a fresh
# preprocessor to VocabBuilder.from_iterator.
train_csv_path = "../disaster_tweets/data/train.csv"
processor = BasicDataPreprocessor()
vocabulary = VocabBuilder.from_iterator(train_csv_path, processor)

In [6]:
# Look up the ids the vocabulary assigns to two tokens, then report how many
# entries the vocabulary holds.
token_ids = vocabulary(["fire", "hello"])
vocabulary_size = len(vocabulary)
print(token_ids)
print(vocabulary_size)

[42, 1485]
16102


# Dataset

In [7]:
from disaster_tweets.data_loader.data_loaders import TweetDataset

# Everything the dataset needs is defined in this cell: the CSV path, the
# preprocessor, and a vocabulary built from that same path and preprocessor.
# The vocabulary is built with `data_preprocessor` and `csv_path` so the cell
# does not depend on a `processor` variable left over from an earlier cell.
csv_path = "../disaster_tweets/data/train.csv"
data_preprocessor = BasicDataPreprocessor()
vocabulary = VocabBuilder.from_iterator(csv_path, data_preprocessor)

dataset = TweetDataset(csv_path, data_preprocessor, vocabulary)

In [14]:
# Report the dataset size, then pull the first two components of item 0.
print(len(dataset))
label, sentence_vector = dataset[0][0], dataset[0][1]

print(label)
print(sentence_vector, len(sentence_vector))

# For comparison, display the tokens the preprocessor yields for the first
# raw tweet of the training frame.
first_tweet_text = df_train.text[0]
list(data_preprocessor([first_tweet_text]))

7613
tensor(1)
tensor([ 108, 4946,   20,    1,  888,    5,   18,  239,  125, 1688, 5195,   69,
          39]) 13


[['our',
  'deeds',
  'are',
  'the',
  'reason',
  'of',
  'this',
  'earthquake',
  'may',
  'allah',
  'forgive',
  'us',
  'all']]

# DataLoader

In [33]:
from disaster_tweets.data_loader.data_loaders import tweet_data_loader

# Wrap the dataset in a shuffled loader with 8 items per batch.
data_loader = tweet_data_loader(dataset, batch_size=8, shuffle=True)

# Print only the first batch and stop. The printed batch is a 3-tuple of
# tensors; because shuffling is on, its contents differ from run to run.
for first_batch in data_loader:
    print(first_batch)
    break

(tensor([[1.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.]]), tensor([   69,   178,   374,   285,  3312,    51,    76,   489,     5,  1568,
           10,   104,    17,  3057, 14195, 15978,  5473,     3,  2293,    25,
            1,    42,   639,  6420,    65,     1,   328,     6,   434,   753,
           89,  3036, 15808,   188,    93,  8691,  1952,     4,    12, 10321,
          894,    52,    81,  2510,     8,   191,    30,     7,    61,   140,
            4,    21,   439,    13,   116,   714,  2812,    72,   254,   726,
          262,   264,   301,  1550,  1556,   417,  1388,  7642,   469,     5,
            1,    48,  3632,  5236,   233,     9,  7572,  1008, 15410,    15,
          540,    22,     2,   525,  1457,    24,   588,    30,   470,  5958,
          753,  4330,    17,   656,   448,   123,   774,   194,  3122,    25,
         3375,   257,  2588,    63]), tensor([ 0, 15, 35, 36, 56, 65, 79, 89]))


# Executing our first training

In [None]:
# Terminal commands, not Python: run them in a shell rather than in this
# kernel. The relative paths suggest they are meant to be run from the
# repository root (TODO confirm). The first activates the tox virtualenv;
# the second starts training with the given JSON config.
>> source .tox/disaster_tweets/bin/activate 
>> PYTHONPATH=. python disaster_tweets/train.py -c disaster_tweets/config.json