<a href="https://colab.research.google.com/github/Koks-creator/TwitterAnalysis/blob/main/TwitterRNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install keras==2.4.3 tensorflow==2.3.1 numpy==1.18.5

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from google.colab import files

# Prompt for a file upload through the Colab widget (kaggle.json is expected).
uploaded = files.upload()
# Move the uploaded kaggle.json into ~/.kaggle/ and restrict it to
# owner read/write (chmod 600).
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [None]:
# Download the "nlp-getting-started" competition archive with the Kaggle CLI.
!kaggle competitions download -c nlp-getting-started

nlp-getting-started.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
import os, glob
from zipfile import ZipFile

# Unpack the competition archive into the current working directory.
# The handle is deliberately not called `zip`: at notebook top level,
# `with ... as zip` would rebind the builtin zip() for every later cell.
with ZipFile("/content/nlp-getting-started.zip") as archive:
  archive.extractall()

In [None]:
import re
from collections import Counter
import string
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus used by the preprocessing step.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Load the training CSV and keep only the tweet text and its label column.
df = pd.read_csv("/content/train.csv").loc[:, ["text", "target"]]

In [None]:
# Peek at the first five tweets before any cleaning is applied.
df["text"].head(n=5)

0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    Just got sent this photo from Ruby #Alaska as ...
Name: text, dtype: object

In [None]:
# Preprocessing: English stop-word set from the NLTK corpus, consulted by
# DataCleaning.remove_stopword (a set gives cheap membership tests).
stop_words = set(stopwords.words('english'))

class DataCleaning:
    """Stateless text-cleaning helpers for raw tweet strings.

    Every method is a ``staticmethod`` that takes one string and returns the
    cleaned string, so each can be handed directly to ``Series.apply``.
    """

    # Patterns and the translation table are built once at class creation
    # instead of being recompiled on every call.
    _HTML_TAG_RE = re.compile("<.*?>")
    # A URL is the scheme followed by every contiguous non-whitespace char.
    # The previous pattern used unescaped "." between \S+ groups; "." also
    # matches a space, so up to two words following a URL were deleted too.
    _URL_RE = re.compile(r"https?://\S+")
    # One non-ASCII character plus, if present, a single space right after it.
    _NON_ASCII_RE = re.compile(r"[^\x00-\x7f][ ]?")
    _PUNCT_TABLE = str.maketrans("", "", string.punctuation)

    @staticmethod
    def remove_html_tags(raw_text: str) -> str:
        """Return ``raw_text`` with every ``<...>`` span (non-greedy) removed."""
        return DataCleaning._HTML_TAG_RE.sub('', raw_text)

    @staticmethod
    def remove_url(text: str) -> str:
        """Return ``text`` with each http:// or https:// URL removed.

        Only the URL itself (up to the next whitespace) is stripped; the
        surrounding words and spaces are left untouched.
        """
        return DataCleaning._URL_RE.sub(r"", text)

    @staticmethod
    def remove_punct(text: str) -> str:
        """Return ``text`` with every ``string.punctuation`` character deleted."""
        return text.translate(DataCleaning._PUNCT_TABLE)

    @staticmethod
    def remove_non_ascii(text: str) -> str:
        """Return ``text`` with non-ASCII characters removed.

        A single space directly following a removed character is dropped too.
        """
        # NOTE(review): because of the optional trailing space, a non-ASCII
        # character at the end of a word also eats the separator
        # ("café au" -> "cafau"). Kept as-is to preserve existing output.
        return DataCleaning._NON_ASCII_RE.sub(r"", text)

    @staticmethod
    def remove_stopword(text: str) -> str:
        """Lower-case ``text`` and drop words found in the global ``stop_words``.

        The text is split on whitespace and the surviving words are re-joined
        with single spaces.
        """
        lowered_words = (word.lower() for word in text.split())
        return " ".join(word for word in lowered_words if word not in stop_words)
    



In [None]:
pr = DataCleaning()

# Cleaning stages, applied to the tweet column in this exact order.
# NOTE(review): punctuation is stripped before stop-word filtering, so a
# contraction such as "don't" reaches remove_stopword as "dont" — confirm
# that ordering is intended.
cleaning_steps = (
    pr.remove_html_tags,
    pr.remove_url,
    pr.remove_punct,
    pr.remove_non_ascii,
    pr.remove_stopword,
)
for clean_step in cleaning_steps:
    df["text"] = df["text"].apply(clean_step)

In [None]:
# Spot-check the first 50 tweets after cleaning.
df["text"].head(n=50)

0          deeds reason earthquake may allah forgive us
1                 forest fire near la ronge sask canada
2     residents asked shelter place notified officer...
3     13000 people receive wildfires evacuation orde...
4     got sent photo ruby alaska smoke wildfires pou...
5     rockyfire update california hwy 20 closed dire...
6     flood disaster heavy rain causes flash floodin...
7                            im top hill see fire woods
8     theres emergency evacuation happening building...
9                         im afraid tornado coming area
10                      three people died heat wave far
11    haha south tampa getting flooded hah wait seco...
12    raining flooding florida tampabay tampa 18 19 ...
13                      flood bago myanmar arrived bago
14        damage school bus 80 multi car crash breaking
15                                            whats man
16                                          love fruits
17                                        summer

In [None]:
def count_words(col: pd.Series) -> Counter:
    """Tally how often each whitespace-separated token occurs in ``col``.

    Args:
        col: Series of strings; every entry is split on whitespace.

    Returns:
        Counter mapping each token to its number of occurrences across
        all entries of ``col``.
    """
    tally = Counter()
    for sentence in col.values:
        # Counter.update adds one per occurrence of each token.
        tally.update(sentence.split())
    return tally

In [None]:
counter = count_words(df["text"])

# Show only the most frequent tokens: echoing the whole Counter prints one
# line per vocabulary entry and buries the rest of the notebook.
counter.most_common(20)

Counter({'deeds': 2,
         'reason': 20,
         'earthquake': 46,
         'may': 88,
         'allah': 9,
         'forgive': 2,
         'us': 159,
         'forest': 65,
         'fire': 251,
         'near': 54,
         'la': 26,
         'ronge': 1,
         'sask': 1,
         'canada': 9,
         'residents': 8,
         'asked': 9,
         'shelter': 6,
         'place': 26,
         'notified': 1,
         'officers': 8,
         'evacuation': 50,
         'orders': 11,
         'expected': 15,
         '13000': 4,
         'people': 195,
         'receive': 2,
         'wildfires': 10,
         'california': 117,
         'got': 112,
         'sent': 13,
         'photo': 40,
         'ruby': 1,
         'alaska': 6,
         'smoke': 48,
         'pours': 1,
         'school': 66,
         'rockyfire': 4,
         'update': 37,
         'hwy': 9,
         '20': 26,
         'closed': 20,
         'directions': 1,
         'due': 31,
         'lake': 14,
         'cou

In [None]:
# Number of distinct tokens in the cleaned text of the whole frame; reused
# below as the Tokenizer's num_words and the Embedding layer's input size.
num_unique_words = len(counter)
num_unique_words

17320

In [None]:
# Positional 80/20 split: the first 80% of rows train, the rest validate
# (rows are not shuffled).
train_size = int(len(df) * 0.8)

# .copy() makes both frames independent of `df`, so columns can be added to
# them later without triggering pandas' SettingWithCopyWarning.
train_df = df.iloc[:train_size].copy()
val_df = df.iloc[train_size:].copy()

# Plain NumPy arrays of sentences and labels for the tokenizer and the model.
train_sents = train_df["text"].to_numpy()
train_labels = train_df["target"].to_numpy()
val_sents = val_df["text"].to_numpy()
val_labels = val_df["target"].to_numpy()

In [None]:
# Build the word index from the training sentences only; the validation
# text is never passed to fit_on_texts.
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(train_sents)

In [None]:
# word -> integer index mapping learned by the tokenizer.
words_index = tokenizer.word_index

In [None]:
# Convert every sentence into its sequence of integer word indices using
# the tokenizer fitted on the training sentences.
train_sequences = tokenizer.texts_to_sequences(train_sents)
val_sequences = tokenizer.texts_to_sequences(val_sents)

In [None]:
max_length = 15

# Both splits share the same settings: fixed length `max_length`, zeros
# added at the front, and over-long sequences cut from the front.
pad_options = dict(maxlen=max_length, padding="pre", truncating="pre")
train_padded = pad_sequences(train_sequences, **pad_options)
val_padded = pad_sequences(val_sequences, **pad_options)

train_padded.shape, val_padded.shape

((6090, 15), (1523, 15))

In [None]:
# One training example at each stage: cleaned text, index sequence, padded row.
print(train_sents[0], train_sequences[0], train_padded[0], sep="\n")

deeds reason earthquake may allah forgive us
[3667, 681, 293, 39, 1275, 3668, 13]
[   0    0    0    0    0    0    0    0 3667  681  293   39 1275 3668
   13]


In [None]:
# Inverse of the tokenizer's vocabulary: integer index -> word.
reverse_word_index = {idx: word for word, idx in words_index.items()}

In [None]:
# Preview the first 20 entries only; echoing the whole mapping prints one
# line per vocabulary word.
dict(list(reverse_word_index.items())[:20])

{1: 'like',
 2: 'amp',
 3: 'im',
 4: 'fire',
 5: 'get',
 6: 'new',
 7: 'dont',
 8: 'people',
 9: 'emergency',
 10: 'one',
 11: '2',
 12: 'news',
 13: 'us',
 14: 'disaster',
 15: 'video',
 16: 'body',
 17: 'burning',
 18: 'would',
 19: 'buildings',
 20: 'police',
 21: 'crash',
 22: 'first',
 23: 'california',
 24: 'still',
 25: 'man',
 26: 'got',
 27: 'know',
 28: 'day',
 29: 'back',
 30: 'going',
 31: 'two',
 32: 'time',
 33: 'full',
 34: 'accident',
 35: 'love',
 36: 'cant',
 37: 'world',
 38: 'nuclear',
 39: 'may',
 40: 'see',
 41: 'go',
 42: 'attack',
 43: 'many',
 44: '3',
 45: 'watch',
 46: 'collapse',
 47: 'dead',
 48: 'today',
 49: 'mass',
 50: 'car',
 51: 'want',
 52: 'good',
 53: 'years',
 54: 'work',
 55: 'train',
 56: 'last',
 57: 'think',
 58: 'u',
 59: 'families',
 60: 'rt',
 61: 'fires',
 62: 'could',
 63: 'say',
 64: 'hiroshima',
 65: 'death',
 66: 'hot',
 67: 'forest',
 68: 'life',
 69: 'way',
 70: 'need',
 71: 'legionnaires',
 72: 'killed',
 73: 'war',
 74: 'fatal',
 7

In [None]:
def decode_seq(seq):
  """Turn a sequence of word indices back into a space-separated string.

  Each index is looked up in the global ``reverse_word_index``; indices
  without an entry are rendered as "?".
  """
  words = []
  for index in seq:
    words.append(reverse_word_index.get(index, "?"))
  return " ".join(words)

In [None]:
# Binary classifier: embedding -> two stacked LSTMs -> dropout -> sigmoid unit.
# NOTE(review): the Keras LSTM docs list activation == tanh among the
# requirements for the cuDNN kernel, so relu here likely runs the slower
# generic implementation — confirm the activation choice is intentional.
model = keras.models.Sequential([
    # Maps each of the `max_length` word indices to a 64-dimensional vector.
    layers.Embedding(num_unique_words, 64, input_length=max_length),
    # First LSTM returns the full sequence so the second LSTM can consume it.
    layers.LSTM(32, return_sequences=True, activation='relu'),
    # Second LSTM returns only its final output.
    layers.LSTM(16, return_sequences=False, activation='relu'),
    layers.Dropout(0.2),
    # Single sigmoid output, paired with binary cross-entropy at compile time.
    layers.Dense(1, activation="sigmoid"),
])

model.summary()

Model: "sequential_22"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_22 (Embedding)     (None, 15, 64)            1108480   
_________________________________________________________________
lstm_55 (LSTM)               (None, 15, 32)            12416     
_________________________________________________________________
lstm_56 (LSTM)               (None, 16)                3136      
_________________________________________________________________
dropout_14 (Dropout)         (None, 16)                0         
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 17        
Total params: 1,124,049
Trainable params: 1,124,049
Non-trainable params: 0
_________________________________________________________________


In [None]:
# `learning_rate` is the documented argument name; `lr` is only a
# deprecated alias kept for backward compatibility.
optim = keras.optimizers.Adam(learning_rate=0.001)

# Binary cross-entropy matches the single sigmoid output; accuracy is
# reported during training and validation.
model.compile(
    optimizer=optim,
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

In [None]:
# Train for 6 epochs on the padded training sequences, passing the held-out
# split as validation data so validation metrics are reported per epoch.
model.fit(
    train_padded,
    train_labels,
    epochs=6,
    validation_data=(val_padded, val_labels)
)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<tensorflow.python.keras.callbacks.History at 0x7f449b251be0>

In [None]:
# Threshold the sigmoid outputs at 0.5 to get hard 0/1 labels as plain ints.
probabilities = model.predict(val_padded).ravel()
predictions = (probabilities > 0.5).astype(int).tolist()

# Spot-check one validation example: cleaned text, true label, prediction.
print(val_sents[14])

print(val_labels[14])
print(predictions[14])

# assign() returns a new frame, so nothing is written into a slice of `df`
# (the in-place column assignment raised SettingWithCopyWarning).
val_df = val_df.assign(Predictions=predictions)

# Keep the validation tweets whose true label is 1, filtering on val_df's own
# column; integer dtypes are preserved (where() + dropna() turned them into
# floats and relied on index alignment with `df`).
toshowdf = val_df[val_df["target"] == 1]

# Correctly predicted positives, then the total number of true positives.
print(toshowdf["Predictions"].to_list().count(1))
print(len(toshowdf["Predictions"]))

movie titanic jack rose could stayed wooden beam without sinking
0
0
501
709


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df["Predictions"] = predictions


In [None]:
import pickle
from tensorflow.keras.models import save_model

# Persist the fitted tokenizer so the same word -> index mapping can be
# reloaded later together with the model.
with open("tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# Write the trained model to an HDF5 (.h5) file.
model.save("twitter_classification_model.h5")