In [13]:
import pandas as pd
import numpy as np
import tensorflow as tf
import string
import re


In [14]:
data = pd.read_csv('/kaggle/input/us-crime-data/US_Crime_Data.csv')
data.sample(10)

Unnamed: 0,Date,Title,Organization,City,State,URL,Keyword,Summary
1237,4/27/17 10:03,"Within Trump's first 100 days, anti-Muslim inc...",Mic,New York,NY,http://mic.com/articles/175332/within-trump-s-...,american anti border cair cbp customs donald i...,by Sarah A. Harvard\tPresident Donald Trump ha...
6481,12/4/17 7:45,"Gates Police: Man pointed gun, yelled racial s...",rochesterfirst.com,Mendon,NY,http://www.rochesterfirst.com/news/local-news/...,,
6106,11/13/17 8:30,FBI reports: Hate crimes rose for 2nd year in ...,FOX 61,Hartford,CT,http://fox61.com/2017/11/13/fbi-reports-hate-c...,4229 andrew baltimore burton crimes fbi freddi...,"Posted 7:06 PM, November 13, 2017, by AP\tBALT..."
4528,9/4/17 4:20,Nuke F******G Sikhs': Man Vandalizes Sikh Temp...,Carbonated.tv,San Ramon,CA,http://www.carbonated.tv/news/sikh-temple-vand...,coalition facebook feliz hate hollywood los_an...,The police allegedly did not seem too interest...
3314,7/13/17 1:03,Man arrested in DC hit-and-run that critically...,WTOP,Washington,DC,http://wtop.com/dc/2017/07/hit-run-transgender...,anderson captain corado critically davon faceb...,"By Jack Pointer July 13, 2017 4:05 am\tWASHING..."
1588,5/19/17 14:56,Benefit concert Saturday in Troy for victims o...,Albany Times Union,Albany,NY,http://www.timesunion.com/allwcm/article/Benef...,arson cold concert crime felonies garage hate ...,TROY — A benefit concert to help the family wh...
4923,9/26/17 15:41,Victims in Thousands of Potential Hate Crimes ...,Tri County Sentry,,,http://tricountysentry.com/blog/victims-in-tho...,attorney bureau crime crimes fbi general hate ...,By Joe Sexton\tMore than half of the people wh...
6658,12/11/17 8:48,Facebook Live Torture Case: Teen Takes Plea De...,Patch.com,New York,,https://patch.com/illinois/crystallake/faceboo...,brittany chicago covington crime crystal donal...,She and three others were accused of kidnappin...
306,3/5/17 10:00,Police Contact FBI After Sikh Shot Outside Sea...,OPB News,Portland,OR,http://www.opb.org/news/article/police-fbi-sik...,,
444,3/10/17 2:30,Sikh man shooting in US: Kent police terms inc...,Tech Know Bits,Cumming,GA,http://techknowbits.com/2017/03/sikh-man-shoot...,,


In [15]:
data.isna().sum()

Date               0
Title              1
Organization       0
City            1167
State           1245
URL                0
Keyword         1176
Summary         2256
dtype: int64

## For this model we just need headlines

In [16]:
df = data[['Title']]
df = df.dropna().reset_index(drop = True)
df.head()

Unnamed: 0,Title
0,Pizza Hut driver who killed co-worker with sho...
1,Residents of NJ township receiving KKK promoti...
2,House OKs bill to expand Kentucky's hate crime...
3,"Amid Protests, 'Blue Lives Matter' Bill Passes..."
4,Lafourche inmates charged with hate crimes in ...


In [17]:
df['Title'][123]

'St. Louis suburb victimized by cemetery vandalism mulling hate crimes registry'

In [18]:
df.shape

(6782, 1)

# Removing Punctuations

In [19]:
def clean_text(df):
    df['Title'] = df['Title'].apply(lambda x : x.lower())
    tokens = df['Title'].str.replace('[{}]'.format(string.punctuation), '')
    return tokens

In [20]:
tokens = clean_text(df)

  This is separate from the ipykernel package so we can avoid doing imports until


In [21]:
tokens

0       pizza hut driver who killed coworker with shot...
1       residents of nj township receiving kkk promoti...
2       house oks bill to expand kentuckys hate crimes...
3       amid protests blue lives matter bill passes ke...
4       lafourche inmates charged with hate crimes in ...
                              ...                        
6777    police rash of vandalism on margate menorahs i...
6778    exstudent accused of smearing used tampon on r...
6779    queensbury woman charged with hate crime again...
6780    hate crime hartford student accused of smearin...
6781    police make arrest following alleged hate crim...
Name: Title, Length: 6782, dtype: object

In [22]:
len(set(tokens))

6563

# Tokenizing

In [24]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(tokens)
seq = tokenizer.texts_to_sequences(tokens)

In [25]:
seq[:10]

[[3973, 3974, 450, 47, 88, 3975, 12, 2211, 103, 1370, 2803],
 [830, 4, 423, 1558, 2212, 470, 3976, 672],
 [257, 1846, 75, 5, 593, 3977, 1, 6, 36],
 [721, 898, 480, 274, 301, 75, 481, 634, 257],
 [3978, 2804, 16, 12, 1, 6, 3, 424, 19],
 [13, 722, 1, 2, 48, 594, 3979, 5, 401],
 [3980, 480, 274, 301, 75, 21, 1371, 2805, 5, 214, 3981, 173],
 [194, 320, 18, 402, 2806, 239, 15, 98, 38],
 [194,
  320,
  18,
  402,
  2806,
  239,
  15,
  98,
  38,
  635,
  174,
  556,
  557,
  14,
  498,
  558,
  275,
  42],
 [33, 174, 673, 302, 2, 1372, 2807, 3, 2213]]

# Creating input and output data list

In [29]:
x = []
y = []
total_words_drop = 0
for i in seq:
    if len(i) > 1:
        for j in range(1, len(i)):
            x.append(i[:j])
            y.append(i[j])
            
    else : 
        total_words_drop +=1
print('Total Words Dropped : {}'.format(total_words_drop))

Total Words Dropped : 12


In [31]:
y[: 10]

[3974, 450, 47, 88, 3975, 12, 2211, 103, 1370, 2803]

# Padding sequences

In [33]:
x = tf.keras.preprocessing.sequence.pad_sequences(x)

In [35]:
x.shape

(64701, 49)

# Shaping y same as x

In [36]:
y = tf.keras.utils.to_categorical(y)

In [37]:
y.shape

(64701, 7569)

# Vocab Size : total no. of unique words

In [38]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

7569

# LSTM Model

In [40]:
model = tf.keras.Sequential([tf.keras.layers.Embedding(vocab_size,49 ),
                            tf.keras.layers.LSTM(100, return_sequences = True),
                            tf.keras.layers.LSTM(100),
                            tf.keras.layers.Dense(100, activation = 'relu'),
                            tf.keras.layers.Dense(vocab_size, activation = 'softmax')])

In [41]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 49)          370881    
_________________________________________________________________
lstm_2 (LSTM)                (None, None, 100)         60000     
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 7569)              764469    
Total params: 1,285,850
Trainable params: 1,285,850
Non-trainable params: 0
_________________________________________________________________


In [42]:
model.compile(loss  = 'categorical_crossentropy',
             optimizer = 'adam',
             metrics = ['accuracy'],
             )

In [44]:
history = model.fit(x,y,
                   epochs = 100,
                    batch_size = 256,
                    callbacks = tf.keras.callbacks.EarlyStopping(monitor = 'val_loss',
                                                               patience = 5,
                                                               restore_best_weights = True))

2023-01-25 12:22:51.670457: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1958887476 exceeds 10% of free system memory.
2023-01-25 12:22:53.639515: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1958887476 exceeds 10% of free system memory.


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

# Saving model

In [45]:
model.save('model.h5')

# Vocab Array : list of all the unique words

In [46]:
vocab_array = np.array(list(tokenizer.word_index.keys()))
vocab_array

array(['hate', 'crime', 'in', ..., 'walks', 'lgbti', 'retires'],
      dtype='<U23')

# Final Function for Predictions

In [51]:
def make_predictions(text, n_words):
    for i in range(n_words):
        text_tokenize = tokenizer.texts_to_sequences([text])
        text_padding  = tf.keras.preprocessing.sequence.pad_sequences(text_tokenize, maxlen = 49)
        prediction = np.squeeze(np.argmax(model.predict(text_padding), axis = -1))
        prediction = str(vocab_array[prediction - 1])
        text += " " + prediction
    return text

# Testing Model

In [52]:
make_predictions('california',5)

'california man accused of hate crime'

In [54]:
make_predictions('new york',8)

'new york mosque found not guilty to death on muslim'

In [58]:
make_predictions('highway',8)

'highway toppled against white nationalism life to tackle online'

> Its so fun to get prediction !

# Thanks!