<a href="https://colab.research.google.com/github/JimenaBaripatti/FeatureEngineering/blob/main/Fire_Incidents_NLP_TopicModelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Cardinality reduction of semi-fixed text variables**

Process: text preprocessing, BoW, LDA Topic Modelling 

In [2]:
!pip install Unidecode

Collecting Unidecode
  Downloading Unidecode-1.3.2-py3-none-any.whl (235 kB)
[?25l[K     |█▍                              | 10 kB 20.3 MB/s eta 0:00:01[K     |██▉                             | 20 kB 25.8 MB/s eta 0:00:01[K     |████▏                           | 30 kB 20.7 MB/s eta 0:00:01[K     |█████▋                          | 40 kB 16.9 MB/s eta 0:00:01[K     |███████                         | 51 kB 5.6 MB/s eta 0:00:01[K     |████████▍                       | 61 kB 6.1 MB/s eta 0:00:01[K     |█████████▊                      | 71 kB 5.5 MB/s eta 0:00:01[K     |███████████▏                    | 81 kB 6.1 MB/s eta 0:00:01[K     |████████████▌                   | 92 kB 6.1 MB/s eta 0:00:01[K     |██████████████                  | 102 kB 5.3 MB/s eta 0:00:01[K     |███████████████▎                | 112 kB 5.3 MB/s eta 0:00:01[K     |████████████████▊               | 122 kB 5.3 MB/s eta 0:00:01[K     |██████████████████              | 133 kB 5.3 MB/s eta 0:00:0

In [54]:
!pip install flashtext

Collecting flashtext
  Downloading flashtext-2.7.tar.gz (14 kB)
Building wheels for collected packages: flashtext
  Building wheel for flashtext (setup.py) ... [?25l[?25hdone
  Created wheel for flashtext: filename=flashtext-2.7-py2.py3-none-any.whl size=9310 sha256=a7a67dc3aef533bc7659de94d493e9382e263f7a6d5516a5654a393c587f126b
  Stored in directory: /root/.cache/pip/wheels/cb/19/58/4e8fdd0009a7f89dbce3c18fff2e0d0fa201d5cdfd16f113b7
Successfully built flashtext
Installing collected packages: flashtext
Successfully installed flashtext-2.7


In [83]:
!pip install pyldavis

Collecting pyldavis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 5.2 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting funcy
  Downloading funcy-1.16-py2.py3-none-any.whl (32 kB)
Collecting pandas>=1.2.0
  Downloading pandas-1.3.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.3 MB)
[K     |████████████████████████████████| 11.3 MB 44.7 MB/s 
Collecting numpy>=1.20.0
  Downloading numpy-1.21.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.7 MB)
[K     |████████████████████████████████| 15.7 MB 17.3 MB/s 
Building wheels for collected packages: pyldavis
  Building wheel for pyldavis (PEP 517) ... [?25l[?25hdone
  Created wheel for pyldavis: filename=pyLDAvis-3.3.1-py2.py3-none-any.whl size=136897 sha256=9728705394fe8

In [1]:
import pandas as pd
import numpy as np

In [13]:
#general

import time

from collections import Counter

from unidecode import unidecode

import re

import glob

import pprint

from pathlib import Path

#nlp libraries

from flashtext import KeywordProcessor

keyword_processor = KeywordProcessor(case_sensitive=False)

from nltk import bigrams

from nltk.util import trigrams

import spacy

nlp = spacy.load('en_core_web_sm')

In [37]:
import nltk

In [39]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Functions

In [19]:
def remove_special_characters(text, remove_digits=False):

    text = re.sub('\'', '', text) # replace apostraphes with nothing

    text = re.sub('’', '', text) # replace apostraphes with nothing

    text = re.sub(u"\u2019", '', text)

    text = re.sub(u"\u2018", '', text)# replace apostraphes with nothing

    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]' #replace all other text with a space

    text = re.sub(pattern, ' ', text)

    return text

 

def lemmatize(text):

    sent = []

    doc = nlp(text)

    for word in doc:

        if word.is_punct == False and word.is_alpha == True and word.is_stop == False and word.pos_ in {'NOUN', 'VERB', 'ADJ'} :

            sent.append(word.lemma_)

    return " ".join(sent)

 

def remove_stop_words(text):  

    sent = []

    doc = nlp(text)

    for word in doc:

        if word.is_stop == False:

            sent.append(word.lemma_)

    return " ".join(sent)

 

def standardize(text):

    text = remove_special_characters(text)

    #text = keyword_processor.replace_keywords(text)

    text = lemmatize(text)

    text = remove_stop_words(text)   

    #text = keyword_processor.replace_keywords(text)

    return text

 

In [26]:
def get_w_freq(serie_string):

    """

    Create a dataframe with word frequency

    """

    serie_docs = serie_string.str.split()

    bow = serie_docs.explode()

    count_words = Counter(bow)

    return pd.DataFrame.from_records(count_words.most_common(), columns=['word', 'frequency'])

 

def get_bigram_freq(serie, n):

    serie_bigram = serie.str.split().apply(lambda x: list(bigrams(x)))

    count_bigrams = Counter(serie_bigram.explode())

    return pd.DataFrame.from_records(count_bigrams.most_common(n), columns=['bigram', 'frequency'])

# Preprocessing

In [7]:
df = pd.read_csv('https://raw.githubusercontent.com/JimenaBaripatti/FeatureEngineering/main/data/current_dataset/fire_incident_station_weather_demo_combined.csv', index_col=[0])

In [9]:
#create data frame to store high cardinality variables (X5), ID and label
df_c = df[['Incident_Numberinc_','Property_Use','Initial_CAD_Event_Type','Ignition_Source','Area_of_Origin','Material_First_Ignited']].copy()

In [10]:
df_c.head()

Unnamed: 0,Incident_Numberinc_,Property_Use,Initial_CAD_Event_Type,Ignition_Source,Area_of_Origin,Material_First_Ignited
0,F18020956,"896 - Sidewalk, street, roadway, highway, hwy ...",Vehicle Fire,999 - Undetermined,81 - Engine Area,47 - Vehicle
1,F18020969,"896 - Sidewalk, street, roadway, highway, hwy ...",Fire - Grass/Rubbish,999 - Undetermined,"75 - Trash, rubbish area (outside)",97 - Other
2,F18021182,891 - Outdoor general auto parking,Fire - Highrise Residential,,,
3,F18021192,511 - Department Store,Fire - Commercial/Industrial,999 - Undetermined,"75 - Trash, rubbish area (outside)",99 - Undetermined (formerly 98)
4,F18021271,860 - Lawn around structure,Fire - Residential,,,


In [11]:
df_c.columns

Index(['Incident_Numberinc_', 'Property_Use', 'Initial_CAD_Event_Type',
       'Ignition_Source', 'Area_of_Origin', 'Material_First_Ignited'],
      dtype='object')

In [13]:
df_c['raw_text'] = df_c['Property_Use'].str.cat(others= df_c[['Initial_CAD_Event_Type', 'Ignition_Source', 'Area_of_Origin', 'Material_First_Ignited']], sep=" " , na_rep= '',  )

In [14]:
df_c.head()

Unnamed: 0,Incident_Numberinc_,Property_Use,Initial_CAD_Event_Type,Ignition_Source,Area_of_Origin,Material_First_Ignited,raw_text
0,F18020956,"896 - Sidewalk, street, roadway, highway, hwy ...",Vehicle Fire,999 - Undetermined,81 - Engine Area,47 - Vehicle,"896 - Sidewalk, street, roadway, highway, hwy ..."
1,F18020969,"896 - Sidewalk, street, roadway, highway, hwy ...",Fire - Grass/Rubbish,999 - Undetermined,"75 - Trash, rubbish area (outside)",97 - Other,"896 - Sidewalk, street, roadway, highway, hwy ..."
2,F18021182,891 - Outdoor general auto parking,Fire - Highrise Residential,,,,891 - Outdoor general auto parking Fire - Hig...
3,F18021192,511 - Department Store,Fire - Commercial/Industrial,999 - Undetermined,"75 - Trash, rubbish area (outside)",99 - Undetermined (formerly 98),511 - Department Store Fire - Commercial/Indus...
4,F18021271,860 - Lawn around structure,Fire - Residential,,,,860 - Lawn around structure Fire - Residential


In [15]:
df_c['raw_text'][:5].values

array(['896 - Sidewalk, street, roadway, highway, hwy (do not use for fire incidents) Vehicle Fire 999 - Undetermined 81 - Engine Area 47 - Vehicle',
       '896 - Sidewalk, street, roadway, highway, hwy (do not use for fire incidents) Fire - Grass/Rubbish 999 - Undetermined 75 - Trash, rubbish area (outside) 97 - Other',
       '891 - Outdoor general auto parking Fire -  Highrise Residential   ',
       '511 - Department Store Fire - Commercial/Industrial 999 - Undetermined 75 - Trash, rubbish area (outside) 99 - Undetermined (formerly 98)',
       '860 - Lawn around structure Fire - Residential   '], dtype=object)

In [25]:
df_c['clean_text'] = df_c['raw_text'].apply(standardize)

In [27]:
w_freq =  get_w_freq(df_c['clean_text'])

In [74]:
w_freq[200:300]

Unnamed: 0,word,frequency
200,senior,70
201,camp,70
202,sale,70
203,linen,69
204,forest,68
...,...,...
295,park,24
296,bakery,24
297,clothing,24
298,elevator,23


## Standarize

In [56]:
tokens = df_c['clean_text'][ df_c['clean_text'].str.len() > 1].str.split() #only text with more than 1 word
serie_bigram = tokens.apply(lambda x: list(bigrams(x)))

count_bigrams = Counter(serie_bigram.explode())

count_bigrams.most_common(30)

[(('dwell', 'unit'), 3677),
 (('range', 'burner'), 1915),
 (('rubbish', 'trash'), 1836),
 (('trash', 'waste'), 1827),
 (('smoker', 'article'), 1594),
 (('article', 'cigarette'), 1594),
 (('cigarette', 'cigar'), 1594),
 (('cigar', 'pipe'), 1594),
 (('pipe', 'ignite'), 1594),
 (('oil', 'grease'), 1457),
 (('unit', 'fihr'), 1407),
 (('cooking', 'oil'), 1259),
 (('trash', 'inc'), 1070),
 (('inc', 'garbage'), 1070),
 (('garbage', 'chute'), 1070),
 (('chute', 'room'), 1070),
 (('room', 'garbage'), 1070),
 (('garbage', 'industri'), 1070),
 (('burner', 'cooking'), 1000),
 (('street', 'roadway'), 942),
 (('roadway', 'highway'), 942),
 (('highway', 'use'), 942),
 (('use', 'fire'), 942),
 (('fire', 'incident'), 942),
 (('patient', 'room'), 871),
 (('room', 'dormitory'), 871),
 (('industri', 'rubbish'), 817),
 (('undetermine', 'undetermined'), 759),
 (('attach', 'dwell'), 684),
 (('automobile', 'vef'), 661)]

In [60]:
keyword_dict = {'trash' : ['rubbish', 'waste', 'garbage'],
                'cigarette': ['cigar', 'pipe'],
                'road': ['roadway', 'highway'],
                'vehicle': ['car', 'automobile']
         
                }

keyword_processor.add_keywords_from_dict(keyword_dict)

df_c['clean_text'] =  df_c['clean_text'].apply(keyword_processor.replace_keywords)

In [2]:
#df_c.to_csv('fire_incident_clean_text.csv')
df_c = pd.read_csv('fire_incident_clean_text.csv')

In [6]:
df_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17536 entries, 0 to 17535
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Unnamed: 0              17536 non-null  int64 
 1   Incident_Numberinc_     17536 non-null  object
 2   Property_Use            17535 non-null  object
 3   Initial_CAD_Event_Type  17536 non-null  object
 4   Ignition_Source         15623 non-null  object
 5   Area_of_Origin          15623 non-null  object
 6   Material_First_Ignited  15623 non-null  object
 7   raw_text                17536 non-null  object
 8   clean_text              17322 non-null  object
dtypes: int64(1), object(8)
memory usage: 1.2+ MB


In [7]:
df_c['clean_text'] = df_c['clean_text'].astype('str')

# Topic Modelling - LDA

In [3]:
from gensim.corpora.dictionary import Dictionary
from gensim import models

In [18]:
texts = df_c['clean_text'].str.split().copy()
dictionary = Dictionary(texts)

dictionary.filter_extremes(no_below=10, no_above=0.80)

stoplist = ['undetermined', 'undetermine', 'e', 'g']

stop_ids = [
    dictionary.token2id[stopword]
    for stopword in stoplist
    if stopword in dictionary.token2id
]

dictionary.filter_tokens(stop_ids)

print(dictionary)

Dictionary(347 unique tokens: ['fire', 'incident', 'road', 'street', 'use']...)


In [19]:
num_topics = 4
corpus = [dictionary.doc2bow(text) for text in texts]

lda = models.LdaModel(corpus, id2word=dictionary, num_topics=num_topics)

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

In [20]:
coherence_model_lda = models.CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')

coherence_lda = coherence_model_lda.get_coherence()

print('\nCoherence Score: ', coherence_lda)



Coherence Score:  0.2406728821943167


In [21]:
for i in range(num_topics):

    print('Topic number', i)

    pprint.pprint(lda.show_topic(i, topn=5))

    print('\n')


Topic number 0
[('vehicle', 0.1257901),
 ('burner', 0.09620944),
 ('range', 0.09592578),
 ('oil', 0.07106248),
 ('grease', 0.06953113)]


Topic number 1
[('unit', 0.08423406),
 ('dwell', 0.08318238),
 ('room', 0.053337473),
 ('patient', 0.03800157),
 ('dormitory', 0.03799863)]


Topic number 2
[('attach', 0.04685684),
 ('dwell', 0.040135056),
 ('townhouse', 0.034877993),
 ('rowhouse', 0.034876738),
 ('wood', 0.03139219)]


Topic number 3
[('trash', 0.28812546),
 ('cigarette', 0.13891321),
 ('smoker', 0.04477518),
 ('article', 0.044667505),
 ('ignite', 0.044413183)]




In [22]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

# feed the LDA model into the pyLDAvis instance
lda_viz = gensimvis.prepare(lda, corpus, dictionary, sort_topics=True)


  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [23]:
lda_viz

In [27]:
texts[0]

['street', 'road', 'road', 'use', 'fire', 'incident', 'undetermine', 'vehicle']

In [25]:
lda.get_document_topics(texts[0])

ValueError: ignored