<a href="https://colab.research.google.com/github/JimenaBaripatti/FeatureEngineering/blob/main/Fire_Incidents_NLP_TopicModelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Cardinality reduction of semi-fixed text variables**

Process: text preprocessing, BoW, LDA Topic Modelling 

In [33]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
!pip install Unidecode



In [3]:
!pip install flashtext



In [4]:
!pip install pyldavis



In [5]:
import pandas as pd
import numpy as np

In [6]:
#general

import time

from collections import Counter

from unidecode import unidecode

import re

import glob

import pprint

from pathlib import Path

#nlp libraries

from flashtext import KeywordProcessor

keyword_processor = KeywordProcessor(case_sensitive=False)

from nltk import bigrams

from nltk.util import trigrams

import spacy

nlp = spacy.load('en_core_web_sm')

In [None]:
import nltk

In [None]:
nltk.download('punkt')

# Functions

In [7]:
def remove_special_characters(text, remove_digits=False):

    text = re.sub('\'', '', text) # replace apostraphes with nothing

    text = re.sub('’', '', text) # replace apostraphes with nothing

    text = re.sub(u"\u2019", '', text)

    text = re.sub(u"\u2018", '', text)# replace apostraphes with nothing

    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]' #replace all other text with a space

    text = re.sub(pattern, ' ', text)

    return text

 

def lemmatize(text):

    sent = []

    doc = nlp(text)

    for word in doc:

        if word.is_punct == False and word.is_alpha == True and word.is_stop == False and word.pos_ in {'NOUN', 'VERB', 'ADJ'} :

            sent.append(word.lemma_)

    return " ".join(sent)

 

def remove_stop_words(text):  

    sent = []

    doc = nlp(text)

    for word in doc:

        if word.is_stop == False:

            sent.append(word.lemma_)

    return " ".join(sent)

 

def standardize(text):

    text = remove_special_characters(text)

    text = lemmatize(text)

    text = remove_stop_words(text)   

    text = keyword_processor.replace_keywords(text)

    return text

In [8]:
def get_w_freq(serie_string):

    """

    Create a dataframe with word frequency

    """

    serie_docs = serie_string.str.split()

    bow = serie_docs.explode()

    count_words = Counter(bow)

    return pd.DataFrame.from_records(count_words.most_common(), columns=['word', 'frequency'])

 

def get_bigram_freq(serie, n):

    tokens = serie[ serie.str.len() > 1].str.split() #only text with more than 1 word
    serie_bigram = tokens.apply(lambda x: list(bigrams(x)))
    count_bigrams = Counter(serie_bigram.explode())

    return pd.DataFrame.from_records(count_bigrams.most_common(n), columns=['bigram', 'frequency'])

# Clean text and standarize

Use training data

In [9]:
df = pd.read_csv('https://raw.githubusercontent.com/JimenaBaripatti/FeatureEngineering/main/data/current_dataset/training_initial.csv')
#create data frame to store high cardinality variables (X5), ID and label
df_c = df[['Incident_Numberinc_','Property_Use','Initial_CAD_Event_Type','Ignition_Source','Area_of_Origin','Material_First_Ignited']].copy()

In [11]:
df_c.head()

Unnamed: 0_level_0,Incident_Numberinc_,Property_Use,Initial_CAD_Event_Type,Ignition_Source,Area_of_Origin,Material_First_Ignited
_id_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1951921,F17039755,"539 - Gifts, jewellery, leather goods, mixed g...",FIHRD,12 - Oven,28 - Office,14 - Interior Wall/Ceiling
1949598,F18134861,"896 - Sidewalk, street, roadway, highway, hwy ...",Fire - Grass/Rubbish,,,
1964401,F13046589,323 - Multi-Unit Dwelling - Over 12 Units,FAHR,52 - Florescent Lamp (includes ballast),"12 - Hallway, Corridor",43 - Electrical Wiring Insulation
1961269,F13104454,323 - Multi-Unit Dwelling - Over 12 Units,FAHR,999 - Undetermined,42 - Garage,"46 - Rubbish, Trash, Waste"
1951192,F18062211,"896 - Sidewalk, street, roadway, highway, hwy ...",Check Call,,,


In [12]:
df_c.shape

(14028, 6)

In [13]:
df_c['raw_text'] = df_c['Property_Use'].str.cat(others= df_c[['Initial_CAD_Event_Type', 'Ignition_Source', 'Area_of_Origin', 'Material_First_Ignited']], sep=" " , na_rep= '',  )

In [14]:
df_c.head()

Unnamed: 0_level_0,Incident_Numberinc_,Property_Use,Initial_CAD_Event_Type,Ignition_Source,Area_of_Origin,Material_First_Ignited,raw_text
_id_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1951921,F17039755,"539 - Gifts, jewellery, leather goods, mixed g...",FIHRD,12 - Oven,28 - Office,14 - Interior Wall/Ceiling,"539 - Gifts, jewellery, leather goods, mixed g..."
1949598,F18134861,"896 - Sidewalk, street, roadway, highway, hwy ...",Fire - Grass/Rubbish,,,,"896 - Sidewalk, street, roadway, highway, hwy ..."
1964401,F13046589,323 - Multi-Unit Dwelling - Over 12 Units,FAHR,52 - Florescent Lamp (includes ballast),"12 - Hallway, Corridor",43 - Electrical Wiring Insulation,323 - Multi-Unit Dwelling - Over 12 Units FAHR...
1961269,F13104454,323 - Multi-Unit Dwelling - Over 12 Units,FAHR,999 - Undetermined,42 - Garage,"46 - Rubbish, Trash, Waste",323 - Multi-Unit Dwelling - Over 12 Units FAHR...
1951192,F18062211,"896 - Sidewalk, street, roadway, highway, hwy ...",Check Call,,,,"896 - Sidewalk, street, roadway, highway, hwy ..."


In [115]:
keyword_dict = {'trash' : ['rubbish', 'waste', 'garbage'],
                'cigarette': ['cigar', 'pipe'],
                'road': ['street', 'roadway', 'highway'],
                'vehicle': ['car', 'automobile'],
                'house': ['townhouse', 'rowhouse', 'dwell'],
         
                }

keyword_processor.add_keywords_from_dict(keyword_dict)

In [16]:
df_c['clean_text'] = df_c['raw_text'].apply(standardize)

In [38]:
df_c['clean_text'] = df_c['clean_text'].apply(keyword_processor.replace_keywords)

In [39]:
w_freq =  get_w_freq(df_c['clean_text'])

In [40]:
w_freq[:50]

Unnamed: 0,word,frequency
0,trash,9495
1,house,4756
2,cigarette,3954
3,vehicle,3830
4,undetermined,3334
5,unit,3129
6,undetermine,2276
7,road,2248
8,room,1967
9,burner,1534


In [60]:
df_c[ df_c['clean_text'].str.contains('unit ')]

Unnamed: 0_level_0,Incident_Numberinc_,Property_Use,Initial_CAD_Event_Type,Ignition_Source,Area_of_Origin,Material_First_Ignited,raw_text,clean_text
_id_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1964401,F13046589,323 - Multi-Unit Dwelling - Over 12 Units,FAHR,52 - Florescent Lamp (includes ballast),"12 - Hallway, Corridor",43 - Electrical Wiring Insulation,323 - Multi-Unit Dwelling - Over 12 Units FAHR...,house unit include ballast insulation
1961269,F13104454,323 - Multi-Unit Dwelling - Over 12 Units,FAHR,999 - Undetermined,42 - Garage,"46 - Rubbish, Trash, Waste",323 - Multi-Unit Dwelling - Over 12 Units FAHR...,house unit undetermined trash trash trash
1962593,F11161591,323 - Multi-Unit Dwelling - Over 12 Units,FIHR,"11 - Stove, Range-top burner",24 - Cooking Area or Kitchen,"74 - Cooking Oil, Grease",323 - Multi-Unit Dwelling - Over 12 Units FIHR...,house unit fihr range burner cooking oil grease
1952539,F17093622,323 - Multi-Unit Dwelling - Over 12 Units,FIHR,999 - Undetermined,"13 - Stairway, Escalator","46 - Rubbish, Trash, Waste",323 - Multi-Unit Dwelling - Over 12 Units FIHR...,house unit fihr undetermine stairway escalator...
1961385,F16102892,322 - Multi-Unit Dwelling - 7 to 12 Units,FAHR,"11 - Stove, Range-top burner",24 - Cooking Area or Kitchen,"74 - Cooking Oil, Grease",322 - Multi-Unit Dwelling - 7 to 12 Units FAHR...,multi dwelling unit range burner cooking oil g...
...,...,...,...,...,...,...,...,...
1959821,F15100989,323 - Multi-Unit Dwelling - Over 12 Units,FIHRD,999 - Undetermined,22 - Sleeping Area or Bedroom (inc. patients r...,"31 - Mattress, Pillow",323 - Multi-Unit Dwelling - Over 12 Units FIHR...,house unit undetermine patient room dormitory ...
1960069,F15017831,323 - Multi-Unit Dwelling - Over 12 Units,FIHRD,79 - Other Open Flame Tools/Smokers' Articles,22 - Sleeping Area or Bedroom (inc. patients r...,"31 - Mattress, Pillow",323 - Multi-Unit Dwelling - Over 12 Units FIHR...,house unit tool article patient room dormitory...
1963060,F15112359,322 - Multi-Unit Dwelling - 7 to 12 Units,FIHR,"11 - Stove, Range-top burner",24 - Cooking Area or Kitchen,97 - Other,322 - Multi-Unit Dwelling - 7 to 12 Units FIHR...,multi house unit fihr range burner
1953714,F11121707,323 - Multi-Unit Dwelling - Over 12 Units,FIHR,"11 - Stove, Range-top burner",24 - Cooking Area or Kitchen,23 - Cabinetry,323 - Multi-Unit Dwelling - Over 12 Units FIHR...,house unit fihr range burner


In [41]:
bigram_freq = get_bigram_freq(df_c['clean_text'], 20)
bigram_freq

Unnamed: 0,bigram,frequency
0,"(trash, trash)",3421
1,"(house, unit)",2939
2,"(cigarette, cigarette)",2560
3,"(range, burner)",1531
4,"(road, road)",1498
5,"(smoker, article)",1280
6,"(article, cigarette)",1280
7,"(cigarette, ignite)",1280
8,"(oil, grease)",1167
9,"(unit, fihr)",1119


In [20]:
df_c.to_csv('train_incident_clean_text.csv')
#df_c = pd.read_csv('fire_incident_clean_text.csv')

In [None]:
df_c.info()

In [None]:
#df_c['clean_text'] = df_c['clean_text'].astype('str')

# Topic Modelling - LDA

In [21]:
from gensim.corpora.dictionary import Dictionary
from gensim import models

In [61]:
texts = df_c['clean_text'].str.split().copy()
dictionary = Dictionary(texts)

dictionary.filter_extremes(no_below=10, no_above=0.50)

stoplist = ['undetermined', 'undetermine', 'e', 'g', 'fire', 'incident', 'unit', 'inc', 'fihr', 'fir', 'fici']

stop_ids = [
    dictionary.token2id[stopword]
    for stopword in stoplist
    if stopword in dictionary.token2id
]

dictionary.filter_tokens(stop_ids)

print(dictionary)

Dictionary(319 unique tokens: ['good', 'jewellery', 'leather', 'office', 'oven']...)


In [62]:
num_topics = 4

corpus = [dictionary.doc2bow(text) for text in texts]
ldamodel = models.LdaModel(corpus, id2word=dictionary, num_topics=num_topics, random_state=0)

for i in range(num_topics):

    print('Topic number', i)

    pprint.pprint(ldamodel.show_topic(i, topn=5))

    print('\n')


Topic number 0
[('house', 0.11779296),
 ('road', 0.10669046),
 ('burner', 0.074478656),
 ('range', 0.07437411),
 ('oil', 0.052091643)]


Topic number 1
[('vehicle', 0.24173911),
 ('electrical', 0.06565055),
 ('insulation', 0.06564075),
 ('plastic', 0.032919366),
 ('vef', 0.025043154)]


Topic number 2
[('cigarette', 0.15222582),
 ('house', 0.059176177),
 ('ignite', 0.054586492),
 ('article', 0.054175403),
 ('smoker', 0.04938931)]


Topic number 3
[('trash', 0.3587738),
 ('room', 0.086102255),
 ('house', 0.063032776),
 ('dormitory', 0.039448503),
 ('patient', 0.039437115)]




In [63]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

# feed the LDA model into the pyLDAvis instance
lda_viz = gensimvis.prepare(ldamodel, corpus, dictionary, sort_topics=True)


In [64]:
lda_viz

In [65]:
num_topics = 5

ldamodel = models.LdaModel(corpus, id2word=dictionary, num_topics=num_topics, random_state=0)

for i in range(num_topics):

    print('Topic number', i)

    pprint.pprint(ldamodel.show_topic(i, topn=5))

    print('\n')


Topic number 0
[('road', 0.14144872),
 ('burner', 0.09881301),
 ('range', 0.098673835),
 ('oil', 0.06928492),
 ('grease', 0.068331)]


Topic number 1
[('vehicle', 0.24781783),
 ('electrical', 0.06631986),
 ('insulation', 0.060128465),
 ('plastic', 0.03199409),
 ('vef', 0.026995987)]


Topic number 2
[('cigarette', 0.18139337),
 ('ignite', 0.06526273),
 ('article', 0.0592747),
 ('smoker', 0.058847006),
 ('house', 0.044898994)]


Topic number 3
[('trash', 0.5420605),
 ('room', 0.0629687),
 ('chute', 0.05710876),
 ('industri', 0.056664445),
 ('house', 0.04367334)]


Topic number 4
[('house', 0.16022101),
 ('room', 0.070015125),
 ('dormitory', 0.06112365),
 ('patient', 0.0611231),
 ('wear', 0.040099725)]




In [66]:
# feed the LDA model into the pyLDAvis instance
lda_viz = gensimvis.prepare(ldamodel, corpus, dictionary, sort_topics=True)


In [67]:
lda_viz

In [68]:
coherence_model_lda = models.CoherenceModel(model=ldamodel, texts=texts, dictionary=dictionary, coherence='c_v')

coherence_lda = coherence_model_lda.get_coherence()

print('\nCoherence Score: ', coherence_lda)



Coherence Score:  0.3026828913086479


In [70]:
# Save model to disk.
ldamodel.save('lda.model')

## Topic probabilities

In [96]:
#len(corpus) 14028

In [100]:
train_vecs = []
for i in range(len(corpus)):
    top_topics = ldamodel.get_document_topics(corpus[i], minimum_probability=0.0)
    topic_vec = [top_topics[i][1] for i in range(num_topics)]
    train_vecs.append(topic_vec)

In [102]:
df_topic_prob = pd.DataFrame(train_vecs, columns=['t0', 't1', 't2', 't3', 't4'])

In [104]:
df_topic_prob.head()

Unnamed: 0,t0,t1,t2,t3,t4
0,0.514684,0.159836,0.028584,0.028623,0.268272
1,0.703259,0.033333,0.033384,0.19669,0.033333
2,0.040245,0.404311,0.040157,0.040158,0.475129
3,0.040418,0.040003,0.040325,0.838076,0.041178
4,0.700119,0.033334,0.199865,0.033349,0.033334


In [112]:
df_c.reset_index(inplace=True)
df_final_train = df_c.join(df_topic_prob)
df_train_topic_prob = df_final_train[[ 'Incident_Numberinc_', 't0', 't1', 't2',
       't3', 't4']]

In [113]:
df_train_topic_prob.head()

Unnamed: 0,Incident_Numberinc_,t0,t1,t2,t3,t4
0,F17039755,0.514684,0.159836,0.028584,0.028623,0.268272
1,F18134861,0.703259,0.033333,0.033384,0.19669,0.033333
2,F13046589,0.040245,0.404311,0.040157,0.040158,0.475129
3,F13104454,0.040418,0.040003,0.040325,0.838076,0.041178
4,F18062211,0.700119,0.033334,0.199865,0.033349,0.033334


In [114]:
df_train_topic_prob.to_csv('train_topic_model_prob.csv')

#Test dataset

In [117]:
df = pd.read_csv('https://raw.githubusercontent.com/JimenaBaripatti/FeatureEngineering/main/data/current_dataset/testing_initial.csv')
#create data frame to store high cardinality variables (X5), ID and label
df_c = df[['Incident_Numberinc_','Property_Use','Initial_CAD_Event_Type','Ignition_Source','Area_of_Origin','Material_First_Ignited']].copy()

In [118]:
df_c['raw_text'] = df_c['Property_Use'].str.cat(others= df_c[['Initial_CAD_Event_Type', 'Ignition_Source', 'Area_of_Origin', 'Material_First_Ignited']], sep=" " , na_rep= '',  )

In [119]:
df_c['clean_text'] = df_c['raw_text'].apply(standardize)

In [120]:
texts = df_c['clean_text'].str.split().copy()
corpus = [dictionary.doc2bow(text) for text in texts]

train_vecs = []
for i in range(len(corpus)):
    top_topics = ldamodel.get_document_topics(corpus[i], minimum_probability=0.0)
    topic_vec = [top_topics[i][1] for i in range(num_topics)]
    train_vecs.append(topic_vec)

In [121]:
df_topic_prob = pd.DataFrame(train_vecs, columns=['t0', 't1', 't2', 't3', 't4'])

In [122]:
df_c.reset_index(inplace=True)
df_final_test = df_c.join(df_topic_prob)
df_test_topic_prob = df_final_test[[ 'Incident_Numberinc_', 't0', 't1', 't2',
       't3', 't4']]

In [123]:
df_test_topic_prob.head()

Unnamed: 0,Incident_Numberinc_,t0,t1,t2,t3,t4
0,F13080506,0.025029,0.025001,0.025092,0.898346,0.026533
1,F13110611,0.595794,0.226407,0.02235,0.022308,0.133142
2,F17084837,0.040422,0.040041,0.838111,0.04029,0.041137
3,F12006092,0.050717,0.050006,0.796714,0.050472,0.052091
4,F12031790,0.015448,0.015388,0.489893,0.463866,0.015404


In [124]:
df_test_topic_prob.shape

(3508, 6)

In [125]:
df_test_topic_prob.to_csv('test_topic_model_prob.csv')