<a href="https://colab.research.google.com/github/JimenaBaripatti/FeatureEngineering/blob/main/NLP_High_Cardinality_Variables.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**NLP: FE Categorical variables with high cardinality **bold text**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# setting up libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm 
import statistics
from sklearn.metrics import matthews_corrcoef
from scipy.stats import chi2_contingency
import math
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.ensemble import IsolationForest
from scipy.stats import zscore
from textblob import TextBlob

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

%matplotlib inline


# Setting working directory
#os.chdir("/content/drive/MyDrive/Colab Notebooks/Data/")

  import pandas.util.testing as tm


In [None]:
!pip install category_encoders

# Functions

In [3]:
# extract categorical vs. numerical columns
def find_diff_type_var(df):
    cat_var = df.loc[:,df.dtypes == np.object].columns
    num_var = df.loc[:,df.dtypes != np.object].columns
    return(cat_var, num_var)

In [None]:
# explore different variables
def exploration_cat(c):

  print('---------------------------------------')

  print('variable name:', c, '\n')
  print('number of categories: ' ,df[c].unique().shape[0], '\n')
  print(df[c].value_counts(normalize=True, dropna=False))

  print('---------------------------------------')

In [None]:
# plot categorical variables
def plot_cat(c):
  plt.figure(figsize=(20,7))
  cat_count=df[c].value_counts(normalize=True, dropna=False)
  fig=sns.barplot(100*cat_count.values[0:11], cat_count.index[0:11], alpha=0.9)
  plt.title('Frequency Distribution of ' + str(c))
  plt.xlabel('% occurences', fontsize=12)
  #fig.set_yticklabels(fig.get_yticklabels(),rotation=45, horizontalalignment='right')
  plt.show()


In [None]:
# frequency encoding
def freq_encode(c):
  freq_count=df[c].value_counts(normalize=True).to_dict()
  c_imputed=str(c)+'_imputed'
  df[c_imputed]=df[c].map(freq_count)
  return df

In [None]:
# Impute with the most frequent category
def impute_frequent (c):
  top_cat=df[c].value_counts().nlargest(1).index[0]
  index_to_impute=df[df[c].isnull()].index
  df.at[index_to_impute,c]=top_cat
  return df

In [None]:
# use WOE for categorical variables - feature creation based on the output woe value
def woe_encoder(c,target):
  from category_encoders import WOEEncoder
  x = WOEEncoder()
  x.fit(df[c],df[target])
  df[str(c)+'_woe']=x.transform(df[c],df[target])
  return df

# Load Complete Fire Data


In [4]:
df=pd.read_csv('https://raw.githubusercontent.com/JimenaBaripatti/FeatureEngineering/main/data/current_dataset/fire_incident_station_weather_demo_combined.csv', index_col=[0])

In [5]:
df.shape

(17536, 106)

In [None]:
df.head(2)

In [7]:
#drop unwanted columns
my_not_needed_cols = ['Station_Area', 'TFS_Alarm_Timeinc_', 'TFS_Arrival_Time',
                      'YEAR','TFS_ARR_DAY','TFS_ALM_DAY','TFS_ARR_HOUR','TFS_ALM_HOUR']
# additional numerical columns that are not needed
add_num_cols = ['Incident_Station_Area', 'Incident_Ward','ADDRESS_POINT_ID', 'ADDRESS_ID',
       'CENTRELINE_ID', 'ADDRESS_NUMBER', 'OBJECTID', 'fs_Longitude',
       'fs_Latitude', 'Latitude', 'Longitude']
add_cat_cols = ['Intersection', 'Last_TFS_Unit_Clear_Time','TFS_Alarm_Timeinc_', 
        'TFS_Arrival_Time', 'NAME', 'ADDRESS','LINEAR_NAME_FULL', 'PLACE_NAME', 
        'WARD_NAME', 'MUNICIPALITY_NAME', 'geometry','INCIDENT_DATE',
                'Ext_agent_app_or_defer_time','Fire_Under_Control_Time','location']

fire_not_needed_cols = ['_id_x','_id_y','Exposures']
Demographics_not_needed_cols = ['fsa_imputed','location','FSA']                        
station_not_needed_cols=['ID', 'ADDRESS_ID_LINK', 'X','Y', 'LATITUDE', 'LONGITUDE','POSTAL_CODE','MAINT_STAGE','GENERAL_USE','CLASS_FAMILY_DESC', 'INCIDENT_PERIOD_CAT']
weather_not_needed_cols=['date', 'Incident_Numberdemo','TFS_Alarm_Timedemo']
drop_columns = my_not_needed_cols + add_num_cols + add_cat_cols + Demographics_not_needed_cols + fire_not_needed_cols + station_not_needed_cols + weather_not_needed_cols
df.drop(columns=drop_columns, axis=1,inplace=True)

In [8]:
find_diff_type_var(df)

(Index(['Area_of_Origin', 'Building_Status', 'Business_Impact',
        'Extent_Of_Fire', 'Final_Incident_Type',
        'Fire_Alarm_System_Impact_on_Evacuation', 'Fire_Alarm_System_Operation',
        'Fire_Alarm_System_Presence', 'Ignition_Source', 'Incident_Numberinc_',
        'Initial_CAD_Event_Type', 'Level_Of_Origin', 'Material_First_Ignited',
        'Method_Of_Fire_Control', 'Possible_Cause', 'Property_Use',
        'Smoke_Alarm_at_Fire_Origin',
        'Smoke_Alarm_at_Fire_Origin_Alarm_Failure',
        'Smoke_Alarm_at_Fire_Origin_Alarm_Type',
        'Smoke_Alarm_Impact_on_Persons_Evacuating_Impact_on_Evacuation',
        'Smoke_Spread', 'Sprinkler_System_Operation',
        'Sprinkler_System_Presence', 'Status_of_Fire_On_Arrival'],
       dtype='object'),
 Index(['Civilian_Casualties', 'Count_of_Persons_Rescued',
        'Estimated_Dollar_Loss', 'Estimated_Number_Of_Persons_Displaced',
        'Number_of_responding_apparatus', 'Number_of_responding_personnel',
        'TFS_

# Categorical variables overview

In [9]:
cat_var = df.loc[:,df.dtypes == np.object].columns
n_cat = [df[x].unique().shape[0] for x in cat_var]
n_missing=[df[x].isnull().sum() for x in cat_var]

In [10]:
cat_var_summary=pd.DataFrame(data=zip(n_cat,n_missing),index=cat_var, columns=['number of categories','number of missing records']).sort_values(by=['number of categories'], ascending=False)

In [11]:
high_cat=cat_var_summary[cat_var_summary['number of categories']>=20]
med_cat=cat_var_summary[(cat_var_summary['number of categories']>5) & (cat_var_summary['number of categories']<20)]
low_cat=cat_var_summary[cat_var_summary['number of categories']<=5]

In [12]:
cat_var_id=['Incident_Numberinc_','FSA']
high_cat=[x for x in high_cat.index if x not in cat_var_id]

In [13]:
cat_var_summary

Unnamed: 0,number of categories,number of missing records
Incident_Numberinc_,17536,0
Property_Use,284,1
Initial_CAD_Event_Type,115,0
Ignition_Source,85,1913
Area_of_Origin,74,1913
Level_Of_Origin,60,6322
Material_First_Ignited,55,1913
Possible_Cause,25,1913
Extent_Of_Fire,13,6322
Smoke_Alarm_at_Fire_Origin_Alarm_Failure,12,6322


In [None]:
for c in high_cat:
  exploration_cat(c)

In [None]:
for c in high_cat:
  plot_cat(c)

In [None]:
med_cat=[x for x in med_cat.index if x not in cat_var_id]
for c in med_cat:
  exploration_cat(c)

### High Card: Level of Origin binning


* 001-003 low_rise_level
* 004-012 mid_rise_level
* 012-100 high_rise_level
* 996 roof
* B - below_grade_level
* rest including NaN - other



In [None]:
#Level_of_Origin: https://www.toronto.ca/ext/open_data/catalog/data_set_files/ofmcodes2009.pdf
LOO=df['Level_Of_Origin']

In [None]:
LOO_imputed=[None]*len(LOO)
for i in range(len(LOO)):
  try: 
    #LOO_imputed[i]=int(LOO[i])
    if int(LOO[i])<=3:
      LOO_imputed[i]='low_rise_level'
    elif int(LOO[i])<=12:
      LOO_imputed[i]='mid_rise_level'
    elif int(LOO[i])<=100:
      LOO_imputed[i]='high_rise_level'
    elif int(LOO[i])==996:
      LOO_imputed[i]='roof'
    else:
      LOO_imputed[i]='other'
  except:
    LOO_imputed[i]='below_grade_level'
print(LOO_imputed[0:50])

['other', 'other', 'other', 'other', 'other', 'other', 'low_rise_level', 'mid_rise_level', 'low_rise_level', 'other', 'other', 'low_rise_level', 'other', 'low_rise_level', 'other', 'other', 'low_rise_level', 'high_rise_level', 'low_rise_level', 'other', 'other', 'other', 'mid_rise_level', 'low_rise_level', 'other', 'other', 'low_rise_level', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'low_rise_level', 'other', 'other', 'other', 'other', 'other', 'other', 'low_rise_level', 'other', 'low_rise_level', 'mid_rise_level', 'other', 'other', 'mid_rise_level', 'other']


In [None]:
df['LOO_imputed']=LOO_imputed

### High Card: Possible Cause WOE


*   evaluate woe for each category
*   create bins based on woe



In [None]:
df_test=woe_encoder('Possible_Cause','LABEL')

In [None]:
df_test[['Possible_Cause','Possible_Cause_woe']].drop_duplicates().sort_values(by='Possible_Cause_woe')

Unnamed: 0,Possible_Cause,Possible_Cause_woe
13,"28 - Routine maintenance deficiency, eg creosote, lint, grease buildup",-1.421197
21,51 - Mechanical Failure,-1.358665
1,03 - Suspected Vandalism,-1.299777
106,49 - Improper Storage,-1.25793
947,73 - Natural Cause,-0.995566
6,20 - Design/Construction/Installation/Maintenance Deficiency,-0.823195
0,99 - Undetermined,-0.610312
58,01 - Suspected Arson,-0.546298
7,52 - Electrical Failure,-0.400887
48,45 - Improperly Discarded,-0.331906


In [None]:
df_test=woe_encoder('Initial_CAD_Event_Type','LABEL')

In [None]:
df_test[['Initial_CAD_Event_Type','Initial_CAD_Event_Type_woe']].drop_duplicates().sort_values(by='Initial_CAD_Event_Type_woe')

Unnamed: 0,Initial_CAD_Event_Type,Initial_CAD_Event_Type_woe
1,Fire - Grass/Rubbish,-3.526908
4834,VEFH,-2.105228
4837,FIG,-2.058633
4832,VEF,-1.882695
0,Vehicle Fire,-1.558351
41,Vehicle Fire - Highway,-1.552002
5019,FITP,-1.223144
4920,FIS,-1.168585
4933,FII,-1.159195
181,Fire - Other,-0.973093


## High Card: combined categorical variables


*   List item
*   List item



2) nlp: bow/n-gram/embedding to generate feature than use PCA to reduce

In [23]:
#create data frame to store high cardinality variables (X5), ID and label
df_c=df[['Incident_Numberinc_','Property_Use','Initial_CAD_Event_Type','Ignition_Source','Area_of_Origin','Material_First_Ignited','LABEL']]

In [24]:
df_c.isnull().sum()

Incident_Numberinc_       0   
Property_Use              1   
Initial_CAD_Event_Type    0   
Ignition_Source           1913
Area_of_Origin            1913
Material_First_Ignited    1913
LABEL                     0   
dtype: int64

In [25]:
for col in df_c.columns:
  df_c.at[df_c[df_c[col].isnull()].index,col]=' '

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [26]:
df_c.isnull().sum()

Incident_Numberinc_       0
Property_Use              0
Initial_CAD_Event_Type    0
Ignition_Source           0
Area_of_Origin            0
Material_First_Ignited    0
LABEL                     0
dtype: int64

In [None]:
#create a combined feature by joining the feature columns
df_c['combined_feature']=df_c.iloc[:,1]+' '+df_c.iloc[:,2]+' '+df_c.iloc[:,3]+' '+df_c.iloc[:,4]+' '+df_c.iloc[:,5]

In [None]:
# clean up/process the text

def clean_text(c):
  df=pd.DataFrame()
  df['text']=c
  # replace all non-printable characters (such as "\n") with space, and replace multiple spaces with single space
  df['ss_text']=df['text'].apply(lambda x: ' '.join(c for c in x.split()))
  #expand contraction
  df['contract_text']=df['ss_text'].apply(lambda x: ' '.join([contractions.fix(w) for w in x.split()]))
  #lower case
  df['lower_text']=df['contract_text'].apply(lambda x: ' '.join([w.lower() for w in x.split()]))
  #spelling correction
  df['spl_text'] = df['lower_text'].apply(lambda x: str(TextBlob(x).correct()))
  #lemmatization
  lmtzr=WordNetLemmatizer()
  df['lemm_text']=df['spl_text'].apply(lambda x: ' '.join([lmtzr.lemmatize(w) for w in x.split()]))
  #remove punctuation
  df['punct_text']=df['lemm_text'].str.replace('[^\w\s]',' ')
  #remove english stopwords
  STOPWORDS=set(stopwords.words('english'))
  df['stop_text']=df['punct_text'].apply(lambda x: ' '.join([w for w in x.split() if w not in STOPWORDS]))
  #remove all numbers
  df['text_cleaned']=df['stop_text'].apply(lambda x: ' '.join([w for w in x.split() if not w.isdigit()]))
  #return
  return df

In [None]:
#testing 
df_test=clean_text(df_c['combined_feature'][0:10000])

In [None]:
# check for word frequency
word_list=' '.join(df_test['text_cleaned'])
word_listing=str(word_list).split()

In [None]:
# define the rare category (bottom 1%)
lower_word_limit=df_test.shape[0]*0.01
# upper_word_limit=df_p.shape[0]*0.5

In [None]:
from collections import Counter
c=Counter(word_listing)
sorted_list=sorted(c.items(),key=lambda i:i[1],reverse=True)
top20=sorted_list[0:20]
top20
#top10_word=[w for (w,wc) in top10]

[('undetermined', 7521),
 ('fire', 5204),
 ('area', 4914),
 ('rubbish', 4097),
 ('unit', 4063),
 ('dwelling', 4003),
 ('formerly', 3384),
 ('vehicle', 3195),
 ('trash', 2351),
 ('grass', 2286),
 ('etc', 2276),
 ('eg', 2180),
 ('cooking', 2088),
 ('multi', 2044),
 ('garage', 2003),
 ('fir', 1989),
 ('electrical', 1977),
 ('detached', 1764),
 ('residential', 1688),
 ('kitchen', 1328)]

In [None]:
#create custom stopwords based on the output
custom_stopwords=['fire','etc','eg','incidents','use','fir','undetermined']

In [None]:
#remove the rare words and custom stopwords
sorted_clean=list(filter(lambda c: c[1]>lower_word_limit and c[0] not in custom_stopwords, sorted_list))

In [None]:
#save final list of words for the dataset
fire_word_list=[w for (w,wc) in sorted_clean]

In [None]:
#remove words that are NOT in the final list of words 
df_test['text_cleaned_final']=df_test['text_cleaned'].apply(lambda x: ' '.join([w for w in str(x).split() if w in fire_word_list]))

In [None]:
df_test[['text_cleaned_final','text_cleaned']].head(10)

Unnamed: 0,text_cleaned_final,text_cleaned
0,sidewalk street roadway highway vehicle undetermined engine area vehicle,sidewalk street roadway highway use fire incidents vehicle fire undetermined engine area vehicle
1,sidewalk street roadway highway grass rubbish undetermined trash rubbish area outside,sidewalk street roadway highway use fire incidents fire grass rubbish undetermined trash rubbish area outside
2,outdoor general auto parking highrise residential undetermined undetermined formerly,outdoor general auto parking fire highrise residential undetermined undetermined formerly
3,store commercial industrial undetermined trash rubbish area outside undetermined formerly,department store fire commercial industrial undetermined trash rubbish area outside undetermined formerly
4,lawn structure residential undetermined undetermined formerly,lawn around structure fire residential undetermined undetermined formerly
5,vehicle vehicle part vehicle undetermined engine area vehicle,vehicle vehicle part vehicle fire undetermined engine area vehicle
6,detached dwelling residential light sleeping area bedroom patient room dormitory induration,detached dwelling fire residential incandescent lamp light bulb spotlight sleeping area bedroom patient room dormitory etc induration
7,multi unit dwelling unit alarm highrise residential distribution equipment includes circuit mechanical electrical service room electrical wiring induration,multi unit dwelling unit alarm highrise residential distribution equipment includes panel boards fuses circuit br mechanical electrical service room electrical wiring induration
8,facility alarm commercial industrial equipment office paper cardboard,court facility alarm commercial industrial heating equipment office paper cardboard
9,automobile vehicle undetermined undetermined formerly,automobile vehicle fire undetermined undetermined formerly


In [None]:
#check if there's empty string as a result of text cleaning
#df_p[df_p['text_cleaned_final']==""].shape[0]

1303

Next step: Generate feature from cleaned text
Below codes are for BOWs

In [None]:
df_test['LABEL']=df_c['LABEL'][0:10000]

In [None]:
df_test['LABEL'].value_counts()

0    9590
1    410 
Name: LABEL, dtype: int64

In [None]:
# dominant category is 'no casulaty'
casualty=df_test[df_test['LABEL']==1]
no_casualty=df_test[df_test['LABEL']==0]

In [None]:
# downsampling
no_casualty = no_casualty.sample(casualty.shape[0], random_state=10)
no_casualty.shape, casualty.shape

((410, 11), (410, 11))

In [None]:
#re-balanced data
data=no_casualty.append(casualty,ignore_index=True)
data['LABEL'].value_counts()

1    410
0    410
Name: LABEL, dtype: int64

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data["text_cleaned_final"], data["LABEL"], test_size=0.2, random_state=10086, stratify=data["LABEL"])
print(X_train.shape)
print(X_test.shape)

(656,)
(164,)


In [None]:
# an alternative count vetorizer then use PCA to reduce?

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.base import TransformerMixin
class DenseTransformer(TransformerMixin):
    def fit(self, X, y=None, **fit_params):
        return self
    def transform(self, X, y=None, **fit_params):
        return X.todense()

1) bag-of-words: extract top words with highest coefficients

In [None]:
pipeline_bow = Pipeline([('vectorizer', CountVectorizer(max_features=1000)), ('dtf', DenseTransformer()),('scaler', StandardScaler()), ('LR', LogisticRegression(random_state=2020))])
pipeline_bow.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', CountVectorizer(max_features=1000)),
                ('dtf', <__main__.DenseTransformer object at 0x7fbc4cd60f90>),
                ('scaler', StandardScaler()),
                ('LR', LogisticRegression(random_state=2020))])

In [None]:
#accuracy
pipeline_bow.score(X_test,y_test)

0.6585365853658537

In [None]:
## f1 score
f1_score(y_test,pipeline_bow.predict(X_test))

0.654320987654321

In [None]:
## roc auc score
roc_auc_score(y_test, pipeline_bow.predict_proba(X_test)[:,1])

0.7153480071386078

In [None]:
#coefficients (coef_) of your logisitic regression model, which features are most positively/negatively correlated with toxic comments?
feature_names=pipeline_bow.named_steps['vectorizer'].get_feature_names()

In [None]:
len(feature_names)

184

In [None]:
feature_coeff=pipeline_bow.named_steps['LR'].coef_[0]
feature_df=pd.DataFrame()
feature_df['feature']=feature_names
feature_df['coeff']=feature_coeff
feature_df['abs']=np.abs(feature_coeff)
feature_df.sort_values(by='abs',ascending=False, inplace=True)

In [None]:
feature_df.head(50)

Unnamed: 0,feature,coeff,abs
151,soft,-1.408494,1.408494
172,undetermined,-0.98139,0.98139
8,attached,0.908297,0.908297
137,restaurant,0.877608,0.877608
87,lawn,-0.807004,0.807004
141,room,0.780192,0.780192
160,system,0.758446,0.758446
111,open,-0.756077,0.756077
49,dwyer,-0.750875,0.750875
42,cowhouse,-0.732141,0.732141


In [None]:
bow_feature=feature_df['feature'][0:50].to_list()

INCOMPLETE BELOW THIS LINE

In [None]:
bow_feature_names=[str(x)+'_encoded' for x in bow_feature]

In [None]:
#create features using bow for property_use
['combined_feature']=['text_cleaned_final'].apply(lambda x: ' '.join([w for w in str(x).split() if w in bow_feature]))

In [None]:
pipeline_feature_vector = Pipeline([('vectorizer', CountVectorizer(max_features=1000)), ('dtf', DenseTransformer())])
feature_from_Property_use=pipeline_feature_vector.fit_transform(df_p['property_use'], df_p['LABEL']) #stored as a matrix

2) word2vec 