# Text Mining The Violation Description Column of the NYC Restaurants Dataset

## Import necessary libraries

In [178]:
import pandas as pd
import numpy as np
import nltk
import os

## Import NYC Restaurants dataset

In [179]:
# Read the restaurant inspection CSV into a DataFrame. The path is relative,
# so the file must sit in the notebook's working directory.
manhattan_restaurants = pd.read_csv(
    filepath_or_buffer='Manhattan_NYC_restaurants.csv',
)

## Select rows flagged as critical

In [335]:
# Keep only the rows whose critical_flag equals 'Critical', then preview the
# first five of them.
criticals_only = manhattan_restaurants.loc[
    manhattan_restaurants['critical_flag'].eq('Critical')
]
criticals_only.head()

Unnamed: 0,camis,dba,boro,street,zipcode,cuisine_description,inspection_date,action,violation_code,violation_description,critical_flag,score,grade,grade_date,record_date,inspection_type,latitude,longitude,community_board,council_district
0,30191841,DJ REYNOLDS PUB AND RESTAURANT,Manhattan,WEST 57 STREET,10019,Irish,2019-06-06,Violations were cited in the following area(s).,06C,Food not protected from potential source of co...,Critical,8,A,2019-06-06,2022-04-03,Cycle Inspection / Initial Inspection,40.767326,-73.98431,104,3
2,40362264,P & S DELI GROCERY,Manhattan,COLUMBUS AVENUE,10025,American,2019-05-13,Violations were cited in the following area(s).,04H,"Raw, cooked or prepared food is adulterated, c...",Critical,9,A,2019-05-13,2022-04-03,Cycle Inspection / Initial Inspection,40.792621,-73.96771,107,6
3,40362274,ANGELIKA FILM CENTER,Manhattan,WEST HOUSTON STREET,10012,American,2022-02-14,Violations were cited in the following area(s).,06D,"Food contact surface not properly washed, rins...",Critical,7,A,2022-02-14,2022-04-03,Cycle Inspection / Initial Inspection,40.725744,-73.997478,102,1
4,40363298,CAFE METRO,Manhattan,8 AVENUE,10018,American,2019-05-10,Violations were cited in the following area(s).,02B,Hot food item not held at or above 140º F.,Critical,7,A,2019-05-10,2022-04-03,Cycle Inspection / Initial Inspection,40.756185,-73.990565,104,3
8,40364347,METROPOLITAN CLUB,Manhattan,EAST 60 STREET,10022,American,2019-05-30,Violations were cited in the following area(s).,04M,Live roaches present in facility's food and/or...,Critical,13,A,2019-05-30,2022-04-03,Cycle Inspection / Initial Inspection,40.764796,-73.972308,108,4


## Create a series of the violation descriptions for rows flagged as critical

In [186]:
# Narrow the critical rows down to the single text column of interest.
criticals_tbl = criticals_only.reindex(columns=['violation_description'])

# Renumber the rows 0..n-1 so the descriptions can be addressed by position;
# the old row labels are discarded.
reset_col = criticals_tbl.reset_index(drop=True)

# Pull the column out as a Series and display it.
violation_desc = reset_col.loc[:, 'violation_description']
violation_desc

0       Food not protected from potential source of co...
1       Raw, cooked or prepared food is adulterated, c...
2       Food contact surface not properly washed, rins...
3              Hot food item not held at or above 140º F.
4       Live roaches present in facility's food and/or...
                              ...                        
3086    Food not protected from potential source of co...
3087    Cold food item held above 41º F (smoked fish a...
3088    Food not protected from potential source of co...
3089    No facilities available to wash, rinse and san...
3090           Hot food item not held at or above 140º F.
Name: violation_description, Length: 3091, dtype: object

## Split sentences into single words

In [188]:
# Split every violation description on single spaces, producing one list of
# tokens per description. Iterating over the Series itself (instead of a
# hard-coded range(3091)) keeps this cell correct when the number of critical
# rows changes.
lsts_of_strings = [description.split(" ") for description in violation_desc]

# Preview only the first few token lists rather than dumping all of them.
lsts_of_strings[:5]

[['Food',
  'not',
  'protected',
  'from',
  'potential',
  'source',
  'of',
  'contamination',
  'during',
  'storage,',
  'preparation,',
  'transportation,',
  'display',
  'or',
  'service.'],
 ['Raw,',
  'cooked',
  'or',
  'prepared',
  'food',
  'is',
  'adulterated,',
  'contaminated,',
  'cross-contaminated,',
  'or',
  'not',
  'discarded',
  'in',
  'accordance',
  'with',
  'HACCP',
  'plan.'],
 ['Food',
  'contact',
  'surface',
  'not',
  'properly',
  'washed,',
  'rinsed',
  'and',
  'sanitized',
  'after',
  'each',
  'use',
  'and',
  'following',
  'any',
  'activity',
  'when',
  'contamination',
  'may',
  'have',
  'occurred.'],
 ['Hot', 'food', 'item', 'not', 'held', 'at', 'or', 'above', '140º', 'F.'],
 ['Live',
  'roaches',
  'present',
  'in',
  "facility's",
  'food',
  'and/or',
  'non-food',
  'areas.'],
 ['Food',
  'not',
  'cooled',
  'by',
  'an',
  'approved',
  'method',
  'whereby',
  'the',
  'internal',
  'product',
  'temperature',
  'is',
  'redu

## Remove brackets and quote characters from the nested list's string representation

In [189]:
# Flatten the nested token lists into one comma-separated string: take the
# list's repr and delete the list brackets and both kinds of quote character.
# Note that this also deletes any apostrophes inside words.
new_lst = repr(lsts_of_strings)

# A single translate() pass removes all four characters at once.
# Caveat: a comma that was part of a token (e.g. 'storage,') cannot be told
# apart from the repr's own ", " separators, so splitting this string on ","
# later yields empty strings and entries with a leading space.
remove_apost = new_lst.translate(str.maketrans('', '', '[]\'"'))


## Convert string to list

In [337]:
# Cut the flattened string at every comma; each piece becomes one list entry.
li = remove_apost.split(",")
li

['Food',
 ' not',
 ' protected',
 ' from',
 ' potential',
 ' source',
 ' of',
 ' contamination',
 ' during',
 ' storage',
 '',
 ' preparation',
 '',
 ' transportation',
 '',
 ' display',
 ' or',
 ' service.',
 ' Raw',
 '',
 ' cooked',
 ' or',
 ' prepared',
 ' food',
 ' is',
 ' adulterated',
 '',
 ' contaminated',
 '',
 ' cross-contaminated',
 '',
 ' or',
 ' not',
 ' discarded',
 ' in',
 ' accordance',
 ' with',
 ' HACCP',
 ' plan.',
 ' Food',
 ' contact',
 ' surface',
 ' not',
 ' properly',
 ' washed',
 '',
 ' rinsed',
 ' and',
 ' sanitized',
 ' after',
 ' each',
 ' use',
 ' and',
 ' following',
 ' any',
 ' activity',
 ' when',
 ' contamination',
 ' may',
 ' have',
 ' occurred.',
 ' Hot',
 ' food',
 ' item',
 ' not',
 ' held',
 ' at',
 ' or',
 ' above',
 ' 140º',
 ' F.',
 ' Live',
 ' roaches',
 ' present',
 ' in',
 ' facilitys',
 ' food',
 ' and/or',
 ' non-food',
 ' areas.',
 ' Food',
 ' not',
 ' cooled',
 ' by',
 ' an',
 ' approved',
 ' method',
 ' whereby',
 ' the',
 ' internal',
 '

## Create dataframe

In [191]:
# Wrap the flat list of pieces in a one-column DataFrame named 'Words'.
words_tbl = pd.DataFrame({'Words': li})
words_tbl

Unnamed: 0,Words
0,Food
1,not
2,protected
3,from
4,potential
...,...
62550,at
62551,or
62552,above
62553,140º


## Convert the words to lowercase

In [294]:
# Normalise case so that e.g. 'Food' and 'food' are spelled the same way.
words_tbl.loc[:, 'Words'] = words_tbl['Words'].str.lower()

## Strip leading and trailing periods from the Words column

In [295]:
# Trim periods from both ends of every entry. Only '.' is stripped here;
# surrounding whitespace and other punctuation are left untouched.
words_tbl.loc[:, 'Words'] = words_tbl['Words'].str.strip(to_strip='.')

## Find the counts of each word and add a column for counts

In [296]:
# Count how many times each distinct entry occurs; the result has one row per
# entry with its frequency in a 'counts' column.
word_count = (
    words_tbl
    .groupby('Words')
    .size()
    .reset_index(name='counts')
)

# Order from most to least frequent and preview the 30 most common entries.
top_words = word_count.sort_values(by='counts', ascending=False)
top_words.head(30)

Unnamed: 0,Words,counts
0,,3889
102,food,3132
100,flies,3096
170,or,2367
24,and,2144
160,not,2058
130,in,1103
166,of,1000
60,contamination,973
91,f,891


## Drop words that lack meaning

In [297]:
# Remove the empty-string entry by value rather than by row label.
# Dropping label 0 only works while '' happens to be the first group; if no
# blank entry existed, label 0 would belong to a real word and that word
# would be deleted instead.
drop_blank = top_words[top_words['Words'] != '']

In [298]:
# Discard entries that occur exactly once.
drop_ones = drop_blank.loc[drop_blank['counts'].ne(1)]

In [301]:
# Remove a hand-picked set of rows, identified by their row labels in the
# word-count table (for instance, 24 and 60 are 'and' and 'contamination' in
# the top-words preview).
# NOTE(review): these labels are only valid for the exact dataset they were
# chosen from; re-check them if the input file or tokenisation changes.
drop_few_rows = drop_ones.drop(
    index=[
        200, 196, 49, 111, 60, 137, 20, 177, 173, 24, 167, 79,
        25, 92, 73, 1, 56, 64, 112, 132, 165, 180, 195,
    ]
)

In [303]:
# Discard entries that occur exactly twice.
drop_twos = drop_few_rows.loc[drop_few_rows['counts'].ne(2)]

In [310]:
# Remove a second hand-picked batch of rows by row label (for instance, 170,
# 160, 130, 166 and 91 are 'or', 'not', 'in', 'of' and 'f' in the top-words
# preview).
# NOTE(review): the labels are tied to the exact dataset they were chosen from.
drop_more_rows = drop_twos.drop(
    index=[
        276, 156, 105, 123, 29, 99, 80, 21, 259,
        34, 178, 170, 160, 130, 166, 106, 270, 91,
    ]
)

## Display all rows to identify the remaining unnecessary words to delete

In [331]:
# Remove a final hand-picked batch of rows by row label. Each label is listed
# once (138 previously appeared twice).
# NOTE(review): the labels are tied to the exact dataset they were chosen from.
drop_more_rows_1 = drop_more_rows.drop(
    index=[
        8, 138, 249, 118, 149, 158, 234, 243, 277, 117, 38, 12,
        192, 4, 76, 271, 245, 7, 9, 23, 45, 2, 280, 97,
    ]
)

# Show every remaining row for manual inspection. option_context lifts the
# row limit for this print only, so the display settings of later cells are
# not changed.
with pd.option_context('display.max_rows', None):
    print(drop_more_rows_1)

                              Words  counts
102                            food    3132
100                           flies    3096
14                            above     870
185                     preparation     822
188                         present     688
122                            held     688
159                        non-food     688
33                            areas     688
153                            mice     682
217                       sanitized     648
260                             use     618
139                            item     576
55                          contact     574
237                         surface     531
267                          washed     531
101                       following     530
17                         activity     530
211                          rinsed     530
103   food/refuse/sewage-associated     516
127                           house     516
96                            filth     516
133                         incl

## Rename dataframe

In [333]:
# Give the fully cleaned table its final name. It is taken from
# drop_more_rows_1, the result of the last drop step, so that the final round
# of removals is not discarded.
words_and_counts = drop_more_rows_1
words_and_counts

Unnamed: 0,Words,counts
102,food,3132
100,flies,3096
14,above,870
185,preparation,822
188,present,688
...,...,...
238,tagged/labeled;,4
68,days,4
121,heated,4
209,retained,4


## Save finalized dataframe as a csv file to directory

In [334]:
# NOTE(review): the export below is commented out, so running this cell writes
# nothing. Uncomment it to save words_and_counts to 'words_and_counts.csv' in
# the working directory (the row labels are written as the first column
# unless index=False is passed).
#words_and_counts.to_csv('words_and_counts.csv')