In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
from num2words import num2words

import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import re
import math

In [2]:
import ast

## Preprocessing

In [3]:
def convert_lower_case(data):
    return np.char.lower(data)

def remove_stop_words(data):
    stop_words = stopwords.words('english')
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if w not in stop_words and len(w) > 1:
            new_text = new_text + " " + w
    return new_text

def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

def remove_apostrophe(data):
    return np.char.replace(data, "'", "")

def stemming(data):
    stemmer= PorterStemmer()
    
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
    return new_text

def convert_numbers(data):
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        try:
            w = num2words(int(w))
        except:
            a = 0
        new_text = new_text + " " + w
    new_text = np.char.replace(new_text, "-", " ")
    return new_text

def preprocess(data):
    data = convert_lower_case(data)
    data = remove_punctuation(data) #remove comma seperately
    data = remove_apostrophe(data)
    data = remove_stop_words(data)
    data = convert_numbers(data)
    # data = stemming(data)
    data = remove_punctuation(data)
    data = convert_numbers(data)
    # data = stemming(data) #needed again as we need to stem the words
    data = remove_punctuation(data) #needed again as num2word is giving few hypens and commas fourty-one
    data = remove_stop_words(data) #needed again as num2word is giving stop words 101 - one hundred and one
    return data

## Creating documents out of the neighborhood-separated articles

In [4]:
processed_text = []

In [5]:
df = pd.read_csv('../Entity_Recognition/Neighborhood_Separated_Articles/2018.csv')

In [6]:
df = df.drop(['Unnamed: 0'], axis=1)
df.head()

Unnamed: 0,dorchester,roxbury,mattapan,hyde_park,fenway,beacon_hill,downtown,south_boston,east_boston,back_bay,...,charlestown,brighton,allston,west_end,roslindale,north_end,mission_hill,harbor_islands,longwood_medical_area,west_roxbury
0,('With cyclists agitating for more safety meas...,('Born a decade apart Chuck Berry and Mary Ty...,('The that injured 17 passengers on the MBTA’...,('A spout of fire that burst from a gas main o...,('“Of course — Boston ” That was after Adam J...,('n Do it Blow up the zoning laws that choke...,('n nThe idea that bigger is better in health ...,('Martin J Walsh embarked on his second term ...,('Good morning everyone Thank you to Presiden...,('The word “mixed” in mixed income housing sho...,...,('nMonday n nOpen at owners’ discretion n nO...,('Boston police gang unit officers nabbed a 35...,('A five alarm fire heavily damaged a three de...,('The father and brother of a beloved youth me...,('And now the Hub fights back Boston will li...,('Deborah Goldberg settles into a chair by the...,('A woman who stopped at a gas station to put ...,('The Columbia Point cut through choppy seas o...,('n of the US bid for the 2024 Olympic Games w...,('The Boston School Committee on Wednesday una...
1,('For the region’s firefighters this weekend ...,('Late Halloween night 16 year old Gerrod Bro...,('Massachusetts is bracing for a fast and furi...,('A spout of fire that burst from a gas main o...,('“Of course — Boston ” That was after Adam J...,('At a breakfast celebrating Martin Luther Kin...,('A black maid is chased and slapped on her re...,('Martin J Walsh embarked on his second term ...,('Good morning everyone Thank you to Presiden...,('The double shooting in Back Bay Saturday nig...,...,('nMonday n nOpen nOpen nOpen nOpen nOpen ...,('MUSIC n Harmony Tividad and Cleo Tucker’s se...,('A five alarm fire heavily damaged a three de...,('The smiling young woman took center stage h...,('No dogs are running at Wonderland Greyhound ...,('A shriveled up Christmas tree long stripped ...,('Here’s something to look forward to “SUMMER...,('The Columbia Point cut through choppy seas o...,('n of the US bid for the 2024 Olympic Games w...,('The Boston School Committee on Wednesday una...
2,('On a bitter cold evening a few days after Ma...,('Born a decade apart Chuck Berry and Mary Ty...,('Already off to a rough start to the winter ...,('A spout of fire that burst from a gas main o...,('No “Black Panther” isn’t the greatest movie...,('At a breakfast celebrating Martin Luther Kin...,('A black maid is chased and slapped on her re...,('Martin J Walsh embarked on his second term ...,('Good morning everyone Thank you to Presiden...,('The double shooting in Back Bay Saturday nig...,...,('nMonday n nOpen nOpen nOpen nOpen nOpen ...,('MUSIC n Harmony Tividad and Cleo Tucker’s se...,('Dorchester residents who watched an inferno ...,('The smiling young woman took center stage h...,('No dogs are running at Wonderland Greyhound ...,('As he zigzags mustard onto a hot dog actor ...,('A Boston man who turned away from gang life ...,('The Columbia Point cut through choppy seas o...,"('no article', 'no_id')",('The Boston School Committee on Wednesday una...
3,('n xa0A two alarm fire damaged an apartment b...,('Born a decade apart Chuck Berry and Mary Ty...,('Heavy snow predicted for eastern Massachuset...,('A spout of fire that burst from a gas main o...,('No “Black Panther” isn’t the greatest movie...,('n Do it Blow up the zoning laws that choke...,('FRAMINGHAM — The words carved on the shingle...,('Martin J Walsh embarked on his second term ...,('As he embarked on his second term Mayor Mar...,('In a brazen crime two masked men robbed a r...,...,('A venerable program for busing city children...,('If at first you don’t succeed try try agai...,('Dorchester residents who watched an inferno ...,('The father and brother of a beloved youth me...,('No dogs are running at Wonderland Greyhound ...,('One of the first responders to the Boston Ma...,('nThe Massachusetts State Police trooper who ...,('The Columbia Point cut through choppy seas o...,"('no article', 'no_id')",('n n n read at 7 p m at Harvard Book Stor...
4,('Red Line passengers are facing “severe” dela...,('Born a decade apart Chuck Berry and Mary Ty...,('Already off to a rough start to the winter ...,('A spout of fire that burst from a gas main o...,('“Of course — Boston ” That was after Adam J...,('At a breakfast celebrating Martin Luther Kin...,('One of the most furious winter storms ever m...,('Boston police arrested two men and recovered...,('Good morning everyone Thank you to Presiden...,('In a brazen crime two masked men robbed a r...,...,('A venerable program for busing city children...,('If at first you don’t succeed try try agai...,('Avi Shemtov opened the Chubby Chickpea in Ca...,('He wore a blue pinstripe suit to court on Mo...,('No dogs are running at Wonderland Greyhound ...,('The brass bell tolled as the name of each of...,('The Massachusetts State Police trooper who f...,('The Columbia Point cut through choppy seas o...,"('no article', 'no_id')",('The legislator who authored the Massachusett...


In [7]:
df = df.fillna("('no article', 'no_id')")
df['dorchester'] = df['dorchester'].apply(ast.literal_eval)
df['roxbury'] = df['roxbury'].apply(ast.literal_eval)
df['mattapan'] = df['mattapan'].apply(ast.literal_eval)
df['hyde_park'] = df['hyde_park'].apply(ast.literal_eval)
df['fenway'] = df['fenway'].apply(ast.literal_eval)
df['beacon_hill'] = df['beacon_hill'].apply(ast.literal_eval)
df['downtown'] = df['downtown'].apply(ast.literal_eval)
df['south_boston'] = df['south_boston'].apply(ast.literal_eval)
df['east_boston'] = df['east_boston'].apply(ast.literal_eval)
df['back_bay'] = df['back_bay'].apply(ast.literal_eval)
df['jamaica_plain'] = df['jamaica_plain'].apply(ast.literal_eval)
df['south_end'] = df['south_end'].apply(ast.literal_eval)
df['charlestown'] = df['charlestown'].apply(ast.literal_eval)
df['brighton'] = df['brighton'].apply(ast.literal_eval)
df['allston'] = df['allston'].apply(ast.literal_eval)
df['west_end'] = df['west_end'].apply(ast.literal_eval)
df['roslindale'] = df['roslindale'].apply(ast.literal_eval)
df['north_end'] = df['north_end'].apply(ast.literal_eval)
df['mission_hill'] = df['mission_hill'].apply(ast.literal_eval)
df['harbor_islands'] = df['harbor_islands'].apply(ast.literal_eval)
df['longwood_medical_area'] = df['longwood_medical_area'].apply(ast.literal_eval)
df['west_roxbury'] = df['west_roxbury'].apply(ast.literal_eval)

In [8]:
documents = {'hyde_park': [], 'beacon_hill': [], 'south_boston': [], 'jamaica_plain': [], 'east_boston': [],
                'south_end': [], 'back_bay': [], 'north_end': [], 'west_roxbury': [], 'mission_hill': [],
                'harbor_islands': [], 'west_end': [], 'longwood_medical_area': [],
                'dorchester': [], 'roxbury': [], 'downtown': [], 'fenway': [], 'mattapan': [], 'brighton': [],
                'charlestown': [], 'roslindale': [], 'allston': []}

for col in df.columns:
    tokens = []
    for i in range(df.shape[0]):
        article, _ = df.loc[i][col]
        if article != 'no article':
            text = word_tokenize(preprocess(article))
            tokens = tokens + text
    documents[col] = tokens
    print(col + ' DONE')

dorchester DONE
roxbury DONE
mattapan DONE
hyde_park DONE
fenway DONE
beacon_hill DONE
downtown DONE
south_boston DONE
east_boston DONE
back_bay DONE
jamaica_plain DONE
south_end DONE
charlestown DONE
brighton DONE
allston DONE
west_end DONE
roslindale DONE
north_end DONE
mission_hill DONE
harbor_islands DONE
longwood_medical_area DONE
west_roxbury DONE


In [9]:
processed_text = []
for key in documents:
    processed_text.append(documents[key])

In [10]:
processed_text

[['spout',
  'fire',
  'burst',
  'gas',
  'main',
  'hyde',
  'park',
  'avenue',
  'roslindale',
  'sunday',
  'evening',
  'expected',
  'burn',
  'monday',
  'authorities',
  'said',
  'fire',
  'ten',
  'inch',
  'gas',
  'main',
  'began',
  'shortly',
  'near',
  'three',
  'hundred',
  'forty',
  'hyde',
  'park',
  'ave',
  'mile',
  'south',
  'forest',
  'hills',
  'mbta',
  'station',
  'fire',
  'crews',
  'tried',
  'use',
  'sand',
  'smother',
  'flames',
  'steve',
  'macdonald',
  'spokesman',
  'boston',
  'fire',
  'department',
  'said',
  'national',
  'grid',
  'plan',
  'shut',
  'gas',
  'sunday',
  'night',
  'extreme',
  'cold',
  'meaning',
  'fire',
  'likely',
  'continue',
  'burn',
  'try',
  'shut',
  'gas',
  'affect',
  'hundreds',
  'households',
  'thousands',
  'macdonald',
  'said',
  'temperatures',
  'expected',
  'drop',
  'zero',
  'overnight',
  'instead',
  'shutting',
  'gas',
  'national',
  'grid',
  'plans',
  'install',
  'temporary',
 

In [11]:
DF = {}

# keep track of how many neighborhoods' documents discuss a given token
for i in range(len(df.columns)):
    tokens = processed_text[i]
    for w in tokens:
        try:
            DF[w].add(i)
        except:
            DF[w] = {i}

for i in DF:
    DF[i] = len(DF[i])

In [12]:
total_vocab_size = len(DF)
total_vocab_size

43732

In [13]:
# get the number of documents in which this word occurs
def doc_freq(word):
    c = 0
    try:
        c = DF[word]
    except:
        pass
    return c

In [14]:
doc = 0

tf_idf = {}

for i in range(len(df.columns)):
    
    # get all the tokenized text for a given neighborhood
    tokens = processed_text[i]
    
    # count the number of times each token occurs in the text for a given neighborhood
    counter = Counter(tokens)
    
    # get the total number of terms for a document (given neighborhood)
    words_count = len(tokens)
    
    for token in np.unique(tokens):
        
        # compute term frequency
        tf = counter[token] / words_count

        # compute inverse document frequency
        dfr = doc_freq(token)
        idf = np.log((len(df.columns) + 1) / (dfr + 1))
        
        # compute tf-idf score
        tf_idf[doc, token] = tf * idf

    doc += 1

In [15]:
tf_idf

{(0, '100th'): 1.8075881593071357e-05,
 (0, '107th'): 2.89291920091111e-05,
 (0, '10th'): 3.082440348906838e-05,
 (0, '11th'): 9.865669208588734e-06,
 (0, '12th'): 3.752630172043756e-05,
 (0, '13milford'): 4.3393788013666645e-05,
 (0, '14th'): 2.3874469885124573e-05,
 (0, '1800s'): 1.2508767240145854e-05,
 (0, '1870s'): 3.6151763186142714e-05,
 (0, '1900s'): 1.4090424244878134e-05,
 (0, '1940s'): 1.4090424244878134e-05,
 (0, '1950s'): 8.73673607498702e-06,
 (0, '1960s'): 4.298554855663234e-06,
 (0, '1980s'): 1.7420607026329992e-05,
 (0, '1990s'): 2.57913291339794e-05,
 (0, '19th'): 1.1559151308400641e-05,
 (0, '1a'): 6.326558557574974e-05,
 (0, '1wilmington'): 1.446459600455555e-05,
 (0, '1½'): 1.5916313256749718e-05,
 (0, '209a'): 2.89291920091111e-05,
 (0, '20s'): 3.223916141747425e-05,
 (0, '20th'): 1.0746387139158083e-05,
 (0, '21st'): 1.541220174453419e-05,
 (0, '22d'): 1.8075881593071357e-05,
 (0, '23d'): 2.0718979624628477e-05,
 (0, '28th'): 2.0718979624628477e-05,
 (0, '2day'):

In [16]:
subs = df.columns

In [17]:
tf_idf_scores = {'dorchester': [], 'roxbury': [], 'mattapan': [], 'hyde_park': [], 'fenway': [],
       'beacon_hill': [], 'downtown': [], 'south_boston': [], 'east_boston': [], 'back_bay': [],
       'jamaica_plain': [], 'south_end': [], 'charlestown': [], 'brighton': [], 'allston': [],
       'west_end': [], 'roslindale': [], 'north_end': [], 'mission_hill': [], 'harbor_islands': [],
       'longwood_medical_area': [], 'west_roxbury': []}

In [18]:
for key in tf_idf:
    sub_ind, term = key
    tf_idf_scores[subs[sub_ind]].append((term, tf_idf[key]))

In [19]:
for col in subs:
    tf_idf_scores[col].sort(reverse=True)
    temp = pd.DataFrame(tf_idf_scores[col], columns=['term', 'weight'])
    temp.to_csv('Yearly_TFIDF_Scores_by_Subneighborhood/2018/TFIDF_' + col + '.csv')