In [191]:
import os
import pandas as pd
import numpy as np
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
from gensim.models.phrases import Phrases, Phraser
import pprint
import csv
from nltk.corpus import words
from difflib import SequenceMatcher


Read the names of the target and acquiror companies from the deal data.

In [1]:
def find(name, path):
    """Return the path of the first file called `name` found below `path`.

    The directory tree is traversed with os.walk; the first directory whose
    file listing contains `name` wins. Returns None when no directory
    contains such a file.
    """
    matches = (
        os.path.join(folder, name)
        for folder, _subdirs, filenames in os.walk(path)
        if name in filenames
    )
    return next(matches, None)

In [202]:
# Locate the deal workbook by searching the C: drive, then read the company names.
path = find('dealdata.xlsx', 'C:/')
if path is None:
    # find() returns None when nothing matches; fail here with a clear message
    # instead of letting pd.read_excel choke on a None path.
    raise FileNotFoundError("dealdata.xlsx was not found under C:/")
data = pd.read_excel(path)
targets = data['Target Name'].tolist()
acquirors = data['Acquiror Name'].tolist()

Make the target and acquiror names lowercase.

In [203]:
# Normalise both name lists to lowercase.
# NOTE(review): assumes every entry is a string; a blank cell in the workbook
# (read as NaN) would raise here -- confirm the columns have no missing names.
targets = list(map(lambda target_name: target_name.lower(), targets))
acquirors = list(map(lambda acquiror_name: acquiror_name.lower(), acquirors))

Words in the names of targets and acquirors that are not English vocabulary words are collected in a list of extra words.

In [204]:
# Collect every token of a target or acquiror name that is not in the NLTK
# English word list.
# words.words() builds the full word list on each call, so it is loaded once
# into a set here instead of being rebuilt and scanned linearly per token.
english_vocabulary = set(words.words())

# One pass over both name lists; each token is lowercased before the lookup
# (the original lowercased before the check for targets only).
extra_words = list({
    token.lower()
    for company_name in targets + acquirors
    for token in company_name.split()
    if token.lower() not in english_vocabulary
})

Check the extra words to verify that these are words that can be added as stop words. Otherwise add them to a wrong words list. Also add word parts to the extra words if they were not split properly.

In [208]:
# Hand-curated tokens that must NOT become stop words even though they were not
# found in the English word list (plurals, place names, industry terms, ...).
# They are removed from extra_words in the next cell.
wrong_words=['activities','advisors','america','american','americas','asia','asia-pacific','associates',
             'assurances','australian','austria','baltic','baptist','beaches','benefits','benelux','brasil',
             'brokers','builders','cambridge','centre','clients','clinique','concepts','contacts','cooperative',
             'coordinators','customcare','database','diagnostics',"doctors'",'doctors','eldercare','focused',
             'fond','foods','gates','georgia','hawaii','healthcare','impuls','info','initiatives','integrated',
             'investments','labs','london','madrid','managed','manufacturing','med','medicaid','mega',
             'mid-atlantic','midwest','multiplan','munich','musculoskeletal','networks','ohio','online',
             'operations','options','orleans','orthodontists','partners','peoples','physician','physicians',
             'planning','plans','points','portugal','professionals','protocols','providers','purchasing',
             'resources','scandinavian','schleswig-holstein','scripts','services','software','solutions',
             'strategies','systems','technologies','uk', 'underwriters','workers','zuricht','holdings','everyone',
            'africa','texas','russian','nyc','singapore','california','missouri','traders','claims','detroit',
             'bermuda','irish','european','argentina','investors','europe','hospitalist','acquiror','spain',
            'pro-claim','luxembourg','ventures','harvard','australia','communications']
             

Keep only the extra words that are neither in the wrong words list nor already in the stop word list.

In [209]:
# `stopwords` is not created by any earlier visible cell, so a fresh
# Restart & Run All fails here with a NameError. Seed it from gensim's STOPWORDS
# (imported at the top) unless a list with that name already exists.
# NOTE(review): confirm gensim's list is the intended base stop word list.
if not isinstance(globals().get('stopwords'), list):
    stopwords = list(STOPWORDS)

# Drop candidates that were rejected by hand or are already stop words.
# A single set lookup replaces rescanning two lists for every candidate.
excluded = set(wrong_words).union(stopwords)
extra_words = [i for i in extra_words if i not in excluded]

Add days of the week and months as stop words.

In [210]:
# Calendar terms and single letters carry no topical meaning, so they are
# appended to the stop word list, which is then de-duplicated.
weekday_terms = [
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'mon', 'mon.', 'tu.', 'tu', 'tue', 'tue.', 'tues', 'tues.',
    'wed', 'wed.', 'th', 'th.', 'thu', 'thu.', 'thur', 'thur.', 'thurs', 'thurs.',
    'fri', 'fri.', 'sat', 'sat.', 'sun', 'sun.',
]
month_terms = [
    'january', 'jan', 'jan.', 'february', 'feb', 'feb.', 'march', 'mar', 'mar.',
    'april', 'apr', 'apr.', 'may', 'june', 'july', 'august', 'aug', 'aug.',
    'september', 'sept', 'sept.', 'october', 'oct', 'oct.',
    'november', 'nov', 'nov.', 'december', 'dec', 'dec.',
    'juni', 'juli',
]
single_letters = list('abcdefghijklmnopqrstuvwxyz')

for term_group in (weekday_terms, month_terms, single_letters):
    stopwords.extend(term_group)
stopwords = list(set(stopwords))

Add the extra words to the stop word list. The next cell then keeps only entries that are strings and not decimal numbers, and converts everything to lowercase.

In [211]:
# Append the company-name tokens that survived the manual review.
stopwords += extra_words

Only keep strings that are not numbers and convert the strings to lowercase.

In [212]:
# Single pass: keep plain strings that are not purely decimal digits,
# lowercase them, and let the set drop duplicates.
stopwords = list({
    term.lower()
    for term in stopwords
    if type(term) == str and not term.isdecimal()
})

In [214]:
# Wrap the list in a one-column DataFrame so it can be written to disk.
# Note: from here on `stopwords` is a DataFrame, no longer a list.
column_names = ['words']
stopwords = pd.DataFrame(stopwords, columns=column_names)

Save the stopwords to an Excel sheet.

In [215]:
# to_excel chooses its writer engine from the file extension and has none for
# '.csv', so the original file name raised a ValueError. Write a real workbook.
# NOTE(review): if a downstream step expects a CSV file, use
# stopwords.to_csv('customstopwords.csv') instead.
stopwords.to_excel('customstopwords.xlsx')