# Association Rules
## Submitted By: Anubhav Gupta
## Date: 03/02/2018

In [17]:
import nltk 
import pandas as pd
import glob
import re
import pickle
import time
import datetime

print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))

def read_document_directory(dir_path):
    files=glob.glob(dir_path)
    file_rows=[]
    for file in files:
        file_rows.extend(read_single_text_file(file))
    return file_rows

def read_single_text_file(file_path):
    file=open(file_path, 'r', encoding="latin-1")
    rows = file.readlines()
    for row in rows:
        row = re.sub(r'[^\x00-\x7f]',r'', row) #Remove non-ascii chararacters
    file.close()
    return rows


dir_path_2013 = '2013/*.txt'
dir_path_2014 = '2014/*.txt'

all_rows = read_document_directory(dir_path_2013)
all_rows.extend(read_document_directory(dir_path_2014))

2018-03-04 00:21:42


In [2]:
#Sentence Tokenization
from nltk.tokenize import sent_tokenize

print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))

tokenized_sentences = []
for row in all_rows:
    tokenized_sentences.append(sent_tokenize(row))

file = open('tokenized_sentences.txt','w')
for item in tokenized_sentences:
    for sentence in item:
        file.write("%s\n" %sentence.encode('utf-8'))

2018-03-03 16:28:28


In [3]:
#Word Tokenization
from nltk.tokenize import word_tokenize

print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))

f=open('tokenized_sentences.txt', 'r', encoding='utf-8')

tokenized_words=[]
for row in f:
    row = row[2:-2]
    tokenized_words.append(word_tokenize(row))
f.close()

file=open('tokenized_words.txt', 'w')
for sentence in tokenized_words:
    for word in sentence:
        file.write("%s " %word)
    file.write("\n")
file.close()

2018-03-03 16:28:56


In [20]:
print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))

#Method for extracting and tokenizing dictionary of ground truth
def get_token_dict(file_path):
    data_dict = {}
    f=open(file_path, 'r')
    for row in f:
        row = row.replace(',',' ')
        row = row.replace('"',' ')
        row_vec = word_tokenize(row)
        if row_vec[0] in data_dict.keys():
            if row_vec not in data_dict[row_vec[0]]:
                data_dict[row_vec[0]].append(row_vec)
        else:
            data_dict[row_vec[0]] = list()
            data_dict[row_vec[0]].append(row_vec)
    f.close()
    return data_dict

ceos_path= 'all/ceo.csv'
companies_path = 'all/companies.csv'
percentages_path = 'all/percentage.csv'
not_ceos_path= 'all/notceo.csv'
not_companies_path = 'all/notcompanies.csv'
not_percentages_path = 'all/notpercentage.csv'

ceo_dict = get_token_dict(file_path=ceos_path)
#Remove the following from ceo_dict: White, Lord, Armstrong, Read,Smith, Mike
stop_names = ['White', 'Lord', 'Armstrong', 'Read', 'Smith', 'Mike']
for name in stop_names:
    ceo_dict.pop(name)

company_dict = get_token_dict(file_path=companies_path)
percentage_dict = get_token_dict(file_path=percentages_path)

not_ceo_dict = get_token_dict(file_path=not_ceos_path)
not_company_dict = get_token_dict(file_path=not_companies_path)
not_percentage_dict = get_token_dict(file_path=not_percentages_path)

2018-03-04 00:27:52


In [21]:
print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))


#Find matching sentences from the corpus
def find_matching_sentences(dict_tokens, tokenized_corpus):
    results = []
    for tokenized_sentence in tokenized_corpus:
        for idx, token in enumerate(tokenized_sentence):
            if token in dict_tokens:
                for all_tokens in dict_tokens[token]:
                    i=0
                    flag = True
                    for tok in all_tokens:
                        if(idx + i >= len(tokenized_sentence) or tokenized_sentence[idx + i] != tok):
                            flag=False
                        i+=1
                    if flag:
                        matched_sent = []
                        matched_sent.append(tokenized_sentence)
                        matched_sent.append(all_tokens)
                        results.append(matched_sent)
                        break
    return results

#Positive Samples
df_ceos = pd.DataFrame(find_matching_sentences(ceo_dict, tokenized_words), columns=['sentence','token'])
df_comp = pd.DataFrame(find_matching_sentences(company_dict, tokenized_words), columns=['sentence','token'])
df_percentages = pd.DataFrame(find_matching_sentences(percentage_dict, tokenized_words), columns=['sentence','token'])

#Negative Samples
df_not_ceos = pd.DataFrame(find_matching_sentences(not_ceo_dict, tokenized_words), columns=['sentence','token'])
df_not_comp = pd.DataFrame(find_matching_sentences(not_company_dict, tokenized_words), columns=['sentence','token'])
df_not_percentages = pd.DataFrame(find_matching_sentences(not_percentage_dict, tokenized_words), columns=['sentence','token'])

2018-03-04 00:28:18


In [40]:
#Sample negative set

df_not_ceos = df_not_ceos.sample(frac=(df_ceos.shape[0]/(2*df_not_ceos.shape[0]))).reset_index().drop(['index'], axis=1)
df_not_comp = df_not_comp.sample(frac=(df_comp.shape[0]/(2*df_not_comp.shape[0]))).reset_index().drop(['index'], axis=1)
df_not_percentages = df_not_percentages.sample(frac=(df_percentages.shape[0]/(2*df_not_percentages.shape[0]))).reset_index().drop(['index'], axis=1)

In [41]:
print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))

#pos tagged sentence extraction
df_ceos["pos_tagged"] = df_ceos.apply(lambda x: nltk.pos_tag(x["sentence"]), axis = 1)
df_comp["pos_tagged"] = df_comp.apply(lambda x: nltk.pos_tag(x["sentence"]), axis = 1)
df_percentages["pos_tagged"] = df_percentages.apply(lambda x: nltk.pos_tag(x["sentence"]), axis = 1)

df_not_ceos["pos_tagged"] = df_not_ceos.apply(lambda x: nltk.pos_tag(x["sentence"]), axis = 1)
df_not_comp["pos_tagged"] = df_not_comp.apply(lambda x: nltk.pos_tag(x["sentence"]), axis = 1)
df_not_percentages["pos_tagged"] = df_not_percentages.apply(lambda x: nltk.pos_tag(x["sentence"]), axis = 1)

2018-03-04 00:38:24


In [42]:
print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))

#Remove stopwords
from nltk.corpus import stopwords

set_stopwords = set(stopwords.words('English'))
set_stopwords.remove('when')

def remove_stop(pos_tagged_sentence):
    return [word_tuple for word_tuple in pos_tagged_sentence if word_tuple[0] not in set_stopwords]

df_ceos["stopword_removed"] = df_ceos.apply(lambda x: remove_stop(x["pos_tagged"]), axis = 1)
df_comp["stopword_removed"] = df_comp.apply(lambda x: remove_stop(x["pos_tagged"]), axis = 1)
df_percentages["stopword_removed"] = df_percentages.apply(lambda x: remove_stop(x["pos_tagged"]), axis = 1)

df_not_ceos["stopword_removed"] = df_not_ceos.apply(lambda x: remove_stop(x["pos_tagged"]), axis = 1)
df_not_comp["stopword_removed"] = df_not_comp.apply(lambda x: remove_stop(x["pos_tagged"]), axis = 1)
df_not_percentages["stopword_removed"] = df_not_percentages.apply(lambda x: remove_stop(x["pos_tagged"]), axis = 1)

2018-03-04 01:08:58


In [43]:
#Define function for calculating word shape
def shape(word):
    word_shape = 'other'
    if re.match('[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$', word):
        word_shape = 'number'
    elif re.match('\W+$', word):
        word_shape = 'punct'
    elif re.match('[A-Z][a-z]+$', word):
        word_shape = 'capitalized'
    elif re.match('[A-Z]+$', word):
        word_shape = 'uppercase'
    elif re.match('[a-z]+$', word):
        word_shape = 'lowercase'
    elif re.match('[A-Z][a-z]+[A-Z][a-z]+[A-Za-z]*$', word):
        word_shape = 'camelcase'
    elif re.match('[A-Za-z]+$', word):
        word_shape = 'mixedcase'
    elif re.match('__.+__$', word):
        word_shape = 'wildcard'
    elif re.match('[A-Za-z0-9]+\.$', word):
        word_shape = 'ending-dot'
    elif re.match('[A-Za-z0-9]+\.[A-Za-z0-9\.]+\.$', word):
        word_shape = 'abbreviation'
    elif re.match('[A-Za-z0-9]+\-[A-Za-z0-9\-]+.*$', word):
        word_shape = 'contains-hyphen'
 
    return word_shape

In [None]:
print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))

def create_feature_set(df_sentences, calc_neg=False, max_neg = 30000, class_true = 1, class_false = 0):
    datapoints = []
    neg_datapoints = []
    for index, row in df_sentences.iterrows():
        index_token = 0
        for idx, token_tag in enumerate(row["stopword_removed"]):
            if token_tag[0] == row["token"][0]:
                index_token = idx
                break
        #Found the position of token in the string, now prepare feature vector
        lookup_range = range(index_token, index_token+len(row["token"]))
        if calc_neg and max_neg > 0:
            lookup_range = range(0, len(row["stopword_removed"]))
        
        prev_word_class = class_false
        for i in lookup_range:
            feature_dict = {}
            try:
                feature_dict["word"] = row["stopword_removed"][i][0]
                feature_dict["word_shape"] = shape(row["stopword_removed"][i][0])
                feature_dict["word_len"] = len(row["stopword_removed"][i][0])
                feature_dict["word_pos_tag"] = row["stopword_removed"][i][1]
            
                if i-1 >= 0:
                    feature_dict["prev_word"] = row["stopword_removed"][i-1][0]
                    feature_dict["prev_word_shape"] = shape(row["stopword_removed"][i-1][0])
                    feature_dict["prev_word_len"] = len(row["stopword_removed"][i-1][0])
                    feature_dict["prev_word_pos_tag"] = row["stopword_removed"][i-1][1]
                    #feature_dict["prev_word_class"] = prev_word_class

                if i+1 < len(row["stopword_removed"]):
                    feature_dict["next_word"] = row["stopword_removed"][i+1][0]
                    feature_dict["next_word_shape"] = shape(row["stopword_removed"][i+1][0])
                    feature_dict["next_word_len"] = len(row["stopword_removed"][i+1][0])
                    feature_dict["next_word_pos_tag"] = row["stopword_removed"][i+1][1]
                
            except IndexError:
                #print("Error: ", row["stopword_removed"], row["token"], i)
            if i in range(index_token, index_token+len(row["token"])):
                datapoints.append(feature_dict)
                prev_word_class = class_true
            else:
                max_neg -= 1
                if max_neg > 0:
                    neg_datapoints.append(feature_dict)
                    prev_word_class = class_false
    return datapoints, neg_datapoints

ceo_feature_set, not_ceo_feature_set= create_feature_set(df_ceos, calc_neg=True, max_neg =25000)
comp_feature_set, not_comp_feature_set = create_feature_set(df_comp, calc_neg=True, max_neg =100000)
percentages_feature_set, not_percentages_feature_set = create_feature_set(df_percentages, calc_neg=True, max_neg =140000)

a,b = create_feature_set(df_not_ceos, calc_neg=True, max_neg =1)
not_ceo_feature_set.extend(a)

a,b = create_feature_set(df_not_comp, calc_neg=True, max_neg =1)
not_comp_feature_set.extend(a)
                            
a,b=create_feature_set(df_not_percentages, calc_neg=True, max_neg =1)
not_percentages_feature_set.extend(a)


In [80]:
print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))

def get_one_hot_encoded_df(feature_set):
    categorical_features = ["next_word_pos_tag", "next_word_shape","next_word_len" , "prev_word_pos_tag", 
                            "prev_word_len", "prev_word_shape", "word_shape", "word_pos_tag", "word_len"]
    df_features = pd.DataFrame(feature_set)
    df_features = pd.get_dummies(df_features, columns=categorical_features)
    return df_features

df_ceo_hot = get_one_hot_encoded_df(ceo_feature_set)
df_comp_hot = get_one_hot_encoded_df(comp_feature_set)
df_percentages_hot = get_one_hot_encoded_df(percentages_feature_set)

df_not_ceo_hot = get_one_hot_encoded_df(not_ceo_feature_set)
df_not_comp_hot = get_one_hot_encoded_df(not_comp_feature_set)
df_not_percentages_hot = get_one_hot_encoded_df(not_percentages_feature_set)


2018-03-04 01:41:51


In [81]:
print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))

def create_full_dataset(df_positive, df_negative, positive_class_label=1):
    df1 = df_positive.copy()
    df1['Y'] = 1
    
    df2 = df_negative.copy()
    df2['Y'] = 0
    
    return pd.concat([df1, df2])

ceo_final_dataset = create_full_dataset(df_ceo_hot, df_not_ceo_hot)
comp_final_dataset = create_full_dataset(df_comp_hot, df_not_comp_hot)
percentages_final_dataset = create_full_dataset(df_percentages_hot, df_not_percentages_hot)

ceo_final_dataset.to_csv("ceo_final_dataset.csv", index=False, header=True)
comp_final_dataset.to_csv("comp_final_dataset.csv", index=False, header=True)
percentages_final_dataset.to_csv("percentages_final_dataset.csv", index=False, header=True)

2018-03-04 01:42:12


In [82]:
print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))

#Drop irrelevant columns
irr = ["word", "next_word", "prev_word"]
ceo_final_dataset = ceo_final_dataset.drop(irr, axis=1)
comp_final_dataset = comp_final_dataset.drop(irr, axis=1)
percentages_final_dataset = percentages_final_dataset.drop(irr, axis=1)

2018-03-04 01:45:40


In [86]:
print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))

from sklearn import linear_model,datasets
import pandas as pd  
from sklearn import cross_validation, metrics
from sklearn.linear_model import LogisticRegression
from sklearn import svm

def dataset_modeller(df, train_col_order, frac):
    df_copy = df.copy()
    train = df_copy.sample(frac=frac)
    train_y = train['Y']
    train_x = train.drop(['Y'], axis = 1).fillna(0)
    train_x = train_x[train_col_order]

    test=df_copy.drop(train.index)
    test_y = test['Y']
    test_x = test.drop(['Y'], axis = 1).fillna(0)
    test_x = test_x[train_col_order]
    
    return train_x, train_y, test_x, test_y

def train_model(train_x, train_y):
    classification = svm.SVC(kernel = 'rbf')
    classification.fit(train_x,train_y)
    return classification

def test_model(classification, test_x, test_y):
    result = classification.predict(test_x)
    print(metrics.accuracy_score(test_y, result))
    print(metrics.classification_report(result, test_y))

def perform_all(df, train_col_order, frac=0.8):
    train_x, train_y, test_x, test_y = dataset_modeller(df, train_col_order, frac)
    classification =train_model(train_x, train_y)
    test_model(classification,test_x, test_y)
    return classification


train_ceo_col_order = list(ceo_final_dataset)
train_ceo_col_order.remove('Y')
classifier_ceo = perform_all(ceo_final_dataset, train_ceo_col_order)

train_comp_col_order = list(comp_final_dataset)
train_comp_col_order.remove('Y')
classifier_comp = perform_all(comp_final_dataset, train_comp_col_order, frac = 0.5)

train_percentages_col_order = list(percentages_final_dataset)
train_percentages_col_order.remove('Y')
classifier_percentages = perform_all(percentages_final_dataset, train_percentages_col_order, frac = 0.3)

2018-03-04 09:19:52
0.999863389414
             precision    recall  f1-score   support

          0       1.00      1.00      1.00     43951
          1       1.00      1.00      1.00     51210

avg / total       1.00      1.00      1.00     95161

0.998913958125
             precision    recall  f1-score   support

          0       1.00      1.00      1.00    118673
          1       1.00      1.00      1.00    122570

avg / total       1.00      1.00      1.00    241243



In [84]:
classifier_ceo

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [87]:
print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))

from sklearn.externals import joblib
joblib.dump(classifier_ceo, 'classifier_ceo.pkl') 
joblib.dump(classifier_comp, 'classifier_comp.pkl') 
joblib.dump(classifier_percentages, 'classifier_percentages.pkl') 

2018-03-04 14:19:21


['classifier_percentages.pkl']

In [None]:
print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))

#Extract CEOs
#Process n sentences at a time
def create_inf_dataset(sent_tokens):
    inference_words = []
    for sentence in sent_tokens:
        sentence = nltk.pos_tag(sentence)
        sentence = remove_stop(sentence)
        for i, token in enumerate(sentence):
            feature_dict = {}
            feature_dict["word"] = sentence[i][0]
            feature_dict["word_shape"] = shape(sentence[i][0])
            feature_dict["word_len"] = len(sentence[i][0])
            feature_dict["word_pos_tag"] = sentence[i][1]

            if i-1 >= 0:
                feature_dict["prev_word"] = sentence[i-1][0]
                feature_dict["prev_word_shape"] = shape(sentence[i-1][0])
                feature_dict["prev_word_len"] = len(sentence[i-1][0])
                feature_dict["prev_word_pos_tag"] = sentence[i-1][1]
                feature_dict["prev_word_class"] = 0 #Removing this for now

            if i+1 < len(sentence):
                feature_dict["next_word"] = sentence[i+1][0]
                feature_dict["next_word_shape"] = shape(sentence[i+1][0])
                feature_dict["next_word_len"] = len(sentence[i+1][0])
                feature_dict["next_word_pos_tag"] = sentence[i+1][1]
            inference_words.append(feature_dict)
    return inference_words

def reshape_inf(inf_feature_df, features_list):
    cols_inf = list(inf_feature_df)
    set_feature_list = set(features_list)
    cols_in_features = []
    for col in cols_inf:
        if col in set_feature_list:
            cols_in_features.append(col)
    inf_feature_df = inf_feature_df[cols_in_features]
    #Adding cols which might not be in 
    set_feature_list = set(cols_in_features)
    for col in features_list:
        if col not in set_feature_list:
            inf_feature_df[col] = 0
    return inf_feature_df[features_list].fillna(0)

def do_inference(classifier, feature_x):
    return classifier.predict(feature_x)


def get_inferences_on_n(tokenized_words, start_index, n):
    inf_feature_set = create_inf_dataset(tokenized_words[start_index:start_index+n])
    inf_feature_df = get_one_hot_encoded_df(inf_feature_set)

    get_col = ["word"]
    word_vec = inf_feature_df[get_col]

    ceo_feature_df = reshape_inf(inf_feature_df.copy(), train_ceo_col_order)
    result_ceo = do_inference(classifier_ceo, ceo_feature_df)
    indices_ceo = [i for i, x in enumerate(result_ceo) if x == 1]

    comp_feature_df = reshape_inf(inf_feature_df.copy(), train_comp_col_order)
    result_comp = do_inference(classifier_comp, comp_feature_df)
    indices_comp = [i for i, x in enumerate(result_comp) if x == 1]

    percentages_feature_df = reshape_inf(inf_feature_df.copy(), train_percentages_col_order)
    result_percentages = do_inference(classifier_percentages, percentages_feature_df)
    indices_percentages = [i for i, x in enumerate(result_percentages) if x == 1]

    return word_vec.iloc[indices_ceo], word_vec.iloc[indices_comp], word_vec.iloc[indices_percentages]

2018-03-04 14:19:21


In [None]:
print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))

n = 500 #Process 500 sentences at a time
remaining_sentences = len(tokenized_words)
start_index = 0
while remaining_sentences > 0:
    if remaining_sentences < n:
        n = remaining_sentences

    words_ceo, words_comp, words_percentages = get_inferences_on_n(tokenized_words, start_index, n)
    with open('ceo_list.csv', 'a') as f:
        words_ceo.to_csv(f, header=False, index=False)
    with open('companies_list.csv', 'a') as f:
        words_comp.to_csv(f, header=False, index=False)
    with open('percentages_list.csv', 'a') as f:
        words_percentages.to_csv(f, header=False, index=False)
    
    remaining_sentences -= n
    start_index += n
    

2018-03-04 14:19:21


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
