In [1]:
import pandas as pd
import numpy as np
import os
from bs4 import BeautifulSoup

## Figure out promising indicators

In [2]:
inds = pd.read_csv('/Users/caged/Dropbox/RSN Mining/indicator_data_matched_with_cik_data.csv') 
inds = inds.drop('MA4_9_2', axis=1)
inds.columns

exclude = ['Unnamed: 0', 'UID', 'CompanyName', 'MTD14_Pilot', 'Industry', 'Sector', 'Peer Group', '2015_filing', '2014_filing', 
 'primarysymbol', 'siccode', 'sicdescription', 'feed_url', 'SD_url_2014', 'SD_url_2015', 'SD_url_2016', 
 'feed_retrieved_date', 'dir_name','dir_path', 'cik', 'companyname', 'entityid', 'extracted_cik_2015', 'name_clean', 'name_upper', 'primaryexchange']

In [3]:
indicator_names = list(inds.columns.difference(exclude))
indicator_names[0:5]

['MA1_1A_1', 'MA1_1A_2', 'MA1_1A_3', 'MA1_1A_4', 'MA1_1B_1']

In [4]:
data_only = inds[indicator_names]
data_only = data_only.astype(float)
data_only.head()

Unnamed: 0,MA1_1A_1,MA1_1A_2,MA1_1A_3,MA1_1A_4,MA1_1B_1,MA1_1B_2,MA1_2_1,MA1_2_2,MA1_2_3,MA2_3_1,...,MA5_18_1,MA5_18_2,MA5_19_1,MA5_19_2,MA5_19_3,MA5_19_4,MA5_19_5,MA5_19_6,MA5_20_1,MA5_20_2
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,,...,,1.0,,1.0,,,,,0.0,
1,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,...,,,1.0,1.0,,,,,0.0,
2,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,,...,0.0,1.0,,1.0,,,,,0.0,
3,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,1.0,,1.0,,,,,0.0,


In [5]:
t = data_only.T
t['sum'] = t.apply(np.sum, axis=1)
t['nans'] = t.apply(lambda x: sum(pd.isnull(x)), axis=1)
t['non_nans'] = 125 - t['nans']
t['fraction'] = t['sum'] / t['non_nans']
t['good'] = t.fraction.apply(lambda x: (x > 0.4) & (x < 0.6))
t.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,121,122,123,124,125,sum,nans,non_nans,fraction,good
MA1_1A_1,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,1.0,1.0,0.0,71.5,0,125,0.572,True
MA1_1A_2,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,,0.0,0.0,1.0,0.0,22.0,5,120,0.183333,False
MA1_1A_3,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,105.0,0,125,0.84,False
MA1_1A_4,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.0,104.0,0,125,0.832,False
MA1_1B_1,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,...,,1.0,0.0,,1.0,74.5,8,117,0.636752,False


In [6]:
promising_indicators = t[t.good == True].index
promising_indicators

Index(['MA1_1A_1', 'MA2_3_5', 'MA2_5_2', 'MA2_6_2', 'MA2_6_3', 'MA3_7_1',
       'MA3_7_5', 'MA3_7_6', 'MA3_8_1', 'MA4_14_2', 'MA4_15_1', 'MA4_15_2',
       'MA4_15_4', 'MA5_16_2', 'MA5_18_2'],
      dtype='object')

### Investigate

In [31]:
dd = pd.read_csv('disclosure_docs_list.csv', dtype='str')
dd = dd.drop('Unnamed: 0', axis=1)
dd = dd.rename(columns={'req_cik': 'req_company_name', 'req_company_name': 'req_cik'})
dd.ret_filing_date = pd.to_datetime(dd.ret_filing_date)
dd_2015 = dd[dd.ret_filing_date.dt.year == 2015]
dd_2015.head()

Unnamed: 0,req_company_name,req_cik,req_url,ret_cik,ret_company_name,ret_description,ret_filing_date,ret_sec_accession_number,ret_title,ret_url
2,AARON'S INC,706688,http://www.sec.gov/Archives/edgar/data/706688/...,706688,AARON'S INC,sdconflictminerals2014.htm,2015-05-22 12:06:05,0000706688-15-000148,SD_CONFLICT_MINERALS_2014,https://www.sec.gov/Archives/edgar/data/706688...
3,AARON'S INC,706688,http://www.sec.gov/Archives/edgar/data/706688/...,706688,AARON'S INC,0000706688-15-000148.txt,2015-05-22 12:06:05,0000706688-15-000148,Complete submission text file,https://www.sec.gov/Archives/edgar/data/706688...
7,AAON INC,824142,http://www.sec.gov/Archives/edgar/data/824142/...,824142,AAON INC,a2014formsd.htm,2015-05-29 11:36:18,0000824142-15-000071,SD,https://www.sec.gov/Archives/edgar/data/824142...
8,AAON INC,824142,http://www.sec.gov/Archives/edgar/data/824142/...,824142,AAON INC,exhibit101.htm,2015-05-29 11:36:18,0000824142-15-000071,EXHIBIT 1.01,https://www.sec.gov/Archives/edgar/data/824142...
9,AAON INC,824142,http://www.sec.gov/Archives/edgar/data/824142/...,824142,AAON INC,0000824142-15-000071.txt,2015-05-29 11:36:18,0000824142-15-000071,Complete submission text file,https://www.sec.gov/Archives/edgar/data/824142...


## Try and predict promising indicators

In [83]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.cross_validation import ShuffleSplit

In [58]:
promising_indicators

Index(['MA1_1A_1', 'MA2_3_5', 'MA2_5_2', 'MA2_6_2', 'MA2_6_3', 'MA3_7_1',
       'MA3_7_5', 'MA3_7_6', 'MA3_8_1', 'MA4_14_2', 'MA4_15_1', 'MA4_15_2',
       'MA4_15_4', 'MA5_16_2', 'MA5_18_2'],
      dtype='object')

In [189]:

INDICATOR = promising_indicators[14]
print(INDICATOR)

inds = pd.read_csv('/Users/caged/Dropbox/RSN Mining/indicator_data_matched_with_cik_data.csv', dtype='str', usecols=[INDICATOR, 'cik', 'primarysymbol'])
inds = inds.rename(columns={INDICATOR: 'label'})
inds['dir_name'] = inds.apply(lambda row: ' '.join([row.primarysymbol, '--', str(row.cik)]), axis=1)
inds['dir_path'] = inds.dir_name.apply(lambda x: os.path.join('supporting_docs_2015', x))
inds = inds.dropna()
inds.head()

MA5_18_2


Unnamed: 0,label,cik,primarysymbol,dir_name,dir_path
0,1.0,66740,MMM,MMM -- 66740,supporting_docs_2015/MMM -- 66740
1,,937966,ASML,ASML -- 937966,supporting_docs_2015/ASML -- 937966
2,1.0,1800,ABT,ABT -- 1800,supporting_docs_2015/ABT -- 1800
3,,1144215,AYI,AYI -- 1144215,supporting_docs_2015/AYI -- 1144215
4,1.0,886125,ALU,ALU -- 886125,supporting_docs_2015/ALU -- 886125


In [190]:
len(inds)

126

In [183]:
text_file_paths = []
list_of_labels = []
bad_counter = 0
for i, row in inds.iterrows():
    results = dd_2015[(dd_2015.req_cik == row.cik) & (dd_2015.ret_title == 'Complete submission text file')].reset_index()
    if len(results) == 1:
        dir_path = row.dir_path
        text_file_name = results['ret_description'][0]
        text_file_paths.append(os.path.join(dir_path, text_file_name))
        list_of_labels.append(int(float(row.label)))
    else:
        bad_counter +=1
bad_counter

1

In [184]:
print(len(text_file_paths))
print(len(list_of_labels))

76
76


In [185]:
list_of_text_docs = []
for file_path in text_file_paths:
    with open(file_path, 'r') as f:
        text = f.read()
    soup = BeautifulSoup(text)
    documents = soup.findAll('document')
    text_docs = ''
    for doc in documents:
        text_docs = ''.join([text_docs, doc.get_text()])
    list_of_text_docs.append(text_docs)

In [186]:
cv = ShuffleSplit(n=len(list_of_labels), n_iter=1, test_size=0.3, random_state=0)

for train, test in cv:
    train_data = np.array(list_of_text_docs)[train]
    train_labels = np.array(list_of_labels)[train]
    test_data = np.array(list_of_text_docs)[test]
    test_labels = np.array(list_of_labels)[test]

In [187]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_data)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = LinearSVC().fit(X_train_tfidf, train_labels)
X_new_counts = count_vect.transform(test_data)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

In [188]:
print(INDICATOR)
print(len(list_of_labels))
print(metrics.classification_report(test_labels, predicted))

MA5_18_2
76
             precision    recall  f1-score   support

          0       0.62      0.80      0.70        10
          1       0.80      0.62      0.70        13

avg / total       0.72      0.70      0.70        23



```
['MA1_1A_1', 'MA2_3_5', 'MA2_5_2', 'MA2_6_2', 'MA2_6_3', 'MA3_7_1', 'MA3_7_5', 'MA3_7_6', 'MA3_8_1', 'MA4_14_2', 'MA4_15_1', 'MA4_15_2', 'MA4_15_4', 'MA5_16_2', 'MA5_18_2']


'MA1_1A_1'
             precision    recall  f1-score   support

          0       0.55      0.40      0.46        15
          1       0.67      0.78      0.72        23

avg / total       0.62      0.63      0.62        38

'MA2_3_5'
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        14
          1       0.46      1.00      0.63        12

avg / total       0.21      0.46      0.29        26

MA2_5_2
             precision    recall  f1-score   support

          0       0.33      0.67      0.44         6
          1       0.00      0.00      0.00         8

avg / total       0.14      0.29      0.19        14


MA2_6_2
92
             precision    recall  f1-score   support

          0       0.83      0.77      0.80        13
          1       0.81      0.87      0.84        15

avg / total       0.82      0.82      0.82        28


MA2_6_3
81
             precision    recall  f1-score   support

          0       0.68      1.00      0.81        13
          1       1.00      0.50      0.67        12

avg / total       0.84      0.76      0.74        25


MA3_7_1
102
             precision    recall  f1-score   support

          0       0.83      0.45      0.59        11
          1       0.76      0.95      0.84        20

avg / total       0.79      0.77      0.75        31


MA3_7_5
69
             precision    recall  f1-score   support

          0       0.56      0.38      0.45        13
          1       0.33      0.50      0.40         8

avg / total       0.47      0.43      0.43        21


MA3_7_6
55
             precision    recall  f1-score   support

          0       0.71      0.71      0.71         7
          1       0.80      0.80      0.80        10

avg / total       0.76      0.76      0.76        17


MA3_8_1
46
             precision    recall  f1-score   support

          0       1.00      0.38      0.55         8
          1       0.55      1.00      0.71         6

avg / total       0.81      0.64      0.61        14


MA4_14_2
107
             precision    recall  f1-score   support

          0       0.61      1.00      0.75        20
          1       0.00      0.00      0.00        13

avg / total       0.37      0.61      0.46        33


MA4_15_1
115
             precision    recall  f1-score   support

          0       0.82      1.00      0.90        14
          1       1.00      0.86      0.92        21

avg / total       0.93      0.91      0.92        35


MA4_15_2
85
             precision    recall  f1-score   support

          0       0.85      0.92      0.88        12
          1       0.92      0.86      0.89        14

avg / total       0.89      0.88      0.88        26


MA4_15_4
91
             precision    recall  f1-score   support

          0       0.38      0.42      0.40        12
          1       0.53      0.50      0.52        16

avg / total       0.47      0.46      0.47        28


MA5_16_2
68
             precision    recall  f1-score   support

          0       0.75      0.75      0.75        12
          1       0.67      0.67      0.67         9

avg / total       0.71      0.71      0.71        21


MA5_18_2
76
             precision    recall  f1-score   support

          0       0.62      0.80      0.70        10
          1       0.80      0.62      0.70        13

avg / total       0.72      0.70      0.70        23

```


## Sandpit

In [None]:
from nltk.tokenize import RegexpTokenizer, SpaceTokenizer
from nltk.corpus import stopwords

# Pull out any urls
word_tokenizer = SpaceTokenizer()
simple_tokens = word_tokenizer.tokenize(documents[0].get_text())
for token in simple_tokens:
    if token.startswith('http'):
        print(token)

In [None]:
# Don't need to do this (I think)
tokenizer = RegexpTokenizer(r'\w+')
def tkize(doc_text):
    tokens = tokenizer.tokenize(text_docs)
    tokens = [tk.lower() for tk in tokens]
    return tokens

words_list = []
for doc in documents:
    doc_text = doc.get_text()
    tks = tkize(doc_text)
    words = [w for w in tks if not w in stopwords.words("english")]
    words_list.append(words)
len(words_list)

In [66]:
#old shuffle - not great!
x = 90
train_data = list_of_text_docs[0:x]
train_labels = list_of_labels[0:x]
test_data = list_of_text_docs[x:]
test_labels = list_of_labels[x:]

print(sum(train_labels))
print(sum(test_labels))

50
0
