# Read & Vectorize Hillary Emails
---

_**Import Packages**_

In [11]:
import os
import re

import avro.schema
import numpy as np
from avro.datafile import DataFileReader
from avro.io import DatumReader
from sklearn.feature_extraction.text import TfidfVectorizer

_**File Location**_

In [None]:
# Location of the AVRO file that holds the emails. The default is a
# machine-specific absolute path, so it can be overridden without editing the
# notebook by setting the EMAILS_AVRO_PATH environment variable.
file_location = os.environ.get("EMAILS_AVRO_PATH", "/Users/tdavid/Downloads/data.avro")

_**Read in Text from AVRO**_

In [12]:
# Avro container files are binary, so the file is opened in "rb" mode; the
# `with` block closes the handle even if decoding fails part-way through.
with open(file_location, "rb") as avro_file:
    reader = DataFileReader(avro_file, DatumReader())
    # Iterating the reader yields one decoded record at a time; collect them all.
    data = list(reader)

_**Clean Text**_

In [61]:
# Tokens kept from each email: a run of word characters/apostrophes, or a single
# punctuation mark out of . , ! ? : ; -   Every other character is discarded.
TOKEN_PATTERN = re.compile(r"[\w']+|[.,!?:;-]")

# Applied in this order after the tokens are re-joined with single spaces, to glue
# punctuation back onto the neighbouring word(s).
PUNCTUATION_FIXES = [
    (' .', '.'),
    (' ,', ','),
    (' : ', ':'),
    (' - ', '-'),
    (' ;', ';'),
    (' ?', '?'),
    (' !', '!'),
]


def clean_text(raw_text):
    """
    Normalise the text of one email into a single line.

    Newlines become spaces, the text is split into word and punctuation
    tokens (anything else is dropped), the tokens are joined with single
    spaces, and the space in front of / around punctuation is removed again.

    raw_text: the unicode 'contents' value of one record.
    Returns the cleaned, UTF-8 encoded string.
    """
    # NOTE(review): calling str methods with text arguments on the result of
    # .encode('utf-8') relies on Python 2 byte strings - confirm before porting.
    flattened = raw_text.encode('utf-8').replace('\n', ' ')
    cleaned = ' '.join(TOKEN_PATTERN.findall(flattened))
    for spaced, tight in PUNCTUATION_FIXES:
        cleaned = cleaned.replace(spaced, tight)
    return cleaned


# One cleaned string per record, in the same order as `data`.
new_list = [clean_text(record['contents']) for record in data]

In [76]:
# Display the first raw record to check which fields each record carries.
data[0]

{u'author': u'Sullivan, Jacob J <Sullivan11@state.gov>',
 u'contents': u'UNCLASSIFIED\nU.S. Department of State\nCase No. F-2015-04841\nDoc No. C05739545\nDate: 05/13/2015\nSTATE DEPT. - PRODUCED TO HOUSE SELECT BENGHAZI COMM.\nSUBJECT TO AGREEMENT ON SENSITIVE INFORMATION & REDACTIONS. NO FOIA WAIVER.\nRELEASE IN FULL\nFrom: Sullivan, Jacob J <Sullivan11@state.gov>\nSent: Wednesday, September 12, 2012 10:16 AM\nTo:\nSubject: FW: Wow\nFrom: Brose, Christian (Armed Services) (mailto:Christian_Brose@armed-servic,essenate.govi\nSent: Wednesday, September 12, 2012 10:09 AM\nTo: Sullivan, Jacob J\nSubject: Wow\nWhat a wonderful, strong and moving statement by your boss. please tell her how much Sen. McCain appreciated it. Me\ntoo\nUNCLASSIFIED\nU.S. Department of State\nCase No. F-2015-04841\nDoc No. C05739545\nDate: 05/13/2015\nSTATE DEPT. - PRODUCED TO HOUSE SELECT BENGHAZI COMM.\nSUBJECT TO AGREEMENT ON SENSITIVE INFORMATION & REDACTIONS. NO FOIA WAIVER. STATE-5CB0045247\n\x0c',
 u'hash'

_**Vectorize**_

In [63]:
# Build a TF-IDF vectorizer using the library defaults.
vect = TfidfVectorizer()

# Fit on the cleaned emails and transform them in one step; `td` holds the result.
# NOTE(review): the similarity step later in the notebook is a plain matrix product,
# which equals cosine similarity only if every row of `td` is unit-length - presumably
# guaranteed by the vectorizer's default normalisation; confirm.
td = vect.fit_transform(new_list)

_**Define Functions**_

In [64]:
def cosine_similarity(new_docs, old_docs):
    """
    Returns a similarity matrix where the first row is an array of
    similarities of the first new_doc compared with each of the old
    docs.

    Both arguments are document-by-feature matrices (one row per
    document). The result is the matrix product of new_docs with the
    transpose of old_docs, so entry [i, j] is the dot product of new
    doc i and old doc j. That is the cosine similarity only when every
    row is already L2-normalised; no normalisation is done here.

    .dot() is used rather than `*` because `*` is a matrix product for
    scipy sparse matrices and np.matrix but an element-wise product for
    plain ndarrays, whereas .dot() is a matrix product for all of them.
    """
    return new_docs.dot(old_docs.T)

def find_closest_matches(similarity_matrix, n_matches_to_return=1):
    """
    Expects a dense array of the form [[1., .5, .2],
                                       [.3, 1., .1],
                                       [.2, .4, 1.]]
    where rows correspond to similarities.

    Returns an integer ndarray with one row per input row, holding the
    column indices of that row's largest values, best match first. Each
    row has n_matches_to_return entries (fewer if the input has fewer
    columns; none if n_matches_to_return is 0).
    """
    # Coerce np.matrix (e.g. the result of .todense()) to a plain ndarray so the
    # result is always an ndarray whose rows are 1-D.
    scores = np.asarray(similarity_matrix)
    ascending = np.argsort(scores, axis=1)
    # Reverse first, then truncate: [:n] behaves correctly for n == 0, whereas
    # [-n:] would silently select the whole row because -0 == 0.
    top_indices = ascending[:, ::-1][:, :n_matches_to_return]
    return top_indices

_**Similarities & Matches**_

In [65]:
# .toarray() (rather than .todense()) yields a plain ndarray instead of an np.matrix,
# so rows and elements taken from `similarities` behave like ordinary arrays/scalars.
similarities = cosine_similarity(td, td).toarray()
# Two matches per email: the collection is compared against itself, so the best match
# is normally the email itself and the second entry is the closest *other* email.
matches = find_closest_matches(similarities, 2)

_**Report Each New Best-Matching Pair**_

In [67]:
# NOTE(review): this displays only the enumerate object's repr, not the (index, row)
# pairs; wrap it in list(...) to inspect the actual contents.
enumerate(matches[:])

<enumerate at 0x109e16190>

In [66]:
# Walk through the emails in order and report every email whose closest other email
# scores higher than anything seen so far (a running "best pair so far" log).
top_score = 0

# np.asarray makes each row a 1-D index array even if `matches` is an np.matrix.
for new_text, old_texts in enumerate(np.asarray(matches)):
    # The first entry of each row is skipped.
    # NOTE(review): this presumes the top match is always the email itself - exact
    # duplicates could tie and change that order; confirm.
    # [row, col] indexing returns a single number for both ndarray and np.matrix.
    scored = [(float(similarities[new_text, ind]), ind) for ind in old_texts[1:]]
    max_score = max(pair[0] for pair in scored)
    if top_score < max_score:
        top_score = max_score
        print(max_score)
        similar_texts = [(score, new_list[ind]) for score, ind in scored]
        print(new_list[new_text])
        print(similar_texts)
        # Single pre-formatted argument so the line prints the same under Python 2 and 3.
        print("%s %s" % (new_text, old_texts))
        print("")

0.53694635124
UNCLASSIFIED U. S. Department of State Case No. F-2015-04841 Doc No. C05739545 Date:05 13 2015 STATE DEPT.-PRODUCED TO HOUSE SELECT BENGHAZI COMM. SUBJECT TO AGREEMENT ON SENSITIVE INFORMATION REDACTIONS. NO FOIA WAIVER. RELEASE IN FULL From:Sullivan, Jacob J Sullivan11 state. gov Sent:Wednesday, September 12, 2012 10:16 AM To:Subject:FW:Wow From:Brose, Christian Armed Services mailto:Christian_Brose armed-servic, essenate. govi Sent:Wednesday, September 12, 2012 10:09 AM To:Sullivan, Jacob J Subject:Wow What a wonderful, strong and moving statement by your boss. please tell her how much Sen. McCain appreciated it. Me too UNCLASSIFIED U. S. Department of State Case No. F-2015-04841 Doc No. C05739545 Date:05 13 2015 STATE DEPT.-PRODUCED TO HOUSE SELECT BENGHAZI COMM. SUBJECT TO AGREEMENT ON SENSITIVE INFORMATION REDACTIONS. NO FOIA WAIVER. STATE-5CB0045247
[(0.5369463512398643, "UNCLASSIFIED U. S. Department of State Case No. F-2015-04841 Doc No. C05739685 Date:05 13 2015 