## Significant words

### Part 1

Write a function that generates the most “significant” words in a file by comparing it to a larger corpus.
The function should take three inputs:

* A path to a corpus of texts, e.g. `~/Desktop/texts/*.txt`. (This can be the plaintext Shakespeare plays we’ve been working with, or any other corpus of texts you might want to put together.)
* A path for an individual file. e.g. ~/Desktop/texts/hamlet.txt
* A number N representing the top N significant words to return

You will need to compute tf-idf scores for all words in the individual file against the rest of the corpus.
The function should return a list of tuples in the form [(wd1, score1), (wd2, score2), ..., (wdN, scoreN)].

### Part 2
Write a function that takes the following inputs:

* A path to a corpus of texts e.g. ~/Desktop/texts/*.txt
* A path for an individual file. e.g. ~/Desktop/texts/hamlet.txt
* A path for a second individual file. e.g. ~/Desktop/texts/macbeth.txt
* A number N representing the top N significant words to return

The function should find out which words are common among the topN words for each document and plot their tf-idf scores against each other as a scatterplot. If no words are common in the topN, the function should handle that case gracefully and print an error message suggesting the user increase the value of N.


In [None]:
%matplotlib inline

import glob, os
import matplotlib.pyplot as plt1
from collections import defaultdict

In [None]:
def preprocess(s, case = "L"):
    '''
    Normalise the letter case of a string.
    param: str - s, the text to normalise
    param: str - case, "L" for lower case (default), "U" for upper case;
                 any other value leaves the text untouched
    return the case-normalised string
    '''
    return s.lower() if case == "L" else s.upper() if case == "U" else s

#new tokenize function
def tokenize(s, tokenize_char=None):
    '''
    Split a string into tokens and trim punctuation from both ends of each.
    param: str - s, the text to split
    param: str - tokenize_char, the separator handed to str.split
                 (None splits on runs of whitespace)
    return list of non-empty tokens, in their original order
    '''
    trim_chars = "-,.?!;: \n\t"
    tokens = []
    for chunk in s.split(tokenize_char):
        word = chunk.strip(trim_chars)
        # chunks made only of punctuation/whitespace collapse to '' and are dropped
        if word != '':
            tokens.append(word)
    return tokens

def rawf_df(s, df=None):
    '''
    Count raw token frequencies for one text and fold that text into a
    running document-frequency table.
    param: iterable of str - s, the tokens of a single text
    param: dict - df, document frequencies accumulated so far (token -> number
                  of texts containing it); None starts a fresh table. The
                  supplied mapping is copied, never modified in place.
    return tuple (rawf, df): rawf maps token -> occurrences in this text,
           df is the updated document-frequency table; both are defaultdict(int)
    '''
    rawf = defaultdict(int)
    for t in s:
        rawf[t] += 1

    # Copying also casts an ordinary dict to a defaultdict (in case someone
    # supplies the wrong kind).
    df = defaultdict(int, df) if df is not None else defaultdict(int)

    # The keys of rawf are exactly the distinct tokens of this text, so each
    # one contributes a single document to its count.
    for t in rawf:
        df[t] += 1
    return rawf, df

def score_00(tf):
    '''
    Rank tokens by their frequency alone.
    param: dict - tf, token -> frequency
    return list of (token, frequency) tuples, highest frequency first
    '''
    ranked = list(tf.items())
    ranked.sort(key = lambda pair: pair[1], reverse = True)
    return ranked

def score_01(tf, df, n):
    '''
    Rank tokens by frequency times the un-logged ratio n / document frequency.
    param: dict - tf, token -> frequency in the text being scored
    param: dict - df, token -> document frequency
    param: int - n, the numerator of the ratio (callers pass a document count)
    return list of (token, score) tuples, highest score first
    '''
    weighted = []
    for term in tf:
        # explicit float conversion keeps the division from truncating
        ratio = float(n) / float(df[term])
        weighted.append((term, float(tf[term]) * ratio))
    weighted.sort(key = lambda pair: pair[1], reverse = True)
    return weighted

import math
def tf_idf(tf, df, n):
    '''
    Score every token of one text as tf * log10(n / df).
    param: dict - tf, token -> frequency in the text being scored
    param: dict - df, token -> number of documents containing the token;
                  must hold a non-zero entry for every token in tf
    param: int - n, total number of documents
    return list of (token, score) tuples, highest score first
    '''
    token_scores = []
    for t in tf:
        # float() forces true division: with two ints Python 2 would floor the
        # ratio (e.g. 3/2 -> 1) and the idf would collapse towards zero.
        score = tf[t] * math.log10(float(n) / df[t])
        token_scores.append((t, score))
    return sorted(token_scores, key = lambda x: x[1], reverse = True)

def rf_text_over_rf_corpus(raw_freqs):
    '''
    Score each token of each text by its per-text value divided by its
    corpus-wide value, both obtained from rawf_to_tf.
    NOTE(review): rawf_to_tf is not defined in this section; presumably it
    turns raw counts into relative frequencies -- confirm where it lives.
    param: list of dict - raw_freqs, one token -> raw count mapping per text
    return list (one entry per text, same order as raw_freqs) of
           [token, ratio] lists
    '''
    per_text_tf = [rawf_to_tf(text_counts) for text_counts in raw_freqs]

    # Pool the raw counts of every text into one corpus-wide table.
    pooled_counts = defaultdict(int)
    for text_counts in raw_freqs:
        for token, count in text_counts.iteritems():
            pooled_counts[token] += count
    corpus_tf = rawf_to_tf(pooled_counts)

    return [
        [[token, freq / corpus_tf[token]] for token, freq in text_tf.iteritems()]
        for text_tf in per_text_tf
    ]



In [None]:
# ---------------------------------------------------------------------------
# PART 1
# ---------------------------------------------------------------------------

def most_sig_words(path_to_corpus, path_to_indv, top_n):
    '''
    most_sig_words takes the path to the whole corpus, the path to an
    individual file, and top n, and computes the tf-idf for all words in the
    individual file against the rest of the corpus. The top-n tf-idf scores
    and the top-n text/corpus relative-frequency ratios are also printed.
    param: str - path_to_corpus, glob pattern matching every corpus file
    param: str - path_to_indv, one of the files matched by path_to_corpus
                 (matched by file name, not by directory)
    param: int - top_n
    return list of tuples (eg: [(wd1, score1), (wd2, score2), .... (wdN, scoreN)])
           holding the top_n tf-idf scores, highest first
    raises ValueError if path_to_indv is not among the corpus files
    '''
    files = glob.glob(path_to_corpus)
    playnames = [os.path.split(play)[1][:-4].replace('_', ' ').title() for play in files]

    keytext = os.path.split(path_to_indv)[1][:-4].replace('_', ' ').title()
    if keytext not in playnames:
        raise ValueError("%r is not among the files matched by %r"
                         % (path_to_indv, path_to_corpus))
    key_index = playnames.index(keytext)

    raw_freqs = []
    df = {}

    for f in files:
        # 'with' closes each corpus file as soon as it has been read
        with open(f, 'r') as handle:
            text = tokenize(preprocess(handle.read()))
        rawf, df = rawf_df(text, df)
        raw_freqs.append(rawf)

    top_scores = tf_idf(raw_freqs[key_index], df, len(raw_freqs))[:top_n]
    # print("") emits the same blank line as a bare print statement
    print("")
    print(top_scores)

    # Secondary view: per-text relative frequency over corpus relative frequency.
    all_texts_scores = rf_text_over_rf_corpus(raw_freqs)
    ratio_scores = sorted(all_texts_scores[key_index], key = lambda x: x[1], reverse = True)
    print("")
    print(ratio_scores[:top_n])

    return top_scores
    
# most_sig_words("../corpora/shakespeare_plaintext/*.txt", "../corpora/shakespeare_plaintext/macbeth.txt", 25 )


In [None]:
# ---------------------------------------------------------------------------
# PART 2
# ---------------------------------------------------------------------------
def common_topn(path_to_corpus, path_to_indv1, path_to_indv2, top_n):
    '''
    common_topn takes the path to the whole corpus, the paths to 2 individual
    files, and top n; it finds the words common to the topN tf-idf words of
    both documents, prints them and plots their scores against each other.
    If there are none, a message suggesting a larger N is printed instead.
    param: str - path_to_corpus, glob pattern matching every corpus file
    param: str - path_to_indv1, a file matched by path_to_corpus
    param: str - path_to_indv2, a file matched by path_to_corpus
    param: int - top_n
    return list of tuples (eg: [(wd1, score1file1, score1file2), (wd2, score2file1, score2file2), .... (wdN, scoreNfile1, scoreNfile2)]),
            empty when no words are shared, and scatter plot scorefile1 vs scorefile2
    raises ValueError if either individual file is not among the corpus files
    '''
    files = glob.glob(path_to_corpus)
    playnames = [os.path.split(play)[1][:-4].replace('_', ' ').title() for play in files]

    key_indices = []
    for path_to_indv in (path_to_indv1, path_to_indv2):
        keytext = os.path.split(path_to_indv)[1][:-4].replace('_', ' ').title()
        if keytext not in playnames:
            raise ValueError("%r is not among the files matched by %r"
                             % (path_to_indv, path_to_corpus))
        key_indices.append(playnames.index(keytext))
    key_index1, key_index2 = key_indices

    raw_freqs = []
    df = {}

    for f in files:
        # 'with' closes each corpus file as soon as it has been read
        with open(f, 'r') as handle:
            text = tokenize(preprocess(handle.read()))
        rawf, df = rawf_df(text, df)
        raw_freqs.append(rawf)

    scores1 = tf_idf(raw_freqs[key_index1], df, len(raw_freqs))[:top_n]
    scores2 = tf_idf(raw_freqs[key_index2], df, len(raw_freqs))[:top_n]

    # Each word appears once per score list, so a dict lookup replaces the
    # pairwise comparison while keeping file 1's ranking order.
    top2 = dict(scores2)
    common_words = [(wd, score1, top2[wd]) for wd, score1 in scores1 if wd in top2]

    #if common words then plot
    if common_words:
        print(common_words)
        tf_idf_1 = [entry[1] for entry in common_words]
        tf_idf_2 = [entry[2] for entry in common_words]
        plt1.figure(figsize=(15,12))
        plt1.xlabel("tf-idf file 1")
        plt1.ylabel("tf-idf file 2")
        plt1.scatter(tf_idf_1, tf_idf_2, s=200, alpha=.5)
        for wd, score1, score2 in common_words:
            plt1.text(score1, score2, wd)
    else:
        print("Increase value of N to find common terms")

    return common_words

#     all_texts_scores = rf_text_over_rf_corpus(raw_freqs)
#     scores = sorted(all_texts_scores[key_index], key = lambda x:x[1], reverse = True)
#     print
#     print scores[:top_n]

# Example run: words shared by the top 60 tf-idf terms of Macbeth and Othello.
common_topn(
    "../corpora/shakespeare_plaintext/*.txt",
    "../corpora/shakespeare_plaintext/macbeth.txt",
    "../corpora/shakespeare_plaintext/othello.txt",
    60
)
