### sim_py_textJEWK.ipynb

### Text mining: compare two books

# Packages

### Install packages

"codecs" is for reading the text files,
"re" (regular expressions) and "collections" are for working with tokens,
"nltk" (Natural Language Toolkit) provides the tokenizer, stemmer and stopword list.

In [None]:
!pip install pandas
!pip install numpy
!pip install scipy
!pip install sklearn
!pip install nltk
!pip install matplotlib

### Import packages

In [None]:
import codecs
import re
import copy
import collections

In [None]:
import numpy as np
import pandas as pd

In [None]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer

In [None]:
from __future__ import division


In [None]:
import matplotlib
% matplotlib inline

# Download stopwords

### Some specialized functions from NLTK
You can also download everything in NLTK with nltk.download(), but it will take time!

In [None]:
# Fetch the NLTK "stopwords" corpus; it is imported from nltk.corpus and read
# with stopwords.words('english') further down.
nltk.download('stopwords')

Import the stopwords package from NLTK

In [None]:
from nltk.corpus import stopwords

# Data

### Read data for Windows

In [None]:
# Read each novel completely into one string, decoding the file as UTF-8.
# The paths are relative, so both .txt files must sit in the working directory.
with codecs.open('JaneEyre.txt', "r", encoding="utf-8") as f:
    text_JE = f.read()
with codecs.open('WutheringHeights.txt', "r", encoding="utf-8") as f:
    text_WH = f.read()


# Process data
Check for stopwords

In [None]:
# English stopwords from NLTK, extended with "would" so it is filtered out too.
esw = stopwords.words('english') + ["would"]

Filter tokens (using regular expressions)

In [None]:
# Accepts only tokens made up entirely of word characters, which rejects
# punctuation tokens. Written as a raw string because "\w" in a plain literal
# is an invalid escape sequence that newer Python versions warn about.
word_pattern = re.compile(r"^\w+$")

Create a token counter function

In [None]:
def get_text_counter(text):
    """Count the stemmed, stopword-free word tokens of *text*.

    The text is split with NLTK's ``WordPunctTokenizer`` and lower-cased.
    Tokens that do not match ``word_pattern`` or that appear in ``esw`` are
    dropped, and every remaining token is reduced to its Porter stem.

    Args:
        text: The whole document as a single string.

    Returns:
        A ``(counter, size)`` tuple: a ``collections.Counter`` mapping each
        stem to its number of occurrences, and the total number of tokens
        that were counted.
    """
    stemmer = PorterStemmer()
    # `esw` is a list; a set built once per call makes each lookup O(1).
    stop_words = set(esw)
    tokens = []
    for raw_token in WordPunctTokenizer().tokenize(text):
        token = raw_token.lower()
        # Stopwords are checked before stemming because `esw` holds unstemmed words.
        if word_pattern.match(token) and token not in stop_words:
            # Stem token by token: the stemmer treats its argument as one
            # word, so passing the whole text leaves the words in it unstemmed.
            tokens.append(stemmer.stem(token))
    return collections.Counter(tokens), len(tokens)

Create a function that tabulates the absolute and relative frequencies of the most common words.

In [None]:
def make_df(counter, size):
    """Tabulate word counts next to their share of *size*.

    Args:
        counter: Sequence of ``(word, count)`` pairs, such as the result of
            ``collections.Counter.most_common``.
        size: Denominator that turns each count into a relative frequency.

    Returns:
        A ``pandas.DataFrame`` indexed by word (index named "Most common
        words") with the columns "Absolute frequency" and "Relative frequency".
    """
    words = [pair[0] for pair in counter]
    counts = np.array([pair[1] for pair in counter])
    # Counts and ratios share one 2-column matrix, so integer counts are
    # stored as floats alongside the ratios.
    table = np.column_stack((counts, counts / size))
    frame = pd.DataFrame(
        table,
        index=words,
        columns=["Absolute frequency", "Relative frequency"],
    )
    frame.index.name = "Most common words"
    return frame

# Analysis

## Analyze individual texts

Calculate the most common words of Jane Eyre and display the 15 most common.

In [None]:
# Count Jane Eyre's tokens: je_counter holds the per-token counts and je_size
# the number of tokens kept after filtering.
je_counter, je_size = get_text_counter(text_JE)


In [None]:
# Last expression of the cell, so the notebook renders the 15-row table.
make_df(je_counter.most_common(15), je_size)

Save the 1000 most common words of Jane Eyre to .csv

In [None]:
# Table of the 1000 most frequent Jane Eyre tokens, written to the working directory.
je_df = make_df(je_counter.most_common(1000), je_size)
je_df.to_csv("JE2_1000.csv")

Calculate the most common words of Wuthering Heights and display the 15 most common.

In [None]:
# Count Wuthering Heights' tokens: wh_counter holds the per-token counts and
# wh_size the number of tokens kept after filtering.
wh_counter, wh_size = get_text_counter(text_WH)

In [None]:
# Last expression of the cell, so the notebook renders the 15-row table.
make_df(wh_counter.most_common(15), wh_size)

Save the 1000 most common words of Wuthering Heights to .csv

In [None]:
# Table of the 1000 most frequent Wuthering Heights tokens, written to the working directory.
wh_df = make_df(wh_counter.most_common(1000), wh_size)
wh_df.to_csv("WH2_1000.csv")

# Compare texts

Find the most common words across the two documents.

In [None]:
# Adding two Counters sums the counts of each token, giving combined totals
# over both books.
all_counter = wh_counter + je_counter

In [None]:
# Rank words by their combined count over BOTH books. Ranking by wh_counter
# alone would ignore words that are frequent only in Jane Eyre.
# size=1 leaves the relative column equal to the raw count; only the index
# (the words themselves) is used afterwards.
all_df = make_df(all_counter.most_common(1000), 1)
most_common_words = all_df.index.values

Create a data frame with the differences in word frequency

In [None]:
# One row per word: [share in Jane Eyre, share in Wuthering Heights, absolute gap].
# A word missing from one book contributes a share of 0 for that book.
df_data = [
    [je_share, wh_share, abs(je_share - wh_share)]
    for je_share, wh_share in (
        (je_counter.get(word, 0) / je_size, wh_counter.get(word, 0) / wh_size)
        for word in most_common_words
    )
]
    
    

In [None]:
# Comparison table indexed by word, ordered from the largest frequency gap to the smallest.
diff_columns = [
    "JE relative frequency",
    "WH relative frequency",
    "Differences in relative frequency",
]
diff_df = pd.DataFrame(df_data, index=most_common_words, columns=diff_columns)
diff_df.index.name = "Most common words"
diff_df = diff_df.sort_values(by="Differences in relative frequency", ascending=False)
    

Display the 20 most distinctive words.

In [None]:
# diff_df is sorted by descending difference, so these are the 20 words whose
# relative frequency differs most between the two books.
diff_df.head(20)

Save the full list of distinctive words to dist_JEWH.csv

In [None]:
# Write the complete sorted comparison table to the working directory.
diff_df.to_csv("dist_JEWH.csv")