#### Imports

In [70]:
import string
from collections import Counter
import os
import pickle

import numpy as np
import pandas as pd

#
# Domain specific libraries to handle text
#
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer


#### First-Time Use

This notebook downloads some data and generates some pre-processed files we will need later.

Set the `first_time` flag below to `True` **once**; after that, it will be faster to run the notebook
with the flag set to `False`.

In [71]:
# Gate for one-time setup: when True, the NLTK resources are downloaded further below.
first_time=True

#### Data Directories

You must download the [Reuters 50 from this link](https://archive.ics.uci.edu/ml/datasets/Reuter_50_50), unzip it and save it to the `raw` data directory so that data is organized as per the variables below.

In [72]:
raw_data_dir = "Train_2018"                # original data set used for training
test_dir     = "Test_2018"                 # original test data set
data_dir     = "Intermediate_Result_2018"  # directory to save intermediate results

# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of os.path.exists() followed by os.mkdir().
os.makedirs(data_dir, exist_ok=True)

In [73]:
# One-time setup: fetch the NLTK data packages this notebook relies on
if first_time:
    import nltk
    # 'punkt' is required by the tokenizer; 'stopwords' provides the stop-word lists
    for resource in ('punkt', 'stopwords'):
        nltk.download(resource)

[nltk_data] Downloading package punkt to C:\Users\Heyang
[nltk_data]     Huang\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Heyang
[nltk_data]     Huang\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [74]:
# The stemmer must exist before stem_tokenizer is called; it was previously
# never created in this notebook, so a fresh kernel raised NameError.
porter_stemmer = PorterStemmer()


def stem_tokenizer(text):
    """Tokenize ``text`` and return the Porter stem of every token.

    The text is lower-cased and apostrophes are replaced by spaces before
    it is split with NLTK's ``word_tokenize``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        One stemmed token per token produced by ``word_tokenize``.
    """
    normalized = text.lower().replace("'", " ")
    return [porter_stemmer.stem(token) for token in word_tokenize(normalized)]

In [75]:
# Individual punctuation characters, one list entry each
punctuation = [symbol for symbol in string.punctuation]
# Tokens to discard: English stop words, punctuation, and the two quote tokens below
stop = [*stopwords.words("english"), *punctuation, '``', "''"]

## Read in the 10-K

In [76]:
# Index the training files: one row per .txt file in raw_data_dir.
# Only .txt entries are kept so stray files or sub-directories cannot end up
# in the table (and later be handed to the vectorizers as documents).
txt_files = [name for name in os.listdir(raw_data_dir) if name.lower().endswith(".txt")]

# Company name is the file name without its extension; the path is relative
# to the notebook and uses "/" exactly as before.
companyName = np.array([os.path.splitext(name)[0] for name in txt_files])
companyPath = np.array([raw_data_dir + "/" + name for name in txt_files])

# Build the frame in one step instead of growing arrays inside a loop.
documents = pd.DataFrame({
    "filename": txt_files,
    "companyName": companyName,
    "companyPath": companyPath,
})

In [77]:
# Show the file table (filename, companyName, companyPath) for a visual check.
documents

Unnamed: 0,filename,companyName,companyPath
0,CHIPOTLE MEXICAN GRILL INC.txt,CHIPOTLE MEXICAN GRILL INC,Train_2018/CHIPOTLE MEXICAN GRILL INC.txt
1,CHOICE HOTELS INTERNATIONAL INC _DE.txt,CHOICE HOTELS INTERNATIONAL INC _DE,Train_2018/CHOICE HOTELS INTERNATIONAL INC _DE...
2,CHOICEONE FINANCIAL SERVICES INC.txt,CHOICEONE FINANCIAL SERVICES INC,Train_2018/CHOICEONE FINANCIAL SERVICES INC.txt
3,CHRISTOPHER & BANKS CORP.txt,CHRISTOPHER & BANKS CORP,Train_2018/CHRISTOPHER & BANKS CORP.txt
4,ChromaDex Corp..txt,ChromaDex Corp.,Train_2018/ChromaDex Corp..txt
5,CHS INC.txt,CHS INC,Train_2018/CHS INC.txt
6,Chubb Ltd.txt,Chubb Ltd,Train_2018/Chubb Ltd.txt
7,CHUGACH ELECTRIC ASSOCIATION INC.txt,CHUGACH ELECTRIC ASSOCIATION INC,Train_2018/CHUGACH ELECTRIC ASSOCIATION INC.txt
8,CHURCH & DWIGHT CO INC _DE_.txt,CHURCH & DWIGHT CO INC _DE_,Train_2018/CHURCH & DWIGHT CO INC _DE_.txt
9,CHURCHILL DOWNS Inc.txt,CHURCHILL DOWNS Inc,Train_2018/CHURCHILL DOWNS Inc.txt


## Initialize Vectorizers

In [78]:
# Both vectorizers are fitted on file paths (documents["companyPath"]), so both
# need input="filename"; without it the path strings themselves are treated as
# the documents instead of the file contents.
countVectorizer = CountVectorizer(
    input="filename",
    tokenizer=stem_tokenizer,
    stop_words=stop,
)
tfidfVectorizer = TfidfVectorizer(
    input="filename",
    min_df=1,
    max_df=0.9,
    stop_words=stop,
    decode_error='ignore',  # skip bytes that cannot be decoded when reading the files
)
# NOTE(review): `stop` holds unstemmed words while countVectorizer's tokenizer
# emits stems, so stop words whose stem differs from the word may not be
# filtered - confirm whether `stop` should be stemmed as well.

In [None]:
# Learn each vectorizer's vocabulary from the training files and build the training matrices
train_paths = documents["companyPath"]
X_count = countVectorizer.fit_transform(train_paths)
X_tfidf = tfidfVectorizer.fit_transform(train_paths)

In [None]:
# Transform the held-out files in test_dir with the already-fitted vectorizers.
# Previously the training paths were transformed again, so the "test" features
# were just a copy of the training features.
test_paths = [
    test_dir + "/" + name
    for name in os.listdir(test_dir)
    if name.lower().endswith(".txt")
]
X_count_test = countVectorizer.transform(test_paths)
X_tfidf_test = tfidfVectorizer.transform(test_paths)

Save the temporary result

In [None]:
# Pickle destinations for the count-based artifacts, all inside data_dir
count_vectorizer_filename = f"{data_dir}/count_vectorizer.p"
count_features_filename = f"{data_dir}/count_features.p"
count_test_features_filename = f"{data_dir}/count_test_features.p"

In [None]:
# Pickle destinations for the TF-IDF artifacts, all inside data_dir
tfidf_vectorizer_filename = f"{data_dir}/tfidf_vectorizer.p"
tfidf_features_filename = f"{data_dir}/tfidf_features.p"
tfidf_test_features_filename = f"{data_dir}/tfidf_test_features.p"

In [None]:
# Persist the fitted vectorizers and feature matrices for later use.
# Each file is opened in a `with` block so the handle is flushed and closed
# right after the dump instead of being left open.
artifacts_to_save = [
    (countVectorizer, count_vectorizer_filename),
    (X_count, count_features_filename),
    (X_count_test, count_test_features_filename),
    (tfidfVectorizer, tfidf_vectorizer_filename),
    (X_tfidf, tfidf_features_filename),
    (X_tfidf_test, tfidf_test_features_filename),
]
for artifact, artifact_path in artifacts_to_save:
    with open(artifact_path, "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)