In [None]:
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


def text_data(pathdata):
    """Read one document and return its normalized text.

    The file content is lowercased, runs of digits and underscores are
    replaced by spaces, the text is split on non-word characters, English
    stop words are dropped and every remaining token is Porter-stemmed.

    Args:
        pathdata: path of the text file to read.

    Returns:
        The stemmed tokens joined by single spaces (no newlines, because
        the split consumes every non-word character).
    """
    # NOTE(review): the file is opened with the platform default encoding;
    # confirm the corpus decodes cleanly with it.
    with open(pathdata, 'r') as f:
        text = f.read()
    text = text.lower()              # Convert text to lowercase
    text = re.sub(r'\d+', ' ', text)  # Remove numbers
    text = re.sub(r'_', ' ', text)    # Remove '_' character
    tokens = re.split(r'\W+', text)   # Remove punctuation and tokenize

    # Build the stop-word set and the stemmer once per document instead of
    # once per token: the original re-read the stop-word list and scanned
    # it linearly for every single word.
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    # `word` is tested for truthiness because re.split yields empty strings
    # when the text starts or ends with a separator.
    words = [stemmer.stem(word)
             for word in tokens
             if word and word not in stop_words]
    return ' '.join(words)

def collect_data(pathin):
    """Preprocess every document under ``pathin`` into one text blob.

    ``pathin`` is expected to contain one sub-folder per class. Each
    document becomes one line ``<label><<>><file name><<>><text>`` where
    ``label`` is the index of its folder in the sorted folder list and
    ``text`` is the result of ``text_data`` on the file.

    Args:
        pathin: path of the root folder holding the class sub-folders.

    Returns:
        All document lines joined with ``'\\n'``.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the
    # folder -> label mapping deterministic, so two trees with the same
    # folder names (train / test) receive the same labels.
    # Non-directory entries are skipped so that a stray file in the root
    # does not crash the nested listdir or consume a label.
    folders = sorted(
        name for name in os.listdir(pathin)
        if os.path.isdir(os.path.join(pathin, name))
    )
    data = []
    for label, folder in enumerate(folders):
        pathdocument = pathin + "/" + folder
        # Sorted as well so the output file is reproducible between runs.
        for doct in sorted(os.listdir(pathdocument)):
            text = text_data(pathdocument + "/" + doct)
            data.append(str(label) + "<<>>" + doct + "<<>>" + text)
    return '\n'.join(data)

# Folder that holds the two corpus trees and receives the derived files.
base_dir="/home/lnq/Desktop/20192/code_lab/Text Preprocessing"

text_train=collect_data(base_dir+"/20news-bydate-train")
text_test=collect_data(base_dir+"/20news-bydate-test")

# The outputs are written beside the corpus folders:
#  - the train path previously lacked a "/" ("20news-bydate-traintext_train"),
#    so the "text_train" file read by the feature-extraction cell was never
#    created;
#  - the test output used to be written inside the input folder, where the
#    next collect_data run would pick it up as if it were a class folder.
with open(base_dir+"/text_test",'w') as f:
    f.write(text_test)
with open(base_dir+"/text_train",'w') as f:
    f.write(text_train)

In [1]:
# compute feature vector
import numpy as np

# compute idf function
def compute_idf(numducument, df):
    """Return the inverse document frequency ``log10(numducument / df)``.

    Args:
        numducument: total number of documents.
        df: number of documents containing the word; must be positive.

    Returns:
        The base-10 logarithm of the ratio, as computed by ``np.log10``.

    Raises:
        AssertionError: if ``df`` is not greater than zero.
    """
    assert df > 0
    # Promote to float *before* dividing: the original multiplied by 1.
    # after the division, which is too late to prevent truncation on
    # interpreters where int / int is integer division.
    return np.log10(numducument * 1. / df)

# tf-idf weight of one word in one document
def compute_tfidf(fwd,tfmax,idf):
    """Return ``idf * fwd / tfmax``, forced to floating point.

    Args:
        fwd: frequency of the word in the document.
        tfmax: divisor used to normalize ``fwd`` (the caller passes the
            largest word frequency of the document).
        idf: inverse document frequency of the word.
    """
    # Multiplying by 1.0 first keeps the whole expression in floats.
    scaled_frequency = idf * 1.0 * fwd
    return scaled_frequency / tfmax
# binary search for a word in a sorted word list
def isappear(word,wordslist):
    """Return an index of ``word`` in ``wordslist``, or -1 if it is absent.

    The search halves the interval on every step, so ``wordslist`` must be
    sorted in ascending order for the result to be meaningful.
    """
    lo, hi = 0, len(wordslist) - 1
    while lo <= hi:
        mid = (lo + hi) // 2
        candidate = wordslist[mid]
        if word == candidate:
            return mid
        if word < candidate:
            hi = mid - 1   # continue in the lower half
        else:
            lo = mid + 1   # continue in the upper half
    return -1

# compute feature and idf
def word_feature(pathin, df_threshold=3):
    """Select the feature words of a corpus file and compute their idf.

    Every line of ``pathin`` is one document; the text after the last
    ``<<>>`` separator is split on whitespace. A word's document frequency
    (df) is the number of lines whose text contains it.

    Args:
        pathin: path of the preprocessed corpus file.
        df_threshold: only words whose df is strictly greater than this
            value are kept. Defaults to 3.

    Returns:
        A dict mapping each kept word to ``compute_idf(<number of lines>,
        <df>)``, with the words inserted in ascending sorted order.
    """
    with open(pathin, 'r') as f:
        data = f.read().splitlines()

    # Single pass: count, for every word, the documents it occurs in.
    # set() makes a word count once per document however often it repeats.
    words_df = {}
    for document in data:
        fields = document.split("<<>>")
        for word in set(fields[-1].split()):
            words_df[word] = words_df.get(word, 0) + 1

    cntdocument = len(data)

    # Insert in sorted order so the key order of the result matches the
    # sorted vocabulary (dicts preserve insertion order on Python 3.7+).
    words_idf = {}
    for word in sorted(words_df):
        if words_df[word] > df_threshold:
            words_idf[word] = compute_idf(cntdocument, words_df[word])
    return words_idf


# save the feature words and their idf values
def write_idf(words_idf):
    """Write the feature words and their idf values to two files.

    ``words_list`` receives all words on one line separated by spaces;
    ``words_idf`` receives one ``<word><<>><idf>`` line per word, in the
    same order.

    Args:
        words_idf: dict mapping each word to its idf value.
    """
    with open("/home/lnq/Desktop/20192/code_lab/Text Preprocessing/words_list",'w') as f:
        f.write(' '.join(words_idf.keys()))
    with open("/home/lnq/Desktop/20192/code_lab/Text Preprocessing/words_idf", 'w') as f:
        f.writelines(word+"<<>>"+str(value)+"\n"
                     for word, value in words_idf.items())


def find_feature_vector(pathdct):
    """Build the sparse tf-idf representation of every document in a file.

    The feature words and their idf values come from ``word_feature`` and
    are saved with ``write_idf``. Each line of ``pathdct`` is expected to
    be ``<label><<>><name><<>><text>``.

    Args:
        pathdct: path of the preprocessed corpus file.

    Returns:
        A list with one string per document:
        ``<label><<>><name><<>>`` followed by ``<index>:<tf-idf> `` for
        each feature word present in the document, in ascending index
        order. ``index`` is the position of the word in the key order of
        the idf dict. A document containing no feature word yields only
        the ``<label><<>><name><<>>`` prefix.
    """
    words_idf = word_feature(pathdct)  # feature words and their idf
    write_idf(words_idf)

    # Resolve word -> position with a dict built once; positions follow the
    # key order of words_idf, which is also the order write_idf saves.
    feature_index = {}
    idf = []
    for pos, (word, value) in enumerate(words_idf.items()):
        feature_index[word] = pos
        idf.append(value)

    with open(pathdct, 'r') as f:
        data = f.read().splitlines()  # load data

    feature_matrix = []
    for dct in data:
        ifo = dct.split("<<>>")

        # Term frequency of every feature word in this document, counted
        # in a single pass over its tokens.
        counts = {}
        for word in ifo[2].split():
            if word in feature_index:
                counts[word] = counts.get(word, 0) + 1

        entries = []
        # Guard: max() raises ValueError on an empty sequence, which
        # happens whenever a document holds no feature word at all.
        if counts:
            tfmax = max(counts.values())
            # Sorting by index makes the output identical between runs
            # (plain set iteration order is not).
            for word in sorted(counts, key=feature_index.get):
                pos = feature_index[word]
                weight = compute_tfidf(counts[word], tfmax, idf[pos])
                entries.append(str(pos) + ":" + str(weight) + " ")

        feature_matrix.append(ifo[0] + "<<>>" + ifo[1] + "<<>>" + ''.join(entries))
    return feature_matrix

# Build the tf-idf lines of the training corpus ...
train_text_path="/home/lnq/Desktop/20192/code_lab/Text Preprocessing/text_train"
tf_idf=find_feature_vector(train_text_path)

# ... and save them, one document per line.
train_tfidf_path="/home/lnq/Desktop/20192/code_lab/Text Preprocessing/train_tfidf_df=3"
with open(train_tfidf_path,'w') as f:
    f.write('\n'.join(tf_idf))
