In [2]:
import csv
import re
import string
import nltk

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

import numpy as np

from sklearn.model_selection import train_test_split

from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer


[nltk_data] Downloading package stopwords to /home/gui/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gui/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/gui/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
with open('../dataset/train.csv') as csv_file:
    csv_reader = csv.reader(csv_file)
    data = list(csv_reader)
data = np.array(data)

In [4]:
data.shape

(7501, 3)

In [5]:
text = data[1:,1]
arxiv_label = data[1:,2]

In [6]:
n= 1340
text[n], arxiv_label[n]

('  We report on the detection of dark matter in the cluster Abell 2218 using the\nweak gravitational distortion of background galaxies. We find a highly\nsignificant, coherent detection of the distortion in the images of the\nbackground galaxies. The inferred 2D mass distribution has a peak that is\ncoincident with the optical and X-ray centroid. The qualitative distributions\nof the cluster light, the X-ray emission and the dark matter are similar and\nthe projected total mass, gas, and light surface densities are consistent with\na $r^{-1}$ profile at distance of $r > 180^{\\prime\\prime}$ from the cluster cD\ngalaxy. Using the weak lensing technique, we determine a lower bound for the\ntotal mass in A2218 of $(3.9 \\pm 0.7) \\times 10^{14}$~h$^{-1}$~M$_\\odot$ within\na fiducial aperture of radius 0.4~h$^{-1}$Mpc. The associated cluster\nmass-to-light ratio is $(440 \\pm 80)$~h~$M_\\odot/L_{\\odot B}$. The mass\nestimated by the weak lensing method is consistent with that inferred 

In [7]:
categories = np.unique(arxiv_label)

In [8]:
categories

array(['astro-ph', 'astro-ph.CO', 'astro-ph.GA', 'astro-ph.SR',
       'cond-mat.mes-hall', 'cond-mat.mtrl-sci', 'cs.LG', 'gr-qc',
       'hep-ph', 'hep-th', 'math.AP', 'math.CO', 'physics.optics',
       'quant-ph', 'stat.ML'], dtype='<U2273')

In [9]:
dist = {}
for _,_, cat in data[1:]:
    if cat in dist:
        dist[cat] += 1
    else:
        dist[cat] = 1

In [10]:
dist

{'astro-ph': 500,
 'astro-ph.CO': 500,
 'astro-ph.GA': 500,
 'astro-ph.SR': 500,
 'cond-mat.mes-hall': 500,
 'cond-mat.mtrl-sci': 500,
 'cs.LG': 500,
 'gr-qc': 500,
 'hep-ph': 500,
 'hep-th': 500,
 'math.AP': 500,
 'math.CO': 500,
 'physics.optics': 500,
 'quant-ph': 500,
 'stat.ML': 500}

In [11]:
# functions to determine the type of a word
def is_noun(tag):
    return tag in ['NN', 'NNS', 'NNP', 'NNPS']


def is_verb(tag):
    return tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']


def is_adverb(tag):
    return tag in ['RB', 'RBR', 'RBS']


def is_adjective(tag):
    return tag in ['JJ', 'JJR', 'JJS']

# transform tag forms
def penn_to_wn(tag):
    if is_adjective(tag):
        return nltk.stem.wordnet.wordnet.ADJ
    elif is_noun(tag):
        return nltk.stem.wordnet.wordnet.NOUN
    elif is_adverb(tag):
        return nltk.stem.wordnet.wordnet.ADV
    elif is_verb(tag):
        return nltk.stem.wordnet.wordnet.VERB
    return nltk.stem.wordnet.wordnet.NOUN

In [12]:
def clean(comment_string, lemmatizer):
        clean_tokens = []

        token = comment_string
        
        for thing, tag in nltk.pos_tag(token.split()):
            if (thing not in string.punctuation):
                clean_tokens.append(lemmatizer.lemmatize(thing, penn_to_wn(tag)))
        
        clean_tokens = [word for word in clean_tokens if word not in stopwords.words('english')]
            
        token = ' '.join(clean_tokens)
                
        matches = re.findall(r'\\\w*', token)
        matches = [re.sub(r'\\', '', word) for word in list(set(matches))]
        token = re.sub(r'\\\w*', ' ', token)
        token = token + ' '.join(matches)
        
        token = re.sub(r'\[', ' ', token)
        token = re.sub(r'\]', ' ', token)

        
        token = re.sub(r'\n', ' ', token)

        token = re.sub(r'\?', ' ', token)
        token = re.sub(r'\"', ' ', token)
        token = re.sub(r'\!', ' ', token)
        token = re.sub(r'\,', ' ', token)
        token = re.sub(r'\.', ' ', token)
        token = re.sub(r'\:', ' ', token)
        token = re.sub(r'\;', ' ', token)
        token = re.sub(r'\)', ' ', token)
        token = re.sub(r'\(', ' ', token)

        token = re.sub(r"\'", ' ', token)
        token = re.sub(r'\+', ' ', token)
        token = re.sub(r"\-", ' ', token)
        token = re.sub(r"\~", ' ', token)
        token = re.sub(r"\*", ' ', token)
        token = re.sub(r"\&", ' ', token)
        token = re.sub(r"\{", ' ', token)
        token = re.sub(r"\}", ' ', token)
        token = re.sub(r"\|", ' ', token)
        token = re.sub(r"\/", ' ', token)
        token = re.sub(r"\#", ' # ', token)
        token = re.sub(' +', ' ', token)

        token = re.sub(r' 200\d ', ' [year]', token)
        token = re.sub(r' 20\d\d ', ' [year]', token)
        token = re.sub(r' 199\d ', ' [year]', token)

        token = re.sub(r' \d+', ' [number] ', token)

        token = token.lower()
        
#         token = re.sub(r'\$( )*\$', ' ', token)
# 
#         token = re.sub(r'\$.*\$', ' ', token)
# 

        token = re.sub(r'\_', ' ', token)

        return ' '.join(token.split())

In [13]:
lemmatizer = WordNetLemmatizer()
n = 3933
print(clean(text[n], lemmatizer), arxiv_label[n])

a recent paper demonstrate considerable degree self similarity rr lyrae star atomic scale analogues excite helium atom undergo single level transition n [number] n [number] discrete self similarity fractal analogue indentified term masses radii oscillation periods basic morphology kinematics in second paper subject extremely large carefully analyzed sample rr lyrae oscillation period provide evidence unique match predicted set discrete periods base exclusively known helium spectrum discrete scaling equation fractal cosmological paradigm observed period spectrum rr lyrae stars astro-ph


In [20]:
def pre_process(file_path, data = 'train', vectorizer = 'tfidf', max_features = None, existing_vectorizer=None):    
    ARXIV = ['astro-ph', 'astro-ph.CO', 'astro-ph.GA', 'astro-ph.SR',
       'cond-mat.mes-hall', 'cond-mat.mtrl-sci', 'cs.LG', 'gr-qc',
       'hep-ph', 'hep-th', 'math.AP', 'math.CO', 'physics.optics',
       'quant-ph', 'stat.ML']
    
    lemmatizer = WordNetLemmatizer()
    
    if existing_vectorizer:
        vectorizer = existing_vectorizer
    else:
        if vectorizer == 'tfidf':
            vectorizer = TfidfVectorizer(max_features = max_features)
        elif vectorizer == 'count':
            vectorizer = CountVectorizer(max_features = max_features)
        elif vectorizer == 'binary':
            vectorizer = CountVectorizer(max_features = max_features, binary = True)


    with open(file_path) as csv_file:
        csv_reader = csv.reader(csv_file)
        colnames = next(csv_reader)

        print('cleaning...')
        if data == 'train':
            raw_data = [[_, clean(comment, lemmatizer), ARXIV.index(cl)] for _, comment, cl in list(csv_reader)]
            X, y = np.array(raw_data)[:, 1], np.array(raw_data)[:, 2]
            
        elif data == 'test':
            raw_data = [[_, clean(comment, lemmatizer)] for _, comment in list(csv_reader)]
            X, y = np.array(raw_data)[:, 1], None

    if data == 'train':

        print('vectorizing...')
        X = vectorizer.fit_transform(X).toarray()
        
        print('done!')
        return X, y, vectorizer
      
    elif data == 'test':
        print('vectorizing...')
        X = vectorizer.transform(X).toarray()

        print('done!')
        return X, None

In [21]:
X, y, vectorizer = pre_process('../dataset/train.csv', data = 'train', vectorizer = 'tfidf', max_features=10)

cleaning...
vectorizing...
done!


In [22]:
y

array(['0', '8', '6', ..., '9', '11', '9'], dtype='<U1807')