In [1]:
import os
os.chdir('../')

In [2]:
import csv
import re
import string
import nltk

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

import numpy as np
import pandas as pd

from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


[nltk_data] Downloading package stopwords to /home/gui/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gui/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/gui/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
# list of word types (nouns and adjectives) to leave in the text
defTags = ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJS', 'JJR']#, 'RB', 'RBS', 'RBR', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

# functions to determine the type of a word
def is_noun(tag):
    return tag in ['NN', 'NNS', 'NNP', 'NNPS']


def is_verb(tag):
    return tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']


def is_adverb(tag):
    return tag in ['RB', 'RBR', 'RBS']


def is_adjective(tag):
    return tag in ['JJ', 'JJR', 'JJS']

# transform tag forms
def penn_to_wn(tag):
    if is_adjective(tag):
        return nltk.stem.wordnet.wordnet.ADJ
    elif is_noun(tag):
        return nltk.stem.wordnet.wordnet.NOUN
    elif is_adverb(tag):
        return nltk.stem.wordnet.wordnet.ADV
    elif is_verb(tag):
        return nltk.stem.wordnet.wordnet.VERB
    return nltk.stem.wordnet.wordnet.NOUN

In [4]:
def clean(comment_string, lemmatizer):
        clean_tokens = []
        replacement_count = []

        
        token = comment_string
        
        for thing, tag in nltk.pos_tag(token.split()):
            if (thing not in string.punctuation):
                clean_tokens.append(lemmatizer.lemmatize(thing, penn_to_wn(tag)))
        
        clean_tokens = [word for word in clean_tokens if word not in stopwords.words('english')]
            
        token = ' '.join(clean_tokens)
                
        matches = re.findall(r'\\\w*', token)
        matches = [re.sub(r'\\', '', word) for word in list(set(matches))]
        token, n = re.subn(r'\\\w*', ' ', token)
        
        token = token + ' '.join(matches)
        
        token, n = re.subn(r'\[', ' ', token)
        replacement_count.append(n)
        token, n = re.subn(r'\]', ' ', token)
        replacement_count.append(n)
        
        token, n = re.subn(r'\n', ' ', token)
        replacement_count.append(n)


        token, n = re.subn(r'\?', ' ', token)
        replacement_count.append(n)
        token, n = re.subn(r'\"', ' ', token)
        replacement_count.append(n)

        token, n = re.subn(r'\!', ' ', token)
        replacement_count.append(n)
        token, n = re.subn(r'\,', ' ', token)
        replacement_count.append(n)
        token, n = re.subn(r'\.', ' ', token)
        replacement_count.append(n)
        token, n = re.subn(r'\:', ' ', token)
        replacement_count.append(n)
        token, n = re.subn(r'\;', ' ', token)
        replacement_count.append(n)
        token, n = re.subn(r'\)', ' ', token)
        replacement_count.append(n)
        token, n = re.subn(r'\(', ' ', token)
        replacement_count.append(n)

        token, n = re.subn(r"\'", ' ', token)
        replacement_count.append(n)
        token, n = re.subn(r'\+', ' ', token)
        replacement_count.append(n)
        token, n = re.subn(r"\-", ' ', token)
        replacement_count.append(n)
        token, n = re.subn(r"\~", ' ', token)
        replacement_count.append(n)
        token, n = re.subn(r"\*", ' ', token)
        replacement_count.append(n)
        token, n = re.subn(r"\&", ' ', token)
        replacement_count.append(n)
        token, n = re.subn(r"\{", ' ', token)
        replacement_count.append(n)
        token, n = re.subn(r"\}", ' ', token)
        replacement_count.append(n)
        token, n = re.subn(r"\|", ' ', token)
        replacement_count.append(n)
        token, n = re.subn(r"\/", ' ', token)
        replacement_count.append(n)
        token, n = re.subn(r"\#", ' # ', token)
        replacement_count.append(n)
        token, n = re.subn(' +', ' ', token)
        replacement_count.append(n)
    
    
        token, n = re.subn(r' 200\d ', ' [year]', token)
        replacement_count.append(n)
        token, n = re.subn(r' 20\d\d ', ' [year]', token)
        replacement_count.append(n)
        token, n = re.subn(r' 199\d ', ' [year]', token)
        replacement_count.append(n)

      
        token, n = re.subn(r' \d+', ' [number] ', token)
        replacement_count.append(n)

        token = token.lower()
        
#         token, n = re.subn(r'\$( )*\$', ' ', token)
#         replacement_count.append(n)
        
#         token, n = re.subn(r'\$.*\$', ' ', token)
#         replacement_count.append(n)

        token, n = re.subn(r'\_', ' ', token)
        replacement_count.append(n)
            
        return ' '.join(token.split()), replacement_count

In [53]:
with open('dataset/train.csv') as csv_file:
    csv_reader = csv.reader(csv_file)
    data = list(csv_reader)
data = np.array(data)

In [83]:
text = data[1:,1]
arxiv_label = data[1:,2]

lemmatizer = WordNetLemmatizer()
n = 341
print(clean(text[n], lemmatizer), arxiv_label[n])

('the hybrid organic inorganic perovskites hoips attract much attention potential application novel optoelectronic devices remarkably rashba band splitting together specific spin orientation k space i e spin texture find relevant optoelectronic performances in work use first principle calculation symmetry analyses study electric polarization magnetism spin texture property antiferromagnetic afm hoip ferroelectric tmcm mncl3 tmcm ch33nch2cl trimethylchloromethyl ammonium this recently synthesize compound prototype order disorder displacement type ferroelectric large piezoelectric response high ferroelectric transition temperature excellent photoluminescence property you et al science [number] [number] [year]the interesting result inversion symmetry break couple spin orbit couple give rise rashba like band splitting related robust persistent spin texture pst and or typical spiral spin texture manipulate tune ferroelectric or surprisingly also afm magnetic order parameter the tunability s

In [5]:
def clean_data(file_path, data = 'train'):    
    ARXIV = ['astro-ph', 'astro-ph.CO', 'astro-ph.GA', 'astro-ph.SR',
       'cond-mat.mes-hall', 'cond-mat.mtrl-sci', 'cs.LG', 'gr-qc',
       'hep-ph', 'hep-th', 'math.AP', 'math.CO', 'physics.optics',
       'quant-ph', 'stat.ML']
    
    lemmatizer = WordNetLemmatizer()
    
    total_replacement = []
    
    with open(file_path) as csv_file:
        csv_reader = csv.reader(csv_file)
        colnames = next(csv_reader)

        print('cleaning...')
        if data == 'train':
            raw_data = [[_, clean(comment, lemmatizer)[0], ARXIV.index(cl)] for _, comment, cl in list(csv_reader)]
            X, y = np.array(raw_data)[:, 1], np.array(raw_data)[:, 2]
            for _, comment, _ in list(csv_reader):
                total_replacement.append(clean(comment, lemmatizer)[1])
            print('done!')
            return X, y 
            
        elif data == 'test':
            raw_data = [[_, clean(comment, lemmatizer)[0]] for _, comment in list(csv_reader)]
            X, y = np.array(raw_data)[:, 1], None
            print('done!')
            return X, None

In [74]:
total = clean_data('dataset/train.csv')

cleaning...
done!


In [75]:
total = np.array(total)

In [77]:
np.sum(total, axis=0)

array([   899,    872,      0,     67,   1443,     53,  41448,  50875,
         1205,    758,  13556,  13383,   2089,   1860,  34167,   1019,
          437,     51,   7307,   7283,    534,   3237,     12, 739364,
          336,    358,    143,  26194,   5844])

In [93]:
total.shape

(7500, 37)

In [78]:
def pre_process(file_path, data = 'train', vectorizer = 'tfidf', max_features = 20000):    
    ARXIV = ['astro-ph', 'astro-ph.CO', 'astro-ph.GA', 'astro-ph.SR',
       'cond-mat.mes-hall', 'cond-mat.mtrl-sci', 'cs.LG', 'gr-qc',
       'hep-ph', 'hep-th', 'math.AP', 'math.CO', 'physics.optics',
       'quant-ph', 'stat.ML']
    
    lemmatizer = WordNetLemmatizer()
    
    if vectorizer == 'tfidf':
        vectorizer = TfidfVectorizer(max_features = max_features)
    elif vectorizer == 'count':
        vectorizer = CountVectorizer(max_features = max_features)
    elif vectorizer == 'binary':
        vectorizer = CountVectorizer(max_features = max_features, binary = True)


    with open(file_path) as csv_file:
        csv_reader = csv.reader(csv_file)
        colnames = next(csv_reader)

        print('cleaning...')
        if data == 'train':
            raw_data = [[_, clean(comment, lemmatizer)[0], ARXIV.index(cl)] for _, comment, cl in list(csv_reader)]
            X, y = np.array(raw_data)[:, 1], np.array(raw_data)[:, 2]
            
        elif data == 'test':
            raw_data = [[_, clean(comment, lemmatizer)] for _, comment in list(csv_reader)]
            X, y = np.array(raw_data)[:, 1], None

    if data == 'train':

        print('vectorizing...')
        X = vectorizer.fit_transform(X).toarray()
        
        print('done!')
        return X, y, vectorizer
      
    elif data == 'test':
        print('vectorizing...')
        X = vectorizer.transform(X).toarray()

        print('done!')
        return X, None

In [96]:
X, y, vectorizer = pre_process('dataset/train.csv',
                                 data = 'train',
                                 vectorizer = 'tfidf',
                                 max_features = 5000)

mnb = MultinomialNB(0.4)
cross_scores = cross_val_score(mnb, X, y, cv=5)

cleaning...
vectorizing...
done!


In [97]:
cross_scores.mean()

0.7861333333333334

In [6]:
X, y = clean_data('dataset/train.csv')

cleaning...
done!


In [10]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB())
])
parameters = {
    'tfidf__max_features': (None, 1000, 2000, 3000, 4000, 5000, 6000, 7000),
    'clf__alpha': (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1)
}

grid = GridSearchCV(estimator=pipe, param_grid=parameters, verbose=10)

best_model = grid.fit(X, y)

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[CV] clf__alpha=0.1, tfidf__max_features=None ........................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__alpha=0.1, tfidf__max_features=None, score=0.805, total=   0.9s
[CV] clf__alpha=0.1, tfidf__max_features=None ........................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s remaining:    0.0s


[CV]  clf__alpha=0.1, tfidf__max_features=None, score=0.783, total=   0.9s
[CV] clf__alpha=0.1, tfidf__max_features=None ........................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.8s remaining:    0.0s


[CV]  clf__alpha=0.1, tfidf__max_features=None, score=0.785, total=   0.9s
[CV] clf__alpha=0.1, tfidf__max_features=None ........................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    2.7s remaining:    0.0s


[CV]  clf__alpha=0.1, tfidf__max_features=None, score=0.794, total=   0.9s
[CV] clf__alpha=0.1, tfidf__max_features=None ........................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    3.6s remaining:    0.0s


[CV]  clf__alpha=0.1, tfidf__max_features=None, score=0.795, total=   0.9s
[CV] clf__alpha=0.1, tfidf__max_features=1000 ........................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    4.6s remaining:    0.0s


[CV]  clf__alpha=0.1, tfidf__max_features=1000, score=0.753, total=   0.9s
[CV] clf__alpha=0.1, tfidf__max_features=1000 ........................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    5.5s remaining:    0.0s


[CV]  clf__alpha=0.1, tfidf__max_features=1000, score=0.743, total=   0.9s
[CV] clf__alpha=0.1, tfidf__max_features=1000 ........................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    6.4s remaining:    0.0s


[CV]  clf__alpha=0.1, tfidf__max_features=1000, score=0.729, total=   0.9s
[CV] clf__alpha=0.1, tfidf__max_features=1000 ........................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    7.4s remaining:    0.0s


[CV]  clf__alpha=0.1, tfidf__max_features=1000, score=0.740, total=   0.9s
[CV] clf__alpha=0.1, tfidf__max_features=1000 ........................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    8.3s remaining:    0.0s


[CV]  clf__alpha=0.1, tfidf__max_features=1000, score=0.732, total=   0.9s
[CV] clf__alpha=0.1, tfidf__max_features=2000 ........................
[CV]  clf__alpha=0.1, tfidf__max_features=2000, score=0.769, total=   0.9s
[CV] clf__alpha=0.1, tfidf__max_features=2000 ........................
[CV]  clf__alpha=0.1, tfidf__max_features=2000, score=0.764, total=   0.9s
[CV] clf__alpha=0.1, tfidf__max_features=2000 ........................
[CV]  clf__alpha=0.1, tfidf__max_features=2000, score=0.767, total=   0.9s
[CV] clf__alpha=0.1, tfidf__max_features=2000 ........................
[CV]  clf__alpha=0.1, tfidf__max_features=2000, score=0.767, total=   1.0s
[CV] clf__alpha=0.1, tfidf__max_features=2000 ........................
[CV]  clf__alpha=0.1, tfidf__max_features=2000, score=0.760, total=   0.9s
[CV] clf__alpha=0.1, tfidf__max_features=3000 ........................
[CV]  clf__alpha=0.1, tfidf__max_features=3000, score=0.787, total=   0.9s
[CV] clf__alpha=0.1, tfidf__max_features=3000 ...

[CV]  clf__alpha=0.2, tfidf__max_features=5000, score=0.777, total=   0.9s
[CV] clf__alpha=0.2, tfidf__max_features=5000 ........................
[CV]  clf__alpha=0.2, tfidf__max_features=5000, score=0.782, total=   0.9s
[CV] clf__alpha=0.2, tfidf__max_features=5000 ........................
[CV]  clf__alpha=0.2, tfidf__max_features=5000, score=0.789, total=   0.9s
[CV] clf__alpha=0.2, tfidf__max_features=5000 ........................
[CV]  clf__alpha=0.2, tfidf__max_features=5000, score=0.787, total=   0.9s
[CV] clf__alpha=0.2, tfidf__max_features=6000 ........................
[CV]  clf__alpha=0.2, tfidf__max_features=6000, score=0.796, total=   0.9s
[CV] clf__alpha=0.2, tfidf__max_features=6000 ........................
[CV]  clf__alpha=0.2, tfidf__max_features=6000, score=0.776, total=   0.9s
[CV] clf__alpha=0.2, tfidf__max_features=6000 ........................
[CV]  clf__alpha=0.2, tfidf__max_features=6000, score=0.781, total=   0.9s
[CV] clf__alpha=0.2, tfidf__max_features=6000 ...

[CV]  clf__alpha=0.4, tfidf__max_features=None, score=0.789, total=   0.9s
[CV] clf__alpha=0.4, tfidf__max_features=None ........................
[CV]  clf__alpha=0.4, tfidf__max_features=None, score=0.785, total=   0.9s
[CV] clf__alpha=0.4, tfidf__max_features=1000 ........................
[CV]  clf__alpha=0.4, tfidf__max_features=1000, score=0.753, total=   0.9s
[CV] clf__alpha=0.4, tfidf__max_features=1000 ........................
[CV]  clf__alpha=0.4, tfidf__max_features=1000, score=0.737, total=   0.9s
[CV] clf__alpha=0.4, tfidf__max_features=1000 ........................
[CV]  clf__alpha=0.4, tfidf__max_features=1000, score=0.728, total=   0.9s
[CV] clf__alpha=0.4, tfidf__max_features=1000 ........................
[CV]  clf__alpha=0.4, tfidf__max_features=1000, score=0.745, total=   0.9s
[CV] clf__alpha=0.4, tfidf__max_features=1000 ........................
[CV]  clf__alpha=0.4, tfidf__max_features=1000, score=0.734, total=   0.9s
[CV] clf__alpha=0.4, tfidf__max_features=2000 ...

[CV]  clf__alpha=0.5, tfidf__max_features=4000, score=0.795, total=   0.9s
[CV] clf__alpha=0.5, tfidf__max_features=4000 ........................
[CV]  clf__alpha=0.5, tfidf__max_features=4000, score=0.772, total=   0.9s
[CV] clf__alpha=0.5, tfidf__max_features=4000 ........................
[CV]  clf__alpha=0.5, tfidf__max_features=4000, score=0.783, total=   0.9s
[CV] clf__alpha=0.5, tfidf__max_features=4000 ........................
[CV]  clf__alpha=0.5, tfidf__max_features=4000, score=0.787, total=   0.9s
[CV] clf__alpha=0.5, tfidf__max_features=4000 ........................
[CV]  clf__alpha=0.5, tfidf__max_features=4000, score=0.777, total=   0.9s
[CV] clf__alpha=0.5, tfidf__max_features=5000 ........................
[CV]  clf__alpha=0.5, tfidf__max_features=5000, score=0.797, total=   0.9s
[CV] clf__alpha=0.5, tfidf__max_features=5000 ........................
[CV]  clf__alpha=0.5, tfidf__max_features=5000, score=0.779, total=   0.9s
[CV] clf__alpha=0.5, tfidf__max_features=5000 ...

[CV]  clf__alpha=0.6, tfidf__max_features=7000, score=0.782, total=   0.9s
[CV] clf__alpha=0.6, tfidf__max_features=7000 ........................
[CV]  clf__alpha=0.6, tfidf__max_features=7000, score=0.789, total=   0.9s
[CV] clf__alpha=0.6, tfidf__max_features=7000 ........................
[CV]  clf__alpha=0.6, tfidf__max_features=7000, score=0.780, total=   0.9s
[CV] clf__alpha=0.7, tfidf__max_features=None ........................
[CV]  clf__alpha=0.7, tfidf__max_features=None, score=0.797, total=   0.9s
[CV] clf__alpha=0.7, tfidf__max_features=None ........................
[CV]  clf__alpha=0.7, tfidf__max_features=None, score=0.779, total=   1.0s
[CV] clf__alpha=0.7, tfidf__max_features=None ........................
[CV]  clf__alpha=0.7, tfidf__max_features=None, score=0.778, total=   0.9s
[CV] clf__alpha=0.7, tfidf__max_features=None ........................
[CV]  clf__alpha=0.7, tfidf__max_features=None, score=0.791, total=   0.9s
[CV] clf__alpha=0.7, tfidf__max_features=None ...

[CV]  clf__alpha=0.8, tfidf__max_features=2000, score=0.766, total=   0.9s
[CV] clf__alpha=0.8, tfidf__max_features=3000 ........................
[CV]  clf__alpha=0.8, tfidf__max_features=3000, score=0.781, total=   0.9s
[CV] clf__alpha=0.8, tfidf__max_features=3000 ........................
[CV]  clf__alpha=0.8, tfidf__max_features=3000, score=0.765, total=   0.9s
[CV] clf__alpha=0.8, tfidf__max_features=3000 ........................
[CV]  clf__alpha=0.8, tfidf__max_features=3000, score=0.772, total=   0.9s
[CV] clf__alpha=0.8, tfidf__max_features=3000 ........................
[CV]  clf__alpha=0.8, tfidf__max_features=3000, score=0.779, total=   0.9s
[CV] clf__alpha=0.8, tfidf__max_features=3000 ........................
[CV]  clf__alpha=0.8, tfidf__max_features=3000, score=0.771, total=   0.9s
[CV] clf__alpha=0.8, tfidf__max_features=4000 ........................
[CV]  clf__alpha=0.8, tfidf__max_features=4000, score=0.793, total=   0.9s
[CV] clf__alpha=0.8, tfidf__max_features=4000 ...

[CV]  clf__alpha=0.9, tfidf__max_features=6000, score=0.777, total=   0.9s
[CV] clf__alpha=0.9, tfidf__max_features=6000 ........................
[CV]  clf__alpha=0.9, tfidf__max_features=6000, score=0.782, total=   0.9s
[CV] clf__alpha=0.9, tfidf__max_features=6000 ........................
[CV]  clf__alpha=0.9, tfidf__max_features=6000, score=0.787, total=   0.9s
[CV] clf__alpha=0.9, tfidf__max_features=6000 ........................
[CV]  clf__alpha=0.9, tfidf__max_features=6000, score=0.777, total=   0.9s
[CV] clf__alpha=0.9, tfidf__max_features=7000 ........................
[CV]  clf__alpha=0.9, tfidf__max_features=7000, score=0.791, total=   0.9s
[CV] clf__alpha=0.9, tfidf__max_features=7000 ........................
[CV]  clf__alpha=0.9, tfidf__max_features=7000, score=0.777, total=   0.9s
[CV] clf__alpha=0.9, tfidf__max_features=7000 ........................
[CV]  clf__alpha=0.9, tfidf__max_features=7000, score=0.784, total=   0.9s
[CV] clf__alpha=0.9, tfidf__max_features=7000 ...

[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:  6.2min finished


In [11]:
print(best_model.best_score_, best_model.best_params_)

0.7925333333333333 {'clf__alpha': 0.1, 'tfidf__max_features': None}
