# Natural language processing

In [1]:
import numpy as np
import pandas as pd

from sklearn import model_selection as ms, feature_extraction as fe, ensemble

from scipy.sparse import hstack

import spacy

from gensim.matutils import Sparse2Corpus
from gensim.models import LdaModel, Word2Vec

In [2]:
H2020_URL = 'http://cordis.europa.eu/data/cordis-h2020projects.csv'

## Pre-processing using spaCy

Initialise spaCy.

In [3]:
nlp = spacy.load('en')

In [4]:
parsed = nlp('The serpentine syntax of legal language is often used to ' +
             'obfuscate meaning and confuse those outside the law.')

Extract information.

In [5]:
for word in parsed:
    print("{:15}{:15}{:15}{:15}{:15}".format(word.text, word.pos_, word.dep_,
                                             word.lemma_, word.head.lemma_))

The            DET            det            the            syntax         
serpentine     ADJ            amod           serpentine     syntax         
syntax         NOUN           nsubjpass      syntax         use            
of             ADP            prep           of             syntax         
legal          ADJ            amod           legal          language       
language       NOUN           pobj           language       of             
is             VERB           auxpass        be             use            
often          ADV            advmod         often          use            
used           VERB           ROOT           use            use            
to             PART           aux            to             obfuscate      
obfuscate      VERB           xcomp          obfuscate      use            
meaning        NOUN           dobj           meaning        obfuscate      
and            CCONJ          cc             and            obfuscate      
confuse     

## Bag of words and random forests

Read in the H2020 dataset.

In [6]:
h2020 = pd.read_csv(H2020_URL, sep=';', decimal=',')

Keep only signed contracts.

In [7]:
h2020 = h2020[h2020['status'] == 'SIGNED']

Remove missing values.

In [8]:
h2020 = h2020[h2020['totalCost'].notna()]

Create a new variable representing whether the project was fully funded by the European Commission.

In [9]:
h2020['fully_funded'] = ~(h2020['ecMaxContribution'] < h2020['totalCost'])

Count words and 2-grams (combinations of two words) in the 'objective', keeping only those that occur at least 5 times.

In [10]:
vectorizer = fe.text.CountVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=5
)

Prepare the data.

In [11]:
X = vectorizer.fit_transform(h2020['objective'])
y = h2020['fully_funded'].astype('int')

Include total project cost and coordinator country (using the UK as reference).

In [12]:
country_dummies = pd.get_dummies(h2020['coordinatorCountry']).drop('UK', axis=1)
X = hstack([X, h2020['totalCost'][:, np.newaxis], country_dummies])

Train a random forest with 20 decision trees.

In [13]:
rf1 = ensemble.RandomForestClassifier(n_estimators=20)
rf1.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

Define stratified folds for cross-validation.

In [14]:
ten_fold_cv = ms.StratifiedKFold(n_splits=10, shuffle=True)

Compute average AUC across folds.

In [15]:
aucs = ms.cross_val_score(rf1, X, y, scoring='roc_auc', cv=ten_fold_cv)
np.mean(aucs)

0.91290779178710313

Extract variable importances and sort in descending order.

In [16]:
importances = pd.DataFrame({
    'variable': vectorizer.get_feature_names() + ['totalCost'] + list(country_dummies.columns),
    'importance': rf1.feature_importances_
})
importances.sort_values('importance', ascending=False, inplace=True)
importances.head(10)

Unnamed: 0,importance,variable
65006,0.016295,totalCost
35075,0.013903,market
7515,0.011872,business
22635,0.010726,feasibility
12719,0.009264,costs
12613,0.006917,cost
61801,0.004843,understanding
54881,0.004469,solution
45629,0.004299,product
30031,0.003922,innovation


Compute tf–idf (alternatively use `TfidfTransformer` on the output of `CountVectorizer`).

In [17]:
vectorizer = fe.text.TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=5
)

Prepare the data (as above).

In [18]:
X_tfidf = vectorizer.fit_transform(h2020.objective)
X_tfidf = hstack([X_tfidf, np.asmatrix(h2020.totalCost).T, country_dummies])

Train a random forest with 20 decision trees.

In [19]:
rf2 = ensemble.RandomForestClassifier(n_estimators=20)
rf2.fit(X_tfidf, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

Compute average AUC across folds.

In [20]:
aucs = ms.cross_val_score(rf2, X_tfidf, y, scoring='roc_auc', cv=ten_fold_cv)
np.mean(aucs)

0.9079928545789604

Extract variable importances and sort in descending order.

In [21]:
importances = pd.DataFrame({
    'variable': vectorizer.get_feature_names() + ['totalCost'] + list(country_dummies.columns),
    'importance': rf2.feature_importances_
})
importances.sort_values('importance', ascending=False, inplace=True)
importances.head(10)

Unnamed: 0,importance,variable
65006,0.025284,totalCost
35075,0.01891,market
22635,0.009149,feasibility
13672,0.006114,customers
7515,0.005407,business
61801,0.005182,understanding
54881,0.005031,solution
12719,0.004583,costs
54994,0.004474,solutions
42102,0.004404,patented


## LDA using `gensim`

Count words in the 'objective', keeping only those that occur at least 5 times.

In [22]:
vectorizer = fe.text.CountVectorizer(
    stop_words='english',
    min_df=5
)
X = vectorizer.fit_transform(h2020.objective)

Convert to `gensim` format.

In [23]:
corpus = Sparse2Corpus(X, documents_columns=False)

Create mapping from word IDs (integers) to words (strings).

In [24]:
id2word = dict(enumerate(vectorizer.get_feature_names()))

Fit LDA model with 10 topics.

In [25]:
lda = LdaModel(corpus=corpus, num_topics=10, id2word=id2word)

Show top 5 words for each of the 10 topics.

In [26]:
lda.show_topics(num_topics=10, num_words=5)

[(0,
  '0.006*"proteins" + 0.006*"molecular" + 0.006*"project" + 0.005*"mechanisms" + 0.005*"protein"'),
 (1,
  '0.009*"cancer" + 0.007*"disease" + 0.007*"clinical" + 0.007*"patients" + 0.006*"treatment"'),
 (2,
  '0.013*"research" + 0.007*"new" + 0.005*"cell" + 0.005*"project" + 0.004*"understanding"'),
 (3,
  '0.007*"project" + 0.007*"climate" + 0.007*"data" + 0.005*"new" + 0.005*"health"'),
 (4,
  '0.010*"materials" + 0.009*"production" + 0.009*"process" + 0.008*"project" + 0.007*"food"'),
 (5,
  '0.011*"market" + 0.010*"data" + 0.009*"energy" + 0.008*"project" + 0.008*"technology"'),
 (6,
  '0.013*"project" + 0.008*"research" + 0.007*"new" + 0.007*"european" + 0.006*"social"'),
 (7,
  '0.009*"new" + 0.007*"materials" + 0.007*"high" + 0.007*"quantum" + 0.006*"systems"'),
 (8,
  '0.010*"project" + 0.008*"european" + 0.007*"research" + 0.005*"new" + 0.004*"public"'),
 (9,
  '0.016*"innovation" + 0.010*"project" + 0.010*"market" + 0.009*"management" + 0.009*"smes"')]

## word2vec using `gensim`

Convert adjectives and verbs to corresponding lemmas using spaCy.

In [27]:
def to_lemmas(text):
    return [
        x.lemma_ if x.pos in (spacy.parts_of_speech.ADJ, spacy.parts_of_speech.VERB)
        else x.text
        for x in nlp(text)
    ]

objectives = h2020['objective'].apply(to_lemmas)

Fit word2vec model.

In [28]:
w2c = Word2Vec(sentences=objectives, size=100, window=5, min_count=5)

Which words are most similar to 'UK'?

In [29]:
w2c.wv.most_similar('UK')

[('Spain', 0.9113762378692627),
 ('Netherlands', 0.9079961776733398),
 ('Germany', 0.9077062606811523),
 ('Portugal', 0.8999629020690918),
 ('Italy', 0.8987637758255005),
 ('France', 0.8933299779891968),
 ('Sweden', 0.8925649523735046),
 ('Poland', 0.8824748396873474),
 ('Austria', 0.8678978681564331),
 ('Ireland', 0.8600377440452576)]

Which words are most similar to 'UK' but not related to 'France'?

In [30]:
w2c.wv.most_similar(positive=['UK'], negative=['France'])

[('portfolio', 0.5283050537109375),
 ('adoption', 0.48575708270072937),
 ('mission', 0.48084893822669983),
 ('sector', 0.4524659812450409),
 ('excellence', 0.448476105928421),
 ('ambition', 0.4471786618232727),
 ('competitiveness', 0.44599270820617676),
 ('economy', 0.4452618658542633),
 ('company', 0.4411161541938782),
 ('capability', 0.4308639168739319)]

Which word doesn't go with the others?

In [31]:
w2c.wv.doesnt_match(['Italy', 'Japan', 'France', 'UK'])

'Japan'