### Loading packages 

In [1]:
import pandas as pd
import numpy as np
import os
import subprocess
from spacy import displacy
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm_notebook
import multiprocessing as mp # parallelization purpose.
import json 

In [2]:
# Toggle read by the lemmatization cell: True -> batch the corpus through nlp.pipe,
# False -> call nlp() on one document at a time.
use_multithread = True
language = 'en'  # This notebook only works for English ('en') or French (any other value).

### Loading Data

In [3]:
# Keyword that selects the documents to analyse: it is matched case-insensitively,
# as a substring, against each concept annotation when the corpus is loaded.
# The commented lines are alternative topics (with their French counterpart).
#topic = 'Electric vehicle' if language == 'en' else 'vehicule electrique'
#topic="energy" if language == 'en' else 'energie'
#topic= "hydrogen" if language=='en' else 'hydrogène'
topic = 'autonomous'

In [4]:
# Root folder of the JSON corpus for the selected language; it holds one sub-folder per batch.
path = f'../../cacib-work/data/cacib_jsonfull_20200218/{language}/'
# Keep directories only, so stray files (e.g. macOS '.DS_Store') are never passed to os.listdir.
folders = [folder for folder in os.listdir(path) if os.path.isdir(os.path.join(path, folder))]
topic_lower = topic.lower()  # loop-invariant, computed once
texts = []
for folder in tqdm_notebook(folders):
    folder_path = os.path.join(path, folder)
    for file in os.listdir(folder_path):
        # Match on the extension: a substring test would also pick up names like 'x.json.bak'.
        if not file.endswith('.json'):
            continue
        # Explicit UTF-8 so decoding does not depend on the machine's locale.
        with open(os.path.join(folder_path, file), encoding='utf-8') as f:
            data = json.load(f)
        # The file is already closed here; keep the document if any concept mentions the topic.
        concepts = data['annotations'].get('Concept', [])
        if any(topic_lower in concept['replacewith'].lower() for concept in concepts):
            texts.append(data['text'])
len(texts)

HBox(children=(IntProgress(value=0, max=2576), HTML(value='')))




1827

### POS Tagging and Lemmatization using spaCy

In [5]:
import spacy

#### Spacy Integration

In [6]:
def download_model(name=''):
    """Download a spaCy model by running ``<python> -m spacy download <name>``.

    Parameters
    ----------
    name : str
        Model name handed to ``spacy download`` (e.g. "en_core_web_md").

    Returns
    -------
    int
        Exit code of the download subprocess.
    """
    import sys  # local import: only this helper needs it

    # sys.executable is the interpreter running this kernel, so the model is installed
    # into the same environment; a bare "python" could resolve to another installation.
    completed = subprocess.run([sys.executable, "-m", "spacy", "download", name])
    return completed.returncode

In [7]:
# Medium-sized spaCy model for the selected language (French for anything but 'en').
model_name = "en_core_web_md" if language == 'en' else 'fr_core_news_md'
try:
    nlp = spacy.load(model_name)
except OSError:
    # spacy.load raises OSError when the model package cannot be found.
    # Attempt a single download and fail loudly if it does not succeed, instead of
    # retrying forever behind a bare `except` (which also swallowed KeyboardInterrupt).
    if download_model(model_name) != 0:
        raise RuntimeError(f"Could not download spaCy model '{model_name}'")
    nlp = spacy.load(model_name)

#### POS tagging to build a custom stopword list

In [8]:
#doc = nlp(texts[0])
#displacy.render(doc, style="ent")

#### Stopwords 

In [9]:
"""
stopwords = []
for text in tqdm_notebook(texts):
    model = nlp(text)
    for token in model: 
        if token.pos_ not in ['PROPN','NOUN','ADJ']:
            stopwords.append(token.text)"""

"\nstopwords = []\nfor text in tqdm_notebook(texts):\n    model = nlp(text)\n    for token in model: \n        if token.pos_ not in ['PROPN','NOUN','ADJ']:\n            stopwords.append(token.text)"

In [10]:
# Import the stop-word module explicitly: reaching it through `spacy.lang.<xx>` attribute
# access only works when that submodule has already been imported as a side effect.
if language == 'en':
    from spacy.lang.en.stop_words import STOP_WORDS as stopwords
else:
    from spacy.lang.fr.stop_words import STOP_WORDS as stopwords

#### Lemmatization 

In [11]:
# Keep a handle on the raw documents: `texts` is rebound to the cleaned corpus further down.
_texts = texts

In [12]:
def get_lemma(text):
    """Return the lemmas of the content words of ``text``, each preceded by a space.

    A token is kept when its POS tag is PROPN, NOUN or ADJ and it is neither a
    stop word nor number-like. The result is '' when no token qualifies.
    """
    kept_pos = ['PROPN', 'NOUN', 'ADJ']
    pieces = []
    for tok in nlp(text):
        if tok.pos_ not in kept_pos or tok.is_stop or tok.like_num:
            continue
        pieces.append(' ' + tok.lemma_)
    return ''.join(pieces)
# Before/after preview on the first 200 characters of the first raw document.
print(
    '------ ORIGINAL TEXT ------\n',
    _texts[0][:200],
    '\n------ TEXT CLEANED ------',
    sep='\n',
)
get_lemma(_texts[0][:200])

------ ORIGINAL TEXT ------

Nearly 50,000 General Motors Workers Go On Strike: Trump Implies Iran May Be Responsible For Saudi Oil Attack: Oil Prices Spike After Saudi Attack, U.S. Blames Iran: Multiple People Hurt In Explosion 

------ TEXT CLEANED ------


' General Motors worker strike trump Iran responsible saudi Oil attack oil price Spike Saudi Attack U.S. Iran multiple People explosion'

In [13]:
def _lemmatize_doc(doc):
    """Join the lemmas of the kept tokens of a parsed document, each preceded by a space."""
    return ''.join(
        ' ' + token.lemma_
        for token in doc
        if token.pos_ in ['PROPN', 'NOUN', 'ADJ'] and not token.is_stop and not token.like_num
    )


if use_multithread:
    n_core = mp.cpu_count()
    batch_size = 500 if len(_texts) > 1000 else 250
    # `n_threads` is deprecated in spaCy; `n_process` is the supported way to spread
    # the work over several cores. NER and the parser are skipped: only POS tags and
    # lemmas are read below.
    docs = nlp.pipe(_texts, disable=["ner", "parser"], n_process=n_core, batch_size=batch_size)
else:
    docs = (nlp(text) for text in _texts)

# Both sources are lazy iterables without a length, so pass the total explicitly
# to get a real progress bar.
texts = [_lemmatize_doc(doc) for doc in tqdm_notebook(docs, total=len(_texts))]
print('The number of texts cleaned = {}'.format(len(texts)))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


The number of texts cleaned = 1827


### Bag of Words using Count Vectorizer 

In [14]:
# Count unigrams, bigrams and trigrams over the cleaned corpus.
# `stop_words` is passed as a list: recent scikit-learn releases reject a set.
cv = CountVectorizer(ngram_range=(1,3), stop_words=list(stopwords))
cv_fit = cv.fit_transform(texts)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when available.
if hasattr(cv, 'get_feature_names_out'):
    word_list = list(cv.get_feature_names_out())
else:
    word_list = cv.get_feature_names()
# Sum the sparse matrix directly: .toarray() would materialise a dense
# (documents x n-grams) array, which is enormous for a vocabulary of this size.
count_list = np.asarray(cv_fit.sum(axis=0)).ravel()

  'stop_words.' % sorted(inconsistent))


In [15]:
# Map every n-gram to its corpus-wide frequency, then show how many distinct n-grams exist.
result_dict = {word: count for word, count in zip(word_list, count_list)}
len(result_dict)

772912

In [16]:
# One row per n-gram (as index) with its frequency, most frequent first.
df = (
    pd.DataFrame.from_dict(result_dict, orient='index', columns=['count'])
    .sort_values(by='count', ascending=False)
)
df.head(100)

Unnamed: 0,count
vehicle,18594
company,8958
autonomous,8437
year,7892
car,6586
system,6430
new,6333
technology,6044
market,4126
autonomous vehicle,3735


In [17]:
# Move the n-gram text from the index into an 'index' column.
# The guard keeps the cell idempotent: re-running it must not insert a second index column.
if 'index' not in df.columns:
    df.reset_index(inplace=True)
# Number of whitespace-separated words in each n-gram (1, 2 or 3), computed vectorized.
df['nb_words'] = df['index'].str.split().str.len()
df.head()

Unnamed: 0,index,count,nb_words
0,vehicle,18594,1
1,company,8958,1
2,autonomous,8437,1
3,year,7892,1
4,car,6586,1


In [18]:
# Bigrams only, most frequent first.
df.loc[df['nb_words'].eq(2)].sort_values('count', ascending=False)

Unnamed: 0,index,count,nb_words
9,autonomous vehicle,3735,2
66,bg base,1506,2
64,base font,1506,2
65,embed color,1506,2
69,font accent,1496,2
82,autonomous driving,1347,2
103,electric vehicle,1134,2
104,year contract,1132,2
120,products services,1024,2
122,new products,1021,2


In [19]:
# Summary statistics of the numeric columns (n-gram frequency and n-gram length).
df.describe()

Unnamed: 0,count,nb_words
count,772912.0,772912.0
mean,3.214379,2.550779
std,41.538422,0.561737
min,1.0,1.0
25%,1.0,2.0
50%,1.0,3.0
75%,2.0,3.0
max,18594.0,3.0


In [20]:
# Destination workbook for the keyword tables, named after the topic.
# Note: this rebinds `path`, which earlier pointed at the corpus folder.
path = f'outputs/bow_keywords_{topic}.xlsx'
path

'outputs/bow_keywords_autonomous.xlsx'

In [21]:
# Sheet name for each n-gram length, keyed by that length written as a string.
sheetnames = {
    str(n_words): label
    for n_words, label in enumerate(('unigram', 'bigram', 'trigram'), start=1)
}

In [24]:
# Make sure the target folder exists, otherwise the workbook cannot be written.
os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
# The context manager saves and closes the workbook exactly once; calling save()
# and then close() closed it twice, and save() no longer exists in pandas 2.x.
with pd.ExcelWriter(path, engine='xlsxwriter') as writer:
    for key, sheet_name in sheetnames.items():
        # One sheet per n-gram length, restricted to n-grams seen at least 10 times.
        df[(df['nb_words'] == int(key)) & (df['count'] >= 10)].to_excel(writer, sheet_name=sheet_name)