<a href="https://colab.research.google.com/github/Heity94/TWSM_Lab/blob/main/Project/Notebooks/PH_Extract_NounPhrase.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import pickle
import spacy

In [2]:
# Load the small English spaCy model, disabling the lemmatizer and NER
# components, which are not needed for the noun-chunk extraction below.
nlp = spacy.load("en_core_web_sm", disable=["lemmatizer", "ner"]) 

In [3]:
# Show the pipeline components that remain active after disabling the others
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7f7b38ecb280>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7f7b38ecb130>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7f7b38dbab50>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7f7b38cc14b0>)]

In [4]:
# Set to True when running on Google Colab; the following cell then mounts
# Google Drive and defines data_path.
colab = True

In [5]:
if colab:
    # Mount Google Drive so the shared project data becomes reachable
    from google.colab import drive
    drive.mount('/content/drive')

    # Path to the project data in Google Drive.
    # NOTE: data_path is only defined on this branch; for a local run it has
    # to be set separately before the cells below are executed.
    data_path = "/content/drive/MyDrive/2022_Analytics Lab Student Projects/Data/All Topics"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
sentences = pd.read_csv(data_path+"/Fulltext of Corpus/sentences.csv")  # add nrows=1000 to the call for a quick test run

In [7]:
# Keep only rows whose sentence_type is neither EMPTY nor TAG.
# TODO: decide which other sentence types can be ignored as well.
sentences_clean = sentences[~sentences["sentence_type"].isin(["EMPTY", "TAG"])]
del sentences  # release the raw frame before the next steps
# Remove repeated sentence texts and keep just the id and the text.
sentences_clean = (
    sentences_clean
    .drop_duplicates(subset="sentence")
    .loc[:, ["sentence_id", "sentence"]]
)

In [8]:
# Preview the first two cleaned rows
sentences_clean.head(2)

Unnamed: 0,sentence_id,sentence
2,1_2_22,Examining interdependence between product user...
5,1_24_25,A


In [9]:
# Number of (rows, columns) left after cleaning
sentences_clean.shape

(3231381, 2)

## Create spacy pipe functions & joblib functions

In [10]:
def pos_noun_pipe(doc):
    """Return the text of every noun chunk in ``doc`` as a list of strings.

    ``doc`` must expose a ``noun_chunks`` iterable whose items have a ``text``
    attribute, as a parsed spaCy ``Doc`` does.
    """
    chunk_texts = []
    for chunk in doc.noun_chunks:
        chunk_texts.append(chunk.text)
    return chunk_texts

In [11]:
from joblib import Parallel, delayed

def chunker(iterable, total_length, chunksize):
    """Return a generator over consecutive slices of ``iterable``.

    Slices start at offsets 0, chunksize, 2 * chunksize, ... below
    ``total_length`` and each holds at most ``chunksize`` items.
    """
    offsets = range(0, total_length, chunksize)
    return (iterable[start: start + chunksize] for start in offsets)

def flatten(list_of_lists):
    """Concatenate the items of every inner list into one flat list, in order."""
    combined = []
    for sublist in list_of_lists:
        combined.extend(sublist)
    return combined

def process_chunk(texts, batch_size=1000):
    """Extract the noun phrases of every text in one chunk.

    Parameters
    ----------
    texts : iterable of str
        Texts to run through the module-level spaCy pipeline ``nlp``.
    batch_size : int, default 1000
        Number of texts ``nlp.pipe`` buffers per batch.

    Returns
    -------
    list
        One ``pos_noun_pipe`` result per processed text.
    """
    return [pos_noun_pipe(doc) for doc in nlp.pipe(texts, batch_size=batch_size)]


def preprocess_parallel(texts, chunksize=5000, n_jobs=7, batch_size=1000):
    """Extract noun phrases from ``texts`` using several worker processes.

    ``texts`` is split into slices of ``chunksize`` items, each slice is
    handed to ``process_chunk`` in a joblib worker, and the per-chunk results
    are flattened into a single list.

    Parameters
    ----------
    texts : sequence of str
        Must support ``len()`` and slicing (e.g. a list or a pandas Series).
    chunksize : int, default 5000
        Number of texts sent to a worker per task.
    n_jobs : int, default 7
        Number of joblib worker processes (previously hard-coded to 7).
    batch_size : int, default 1000
        Forwarded to ``process_chunk`` / ``nlp.pipe`` (previously hard-coded).

    Returns
    -------
    list
        One list of noun-phrase strings per input text.
    """
    executor = Parallel(n_jobs=n_jobs, backend='multiprocessing', prefer="processes")
    do = delayed(process_chunk)
    tasks = (
        do(chunk, batch_size=batch_size)
        for chunk in chunker(texts, len(texts), chunksize=chunksize)
    )
    result = executor(tasks)
    return flatten(result)

In [12]:
# Size of the frame that goes into the noun-phrase extraction
sentences_clean.shape

(3231381, 2)

## Test different batch and chunk sizes

In [13]:
# Take the first 100 rows as an independent copy for quick experiments
tst = sentences_clean.iloc[:100].copy()

In [14]:
%%time
# Smoke test on the 100-row sample; with chunksize=10_000 all rows land in a single chunk.
tst['noun_phrases'] = preprocess_parallel(tst['sentence'].astype("object"), chunksize=10_000)

CPU times: user 326 ms, sys: 502 ms, total: 828 ms
Wall time: 1.6 s


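- 10,000 sentences in 17 s with chunk size 5,000 and batch size 1,000
- 10,000 sentences in 22 s with chunk size 500 and batch size 1,000
- 10,000 sentences in 22 s with chunk size 10,000 and batch size 5,000
- 10,000 sentences in 20 s with chunk size 10,000 and batch size 1,000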

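- 20,000 sentences in 37.3 s with chunk size 5,000 and batch size 1,000
- 20,000 sentences in 37.9 s with chunk size 5,000 and batch size 2,000
- 20,000 sentences in 35.3 s with chunk size 5,000 and batch size 100
- 20,000 sentences in 33.1 s with chunk size 10,000 and batch size 100
- 20,000 sentences in 35.2 s with chunk size 10,000 and batch size 50
- 20,000 sentences in 40.7 s with chunk size 10,000 and batch size 200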

In [15]:
# Estimated runtime in hours for all sentences, assuming a constant throughput
# of 600 sentences per second (presumably taken from the timings above, TODO confirm):
# rows / 600 gives seconds, / 60 gives minutes, / 60 gives hours (about 1.5 h).
sentences_clean.shape[0]/600/60/60

1.4960097222222224

## POS Tag all sentences
Run with a chunk size of 10,000 and a batch size of 1,000.

In [16]:
%%time
# Full run over all cleaned sentences; adds one list of noun phrases per row.
# NOTE(review): astype(str) turns missing values into the literal string "nan" -- confirm no sentence is missing.
sentences_clean['noun_phrases'] = preprocess_parallel(sentences_clean['sentence'].astype(str), chunksize=10_000)

CPU times: user 1min 13s, sys: 17 s, total: 1min 30s
Wall time: 1h 33min 50s


In [17]:
# Inspect five randomly sampled rows to sanity-check the extracted noun phrases
sentences_clean.sample(5)

Unnamed: 0,sentence_id,sentence,noun_phrases
5328168,5464_19948_19961,"In comparison, the catalog group averages were...","[comparison, the catalog group averages, both ..."
3282534,3416_4198_4213,perceptions and expectations of the system and...,"[perceptions, expectations, the system, organi..."
4528287,4634_23914_23918,−6415.4 \n0.050 \n \n,[−6415.4]
2671193,2860_14753_14782,"With behavior controls, sensemakers are told h...","[behavior controls, sensemakers, their work, s..."
4921098,5032_6663_6675,forming all the substrings that still include ...,"[all the substrings, that, the symbols]"


In [19]:
# Row count is unchanged; the frame now has the extra noun_phrases column
sentences_clean.shape

(3231381, 3)

In [20]:
# Drop the sentence text column; sentence_id and noun_phrases remain
sentences_tagged = sentences_clean.drop(columns=["sentence"])

In [27]:
# Explode the list of noun phrases so that each phrase gets its own row
# (sentence_id is repeated for every phrase of a sentence).
# NOTE(review): rows whose list is empty should come out as a single NaN row -- confirm this is wanted.
sentences_tagged = sentences_tagged.explode("noun_phrases")

In [28]:
# Number of (rows, columns) after exploding the noun-phrase lists
sentences_tagged.shape

(17483785, 2)

In [29]:
# Inspect five randomly sampled rows of the exploded frame
sentences_tagged.sample(5)

Unnamed: 0,sentence_id,noun_phrases
3076220,3237_13455_13493,the availability
1370134,1485_3235_3263,the pathway
2202976,2389_16132_16168,principles
2797178,2964_1654_1656,Power
2601318,2786_3985_4019,the performance


In [30]:
# Team output folder, located next to the folder data_path points to.
# The parent folder is derived by splitting at the last "/" instead of cutting
# off a fixed number of characters, so the result stays correct if the name of
# the last folder in data_path changes.
data_path_group = "".join(data_path.rstrip("/").rpartition("/")[:2]) + "Topic 1/Data_Team1/"

In [31]:
# Write the exploded noun phrases to the team folder as CSV
# (with pandas' default settings the DataFrame index is written as the first column).
sentences_tagged.to_csv(data_path_group+"sentences_noun_phrs_cmpl.csv")