In [3]:
import fitz
import os
import re
import spacy
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import pickle

In [13]:
def extract_text_from_pdf(pdf_path):
    print(f'Extracting Text from PDF {pdf_path}')
    document = fitz.open(pdf_path)
    text = ""
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        text += page.get_text()
    document.close()
    return text

In [14]:
def remove_et_al(text):
    print('Removing et al')
    cleaned_text = re.sub(r'\(.*?\)', '', text)
    cleaned_text_2 = re.sub(r'\b[\w\s.,\[\]()\'"—–\-:;&]*?(?:&|and)?\s*(\n|\s)*et al\.\s*\(?\d{4}\)?\.?\b', '', cleaned_text, flags=re.IGNORECASE | re.UNICODE)
    return cleaned_text_2

In [15]:
nlp = spacy.load('en_core_web_sm')

def split_into_sentences(text):
    print('Splitting Sentences')
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    return sentences

In [18]:
def process_pdfs_in_folder(folder_path):
    all_sentences = []
    for filename in tqdm(os.listdir(folder_path)):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(folder_path, filename)
            extracted_text = extract_text_from_pdf(pdf_path)
            text_wo_etal = remove_et_al(extracted_text)
            sentences = split_into_sentences(text_wo_etal)
            all_sentences.extend(sentences)
            print('Extended the list\n')
    return all_sentences


In [19]:
folder_path = 'papersToProcess'
all_sentences = process_pdfs_in_folder(folder_path)


  0%|          | 0/61 [00:00<?, ?it/s]

Extracting Text from PDF papersToProcess/GOLDSTONE_demographic,conflict_2002.pdf
Removing et al
Splitting Sentences


  2%|▏         | 1/61 [00:05<05:17,  5.30s/it]

Extended the list

Extracting Text from PDF papersToProcess/Theisen_conflict,clashes,weather_2012.pdf
Removing et al
Splitting Sentences


  3%|▎         | 2/61 [00:11<05:54,  6.00s/it]

Extended the list

Extracting Text from PDF papersToProcess/2022_SchonKoren.pdf
Removing et al
Splitting Sentences


  5%|▍         | 3/61 [00:16<05:19,  5.51s/it]

Extended the list

Extracting Text from PDF papersToProcess/Hsiang, Burke&Miguel_quantifying,conflict_2013.pdf
Removing et al
Splitting Sentences


  7%|▋         | 4/61 [00:27<07:06,  7.47s/it]

Extended the list

Extracting Text from PDF papersToProcess/Schon&Koren_afrogrid,conflict_2022.pdf
Removing et al
Splitting Sentences


  8%|▊         | 5/61 [00:32<06:05,  6.53s/it]

Extended the list

Extracting Text from PDF papersToProcess/Weezel_precipitation,communal conflict_2019.pdf
Removing et al
Splitting Sentences


 10%|▉         | 6/61 [00:37<05:38,  6.15s/it]

Extended the list

Extracting Text from PDF papersToProcess/Goldstone_flashpoints,tipping points,security_2008.pdf
Removing et al
Splitting Sentences


 11%|█▏        | 7/61 [00:40<04:29,  5.00s/it]

Extended the list

Extracting Text from PDF papersToProcess/Buhaug_variability,food production,conflict_2015.pdf
Removing et al
Splitting Sentences


 13%|█▎        | 8/61 [00:44<04:14,  4.80s/it]

Extended the list

Extracting Text from PDF papersToProcess/2022-NATIONAL-DEFENSE-STRATEGY-NPR-MDR.pdf
Removing et al
Splitting Sentences


 15%|█▍        | 9/61 [01:07<09:01, 10.41s/it]

Extended the list

Extracting Text from PDF papersToProcess/Meierding_conflict,small talk_2013.pdf
Removing et al
Splitting Sentences


 16%|█▋        | 10/61 [01:12<07:26,  8.75s/it]

Extended the list

Extracting Text from PDF papersToProcess/Korotayev_trap,instability,social systems_2011.pdf
Removing et al
Splitting Sentences


 18%|█▊        | 11/61 [01:22<07:46,  9.33s/it]

Extended the list

Extracting Text from PDF papersToProcess/Scartozzi_conflicts,reframing_2021.pdf
Removing et al
Splitting Sentences


 20%|█▉        | 12/61 [01:31<07:22,  9.03s/it]

Extended the list

Extracting Text from PDF papersToProcess/Turchin_dynamic,growth_2005.pdf
Removing et al
Splitting Sentences


 21%|██▏       | 13/61 [01:35<06:07,  7.67s/it]

Extended the list

Extracting Text from PDF papersToProcess/Exenberger_mass violence,climate change_2013.pdf
Removing et al
Splitting Sentences


 23%|██▎       | 14/61 [01:38<04:53,  6.25s/it]

Extended the list

Extracting Text from PDF papersToProcess/Brzoska et al_migration,conflict,adaptation_2015.pdf
Removing et al
Splitting Sentences


 25%|██▍       | 15/61 [01:45<04:48,  6.28s/it]

Extended the list

Extracting Text from PDF papersToProcess/Theisen et al_climate wars,drought_2012.pdf
Removing et al
Splitting Sentences


 26%|██▌       | 16/61 [01:52<04:55,  6.57s/it]

Extended the list

Extracting Text from PDF papersToProcess/Stephenson et al_dynamics,links_2010.pdf
Removing et al
Splitting Sentences


 28%|██▊       | 17/61 [01:54<03:54,  5.34s/it]

Extended the list

Extracting Text from PDF papersToProcess/Levy et al_collective violence_2017.pdf
Removing et al
Splitting Sentences


 30%|██▉       | 18/61 [02:00<03:59,  5.58s/it]

Extended the list

Extracting Text from PDF papersToProcess/Schilling_rains,rais,relation_2012.pdf
Removing et al
Splitting Sentences


 31%|███       | 19/61 [03:28<21:09, 30.23s/it]

Extended the list

Extracting Text from PDF papersToProcess/Bergholt_natural disasters,growth,conflict_2010.pdf
Removing et al
Splitting Sentences


 33%|███▎      | 20/61 [08:41<1:18:44, 115.22s/it]

Extended the list

Extracting Text from PDF papersToProcess/Devitt&Tol_war,development_2012.pdf
Removing et al
Splitting Sentences


 34%|███▍      | 21/61 [08:47<54:56, 82.41s/it]   

Extended the list

Extracting Text from PDF papersToProcess/Hendrix&Salehyan_rainfall,social conflict_2012.pdf
Removing et al
Splitting Sentences


 36%|███▌      | 22/61 [08:56<39:09, 60.25s/it]

Extended the list

Extracting Text from PDF papersToProcess/Koubi et al_variability,economic growth_2012.pdf
Removing et al
Splitting Sentences


 38%|███▊      | 23/61 [09:02<27:56, 44.11s/it]

Extended the list

Extracting Text from PDF papersToProcess/Burke et al_warming,civil war_2009.pdf
Removing et al
Splitting Sentences


 41%|████      | 25/61 [09:05<14:34, 24.29s/it]

Extended the list

Extracting Text from PDF papersToProcess/Landis_seasonality,incosistency_2014.pdf
Removing et al
Splitting Sentences


 43%|████▎     | 26/61 [09:10<11:26, 19.63s/it]

Extended the list

Extracting Text from PDF papersToProcess/Buhaug_climate,civil wars_2010.pdf
Removing et al
Splitting Sentences


 44%|████▍     | 27/61 [09:13<08:37, 15.21s/it]

Extended the list

Extracting Text from PDF papersToProcess/2015_BurkeEtAl.pdf
Removing et al
Splitting Sentences


 46%|████▌     | 28/61 [09:23<07:40, 13.94s/it]

Extended the list

Extracting Text from PDF papersToProcess/Slettebak_natural disaster,civil conflict_2012.pdf
Removing et al
Splitting Sentences


 48%|████▊     | 29/61 [09:30<06:21, 11.93s/it]

Extended the list

Extracting Text from PDF papersToProcess/Goldstone et al_model,forecasting,instability_2010.pdf
Removing et al
Splitting Sentences


 49%|████▉     | 30/61 [09:37<05:24, 10.48s/it]

Extended the list

Extracting Text from PDF papersToProcess/Benjaminsen et al_land-use,conflict_2012.pdf
Removing et al
Splitting Sentences


 51%|█████     | 31/61 [09:43<04:30,  9.02s/it]

Extended the list

Extracting Text from PDF papersToProcess/Scheffran et al_conflict nexus,assessment,pathways_2012.pdf
Removing et al
Splitting Sentences


 52%|█████▏    | 32/61 [09:47<03:43,  7.72s/it]

Extended the list

Extracting Text from PDF papersToProcess/2019_Koubi.pdf
Removing et al
Splitting Sentences


 54%|█████▍    | 33/61 [09:53<03:21,  7.18s/it]

Extended the list

Extracting Text from PDF papersToProcess/Schleussner et al_armed,disasters_2016.pdf
Removing et al
Splitting Sentences


 56%|█████▌    | 34/61 [09:57<02:46,  6.15s/it]

Extended the list

Extracting Text from PDF papersToProcess/Homer-Dixon_scarcities,conflict_1994.pdf
Removing et al
Splitting Sentences


 57%|█████▋    | 35/61 [10:09<03:27,  7.98s/it]

Extended the list

Extracting Text from PDF papersToProcess/Burke et al_climate,conflict_2015.pdf
Removing et al
Splitting Sentences


 59%|█████▉    | 36/61 [10:20<03:39,  8.78s/it]

Extended the list

Extracting Text from PDF papersToProcess/ash et al_syria,drought,migration_2019.pdf
Removing et al
Splitting Sentences


 61%|██████    | 37/61 [10:27<03:23,  8.47s/it]

Extended the list

Extracting Text from PDF papersToProcess/Adano et al_conflict,institutions,drylands_2012.pdf
Removing et al
Splitting Sentences


 62%|██████▏   | 38/61 [10:37<03:20,  8.70s/it]

Extended the list

Extracting Text from PDF papersToProcess/Song et al_GeoAlign,unaligned partitions_2018.pdf
Removing et al
Splitting Sentences


 64%|██████▍   | 39/61 [10:44<03:00,  8.19s/it]

Extended the list

Extracting Text from PDF papersToProcess/Zhang et al_war,population decline_2007.pdf
Removing et al
Splitting Sentences


 66%|██████▌   | 40/61 [10:48<02:26,  6.97s/it]

Extended the list

Extracting Text from PDF papersToProcess/Raleigh&Kniveton_variability,conflict_2012.pdf
Removing et al
Splitting Sentences


 67%|██████▋   | 41/61 [10:52<02:04,  6.21s/it]

Extended the list

Extracting Text from PDF papersToProcess/SALEHYAN_conflict,consensus_2008.pdf
Removing et al
Splitting Sentences


 69%|██████▉   | 42/61 [10:56<01:44,  5.48s/it]

Extended the list

Extracting Text from PDF papersToProcess/Gleditsch_conflict,weather_2012.pdf
Removing et al
Splitting Sentences


 70%|███████   | 43/61 [10:58<01:21,  4.54s/it]

Extended the list

Extracting Text from PDF papersToProcess/Pirages_demographic changes,security_1997.pdf
Removing et al
Splitting Sentences


 72%|███████▏  | 44/61 [11:03<01:19,  4.67s/it]

Extended the list

Extracting Text from PDF papersToProcess/O_loughlin_temperature,variability,violence_2014.pdf
Removing et al
Splitting Sentences


 74%|███████▍  | 45/61 [11:07<01:09,  4.37s/it]

Extended the list

Extracting Text from PDF papersToProcess/Jones et al_scarcity,unrest_2017.pdf
Removing et al
Splitting Sentences


 75%|███████▌  | 46/61 [11:14<01:17,  5.18s/it]

Extended the list

Extracting Text from PDF papersToProcess/Raleigh_political marginalization,conflict_2010.pdf
Removing et al
Splitting Sentences


 77%|███████▋  | 47/61 [11:20<01:13,  5.29s/it]

Extended the list

Extracting Text from PDF papersToProcess/Urdal_pressure,degradation_2005.pdf
Removing et al
Splitting Sentences


 79%|███████▊  | 48/61 [11:25<01:08,  5.26s/it]

Extended the list

Extracting Text from PDF papersToProcess/Sellers et al_health,stability,interlinkages_2019.pdf
Removing et al
Splitting Sentences


 80%|████████  | 49/61 [11:28<00:56,  4.74s/it]

Extended the list

Extracting Text from PDF papersToProcess/Hsiang_social stability_2014.pdf
Removing et al
Splitting Sentences


 82%|████████▏ | 50/61 [11:32<00:50,  4.58s/it]

Extended the list

Extracting Text from PDF papersToProcess/Nel et al_natural disasters,civil conflict_2008.pdf
Removing et al
Splitting Sentences


 84%|████████▎ | 51/61 [11:40<00:54,  5.47s/it]

Extended the list

Extracting Text from PDF papersToProcess/Schiavon&Zecchin_adaptation,vulnerability_2007.pdf
Removing et al
Splitting Sentences


 85%|████████▌ | 52/61 [11:41<00:37,  4.17s/it]

Extended the list

Extracting Text from PDF papersToProcess/Sun_matrix,electron,video_2022.pdf
Removing et al
Splitting Sentences


 87%|████████▋ | 53/61 [11:47<00:37,  4.74s/it]

Extended the list

Extracting Text from PDF papersToProcess/Mach et al_risk,armed,conflict_2019.pdf
Removing et al
Splitting Sentences


 89%|████████▊ | 54/61 [11:55<00:38,  5.53s/it]

Extended the list

Extracting Text from PDF papersToProcess/2019_MachEtAl.pdf
Removing et al
Splitting Sentences


 90%|█████████ | 55/61 [12:02<00:36,  6.08s/it]

Extended the list

Extracting Text from PDF papersToProcess/Homer-Dixon_threshold,conflict_1991.pdf
Removing et al
Splitting Sentences


 92%|█████████▏| 56/61 [12:13<00:38,  7.68s/it]

Extended the list

Extracting Text from PDF papersToProcess/Uexkull_conflict,coping,capacity_2016.pdf
Removing et al
Splitting Sentences


 93%|█████████▎| 57/61 [12:36<00:48, 12.01s/it]

Extended the list

Extracting Text from PDF papersToProcess/Koubi_climate,conflict_2019.pdf
Removing et al
Splitting Sentences


 95%|█████████▌| 58/61 [12:41<00:30, 10.13s/it]

Extended the list

Extracting Text from PDF papersToProcess/Cunningham,Dahl&Fruge_strategies,diversification_2017.pdf
Removing et al
Splitting Sentences


 97%|█████████▋| 59/61 [12:47<00:17,  8.75s/it]

Extended the list

Extracting Text from PDF papersToProcess/O’Loughlin_variability,conflict,Africa_2012.pdf
Removing et al
Splitting Sentences


 98%|█████████▊| 60/61 [12:50<00:07,  7.22s/it]

Extended the list

Extracting Text from PDF papersToProcess/Klomp&Bulte_weather shocks,conflict_2013.pdf
Removing et al
Splitting Sentences


100%|██████████| 61/61 [13:03<00:00, 12.84s/it]

Extended the list






In [23]:
len(all_sentences)

35957

In [25]:
output_file_path = 'all_sentences.pkl'
with open(output_file_path, 'wb') as file:
    pickle.dump(all_sentences, file)

In [21]:
all_sentences

['_____________________________ \n',
 'Journal of International Affairs, Fall 2002, vol.',
 '56, no. 1.',
 '©',
 'The Trustees of \nColumbia University in the City of New York. \n \n',
 'Population \nand \nSecurity: \nHow \nDemographic \nChange \nCan \nLead \nto \nViolent \nConflict \n \n \nJACK \nA. \nGOLDSTONE \n \n_________________________ \n \n',
 '“While the marked decrease in population growth in many countries and \n regions is good news for those concerned about global population, it \noffers no clear relief for concerns about the security implications  \nof population change.”  \n_________________________ \n \n \n',
 's \nwe \ncross \ninto \nthe \nnew \ncentury, \nthe \nworld \nseems \nfinally \nto \nhave turned the corner on population growth.',
 'A \ncombination of increased education for women, national and \ninternational \nsupport \nfor \npolicies \nof \npopulation \nplanning \nand \nthe \nspread \nof \neconomic \ndevelopment \nand \naccompanying \nmovement \nalong the de

In [6]:
import pickle
import re
import spacy
from tqdm import tqdm

# Load the sentences from the provided pickle file
file_path = 'all_sentences.pkl'
with open(file_path, 'rb') as file:
    all_sentences = pickle.load(file)

# Function to clean each sentence with additional checks
def clean_sentence(sentence):
    # Remove newline characters and multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)
    # Remove special characters except for common punctuation
    sentence = re.sub(r'[^A-Za-z0-9 ,.\'?!]', '', sentence)
    # Strip leading and trailing spaces
    sentence = sentence.strip()
    return sentence

# Function to determine if a sentence is valid with additional checks
def is_valid_sentence(sentence):
    # Check if the sentence is long enough and contains alphabetic characters
    return len(sentence) > 10 and any(c.isalpha() for c in sentence)

# Additional function to detect non-content text like headings and list markers
def is_non_content(sentence):
    # Simple check to see if a sentence starts with a number or a common section marker
    return bool(re.match(r'^\d+|^[A-Za-z]\)|^\s*$', sentence))

# Load spaCy's English language model
nlp = spacy.load("en_core_web_sm")

# Function to evaluate the grammatical correctness of a sentence using spaCy
def evaluate_sentence_spacy(sentence):
    doc = nlp(sentence)
    errors = []
    # Example checks
    if len(doc) < 5:
        errors.append("Sentence is too short.")
    if not any(token.pos_ == "VERB" for token in doc):
        errors.append("Sentence lacks a verb.")
    return errors if errors else "No issues detected."

# Clean, filter, and evaluate the sentences with additional checks
cleaned_sentences = [clean_sentence(sentence) for sentence in all_sentences]
filtered_sentences = [
    sentence for sentence in cleaned_sentences 
    if is_valid_sentence(sentence) and not is_non_content(sentence)
]

# Evaluate sentences using spaCy with progress bar
evaluated_sentences = []
for sentence in tqdm(filtered_sentences, desc="Evaluating sentences"):
    evaluation = evaluate_sentence_spacy(sentence)
    evaluated_sentences.append((sentence, evaluation))

# Save the cleaned and evaluated sentences to a new pickle file
output_file_path = 'cleaned_and_evaluated_sentences.pkl'
with open(output_file_path, 'wb') as file:
    pickle.dump(evaluated_sentences, file)

print(f"Cleaned and evaluated sentences have been saved to {output_file_path}")


Evaluating sentences: 100%|██████████| 30454/30454 [04:10<00:00, 121.81it/s]


Cleaned and evaluated sentences have been saved to cleaned_and_evaluated_sentences.pkl


In [10]:
final_cleaned_sentences = [
    sentence for sentence, evaluation in evaluated_sentences if evaluation == "No issues detected."
]

In [15]:
output_file_path = 'final_cleaned_sentences.pkl'
with open(output_file_path, 'wb') as file:
    pickle.dump(final_cleaned_sentences, file)

In [12]:
len(final_cleaned_sentences)

19838

In [13]:
final_cleaned_sentences

['Population and Security How Demographic Change Can Lead to Violent Conflict JACK A. GOLDSTONE',
 'While the marked decrease in population growth in many countries and regions is good news for those concerned about global population, it offers no clear relief for concerns about the security implications of population change.',
 's we cross into the new century, the world seems finally to have turned the corner on population growth.',
 'A combination of increased education for women, national and international support for policies of population planning and the spread of economic development and accompanying movement along the demographic transition frontier have led to falling population growth rates around the world.',
 'Whether among the behemothsChina and Indiaor among the smaller but rapidly growing nationssuch as Saudi Arabia, Kenya and Malawi population growth rates have dropped dramatically in the last decade.i Yet while population growth rates have dropped around the world, th

In [8]:
len(evaluated_sentences)

30454

In [14]:
len(filtered_sentences)

30454

In [23]:
import pickle
import re

# Load the final cleaned sentences from the pickle file
file_path = 'final_cleaned_sentences.pkl'
with open(file_path, 'rb') as file:
    final_cleaned_sentences = pickle.load(file)

# Define regex patterns for causal keywords
causal_patterns = [
    r'\bbecause\b',
    r'\bdue to\b',
    r'\btherefore\b',
    r'\bas a result\b',
    r'\bthus\b',
    r'\bconsequently\b',
    r'\bleads to\b',
    r'\bresults in\b',
    r'\bcauses\b',
    r'\binduces\b',
    r'\bbrings about\b',
    r'\bis responsible for\b',
    r'\bso\b',  # when used to indicate cause and effect
    r'\bhence\b'
    # r'\bas\b',  # when used in the context of "as a result"
]

# Compile the regex patterns
causal_regex = re.compile('|'.join(causal_patterns), re.IGNORECASE)

# Function to flag sentences containing causal relationships
def contains_causal_relationship(sentence):
    return bool(causal_regex.search(sentence))

# Identify and flag sentences with causal relationships
causal_sentences = [
    (sentence, "contains cause/effect" if contains_causal_relationship(sentence) else "no cause/effect")
    for sentence in final_cleaned_sentences
]

# Print some examples of flagged sentences
# for sentence, flag in causal_sentences[:10]:  # Display first 10 sentences as an example
#     print(f"Sentence: {sentence}")
#     print(f"Flag: {flag}\n")

# Save the flagged sentences to a new pickle file
output_file_path = 'causal_sentences.pkl'
with open(output_file_path, 'wb') as file:
    pickle.dump(causal_sentences, file)

print(f"Causal sentences have been flagged and saved to {output_file_path}")


Causal sentences have been flagged and saved to causal_sentences.pkl


In [24]:
final_causal_sentences = [
    sentence for sentence, cause in causal_sentences if cause == "contains cause/effect"
]

In [25]:
output_file_path = 'final_causal_sentences.pkl'
with open(output_file_path, 'wb') as file:
    pickle.dump(final_causal_sentences, file)

In [26]:
output_txt_path = 'final_causal_sentences.txt'

# Save the sentences to a text file
with open(output_txt_path, 'w') as file:
    for sentence in final_causal_sentences:
        file.write(sentence + '\n')

In [27]:
len(final_causal_sentences)

1955