In [7]:
import nltk
from nltk import pos_tag, word_tokenize

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# tagging PoS in inputted text
text = word_tokenize("Be careful with that butter knife.")
nltk.pos_tag(text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


[('Be', 'VB'),
 ('careful', 'JJ'),
 ('with', 'IN'),
 ('that', 'DT'),
 ('butter', 'NN'),
 ('knife', 'NN'),
 ('.', '.')]

In [38]:
import nltk

# tokenization is the process of splitting strings into their individual "tokens"
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')

# to import a .txt file we use the "open" function, giving it the path to our text file and an instrution about what we want to do with the file
# here, we would like to "read" our file into a variable so 
transcript = open('gismapping.txt', encoding='utf-8').read().lower()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [39]:
# we could then tokenize by sentence, which splits the text into sentences
transcript_sentences = sent_tokenize(transcript)
transcript_sentences

['367\narcheologia e calcolatori\n30, 2019, 367-385\ngis mapping of the archaeological sites\nin the molise region (italy)\n1. introduction\nthe molise region is in the northern sector of southern italy and it is\nspread from the apennine chain to the adriatic coast.',
 'this area experienced\nhuman presence since prehistoric times, as it is testified by its rich archaeological\nand architectural heritage (de benedittis 1979; coarelli, la regina 1984)1\n.',
 'the oldest human presence in molise is represented by the upper palaeolithic settlement of isernia la pineta, dated back at about 600.000 years\nbp (coltorti et al.',
 '1982, 2005 and references; peretto 2013).',
 'the site\nrepresents a unique example in the history of the human frequentation in\neurope for the presence of a considerable number of paleontological finds\nassociated to lithic artefacts.',
 'other more recent and relevant prehistoric sites\nhave been also found in the municipalities of pescopennataro (rio verde site

In [40]:
# or more commonly, we can tokenize into words, which splits the sentences into its parts of speech
transcript_words = word_tokenize(transcript)
transcript_words

['367',
 'archeologia',
 'e',
 'calcolatori',
 '30',
 ',',
 '2019',
 ',',
 '367-385',
 'gis',
 'mapping',
 'of',
 'the',
 'archaeological',
 'sites',
 'in',
 'the',
 'molise',
 'region',
 '(',
 'italy',
 ')',
 '1.',
 'introduction',
 'the',
 'molise',
 'region',
 'is',
 'in',
 'the',
 'northern',
 'sector',
 'of',
 'southern',
 'italy',
 'and',
 'it',
 'is',
 'spread',
 'from',
 'the',
 'apennine',
 'chain',
 'to',
 'the',
 'adriatic',
 'coast',
 '.',
 'this',
 'area',
 'experienced',
 'human',
 'presence',
 'since',
 'prehistoric',
 'times',
 ',',
 'as',
 'it',
 'is',
 'testified',
 'by',
 'its',
 'rich',
 'archaeological',
 'and',
 'architectural',
 'heritage',
 '(',
 'de',
 'benedittis',
 '1979',
 ';',
 'coarelli',
 ',',
 'la',
 'regina',
 '1984',
 ')',
 '1',
 '.',
 'the',
 'oldest',
 'human',
 'presence',
 'in',
 'molise',
 'is',
 'represented',
 'by',
 'the',
 'upper',
 'palaeolithic',
 'settlement',
 'of',
 'isernia',
 'la',
 'pineta',
 ',',
 'dated',
 'back',
 'at',
 'about',
 '

In [41]:
# now remember that huge block of stopwords manually typed out in the sample block of code from the first lesson? That comes built in to NLTK as you may have guessed from the earlier import statment
# we can assign the NLTK stopwords to a variable like so:
stop_words = stopwords.words('english')

# and then remove the stopwords from out text using a loop to check if each word in the transcript and only keep the words that are NOT in out stopword list
filtered_transcript_words = []
for word in transcript_words:
    if word not in stop_words:
        filtered_transcript_words.append(word)

In [42]:
# finally, we can simply find word frequeny with NLTK's frequnecy distribution function
from nltk import FreqDist

transcript_fdist = FreqDist(filtered_transcript_words)
transcript_fdist.most_common(10)

[(',', 643),
 ('.', 305),
 (')', 187),
 ('(', 181),
 ('archaeological', 128),
 ('sites', 126),
 ('molise', 78),
 ('region', 63),
 (';', 56),
 ('site', 51)]

In [43]:
# now, as you can see, our list is topped by punctuation and contractions!

# to remove punctuation, we can use Python's string library to create a list of punctuation
from string import punctuation
punctuation = list(punctuation)

# and luckily, you can modify your stopwords and punctuation lists like any other list!
# let's add "n't", "'s", and "would"
# to add multiple elements to a list at once, we use extend() rather that append()
stop_words.extend(["n't", "'s", 'would'])

In [44]:
# let's re-run with our new stopwords and punctuation list to see the improved results
filtered_transcript_words = []
for word in transcript_words:
    if word not in stop_words and word not in punctuation:
        filtered_transcript_words.append(word)

transcript_fdist = FreqDist(filtered_transcript_words)
transcript_fdist.most_common(10)

[('archaeological', 128),
 ('sites', 126),
 ('molise', 78),
 ('region', 63),
 ('site', 51),
 ('di', 49),
 ('fig', 42),
 ('age', 35),
 ('–', 32),
 ('del', 31)]

In [45]:
# now that we have a word frequency list, we can even use NLTK for concordance analysis (seeing word in context)
# we can choose a word from the word frequency list, and search the original tokenized text for it after making it a Text object
from nltk.text import Text

text_list = Text(transcript_words)
text_list.concordance("work", lines=52)

Displaying 3 of 3 matches:
esigns . 2. material and methods the work has been based on separate phases of
te images , topographic maps , field work or aerial photos ; – georeference qu
h , bibliographic research and field work ; – description ( string , length 25


In [47]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from string import punctuation
punctuation = list(punctuation)

nltk.download('punkt')
nltk.download('stopwords')

transcript = open('gismapping.txt', encoding="utf-8").read().lower()

transcript_words = word_tokenize(transcript)

stop_words = stopwords.words('english')
stop_words.extend(["n't", "'s", 'would'])

filtered_transcript_words = []
for word in transcript_words:
    if word not in stop_words and word not in punctuation:
        filtered_transcript_words.append(word)

transcript_fdist = FreqDist(filtered_transcript_words)
transcript_fdist.most_common(10)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[('archaeological', 128),
 ('sites', 126),
 ('molise', 78),
 ('region', 63),
 ('site', 51),
 ('di', 49),
 ('fig', 42),
 ('age', 35),
 ('–', 32),
 ('del', 31)]

In [50]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from string import punctuation
punctuation = list(punctuation)

nltk.download('punkt')
nltk.download('stopwords')

transcript = open('gismapping.txt', encoding="utf-8").read().lower()

transcript_words = word_tokenize(transcript)

stop_words = stopwords.words('english')
stop_words.extend(["n't", "'s", 'would'])

filtered_transcript_words = []
for word in transcript_words:
    if word not in stop_words and word not in punctuation:
        filtered_transcript_words.append(word)

transcript_fdist = FreqDist(filtered_transcript_words)
transcript_fdist.most_common(10)

# now that we have a word frequency list, we can even use NLTK for concordance analysis (seeing word in context)
# we can choose a word from the word frequency list, and search the original tokenized text for it after making it a Text object
from nltk.text import Text

text_list = Text(transcript_words)

# Get the 10 most common words
common_words = [word for word, _ in transcript_fdist.most_common(10)]

# Loop through the common words and get concordance for each
for word in common_words:
    print(f"Concordance for '{word}':")
    text_list.concordance(word, lines=52)
    print("\n")

Concordance for 'archaeological':
Displaying 52 of 128 matches:
19 , 367-385 gis mapping of the archaeological sites in the molise region ( it
 as it is testified by its rich archaeological and architectural heritage ( de
toric times , the most relevant archaeological findings can be chronologically
 and volumes published for each archaeological site , without considering the 
oth the superintendency for the archaeological heritage of the molise region a
emic teams carried out numerous archaeological excavations and explorations in
ise region . as a result , some archaeological contexts can now rely on a rich
tematic study . moreover , many archaeological materials derived from emergenc
dardize and store the myriad of archaeological data from the molise region , f
orkflow : – census of all known archaeological findings of the molise region t
ime , of a detailed and updated archaeological database . – integration of thi
c maps . 369 gis mapping of the archaeological sites in the molise 

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [51]:
# Specify the file path where you want to save the text file
file_path = "output.txt"

# Open the file in write mode ('w') and write the content to it
with open(file_path, "w", encoding="utf-8") as text_file:
    text_file.write(abstracts_as_text)

print("Text saved to", file_path)


Text saved to output.txt


In [52]:
#Now let's restart the process with the next text file: output.txt

import nltk

# tokenization is the process of splitting strings into their individual "tokens"
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')

# to import a .txt file we use the "open" function, giving it the path to our text file and an instrution about what we want to do with the file
# here, we would like to "read" our file into a variable so 
transcript = open('output.txt', encoding='utf-8').read().lower()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [53]:
# we could then tokenize by sentence, which splits the text into sentences
transcript_sentences = sent_tokenize(transcript)
transcript_sentences

["abella \nabellinum\nacerrae\naceruntia \naecae\naeclanum\naefula\naequana\naequum tuticum\naesernia\nafilae\nalba fucens\nalbano di lucania\naletrium\nallifae\nalsium\naltamura\nameria\namiternum\namyclae\nanagnia\nangitiae lucus \nangulum\nantemnae\nantinum \nantium\nanxanum \nanxia\naquilonia\naquinum\narcipete \nardea\naricia\narpi \narpinum\nartena\natella\naternum \natina\natina\naufidena\naufinum\nausculum\naveia\naxia\nbagnoregio\nbantia\nbaragiano\nbarduli\nbarium\nbellegra\nbeneventum \nblera\nbovianum \nbovillae\nbuon riposo\nbutuntum \ncaelia1\ncaere \ncaiatia\ncalatia\ncales\ncallifae\ncampo del pozzo\ncamporosso\ncannae\ncanusium\ncapena\ncapua\ncarseoli \ncarsulae\ncasalini sottana\ncasilinum\ncasinum\ncastel canonico\ncastellina\ncastello monteforte\ncastellone\ncastrimoenium\ncastrum inui\ncastrum novum1\ncaudium\ncencelle\ncentum cellae\ncereatae marianae\ncesoli\ncima ramne\ncirceii \ncivita danzica\ncivita del fosso d'arlena\ncivita delle bianche\ncivita di tricari

In [54]:
# now remember that huge block of stopwords manually typed out in the sample block of code from the first lesson? That comes built in to NLTK as you may have guessed from the earlier import statment
# we can assign the NLTK stopwords to a variable like so:
stop_words = stopwords.words('english')

# and then remove the stopwords from out text using a loop to check if each word in the transcript and only keep the words that are NOT in out stopword list
filtered_transcript_words = []
for word in transcript_words:
    if word not in stop_words:
        filtered_transcript_words.append(word)

In [56]:
# finally, we can simply find word frequeny with NLTK's frequnecy distribution function
from nltk import FreqDist

transcript_fdist = FreqDist(filtered_transcript_words)
transcript_fdist.most_common(20)

[(',', 643),
 ('.', 305),
 (')', 187),
 ('(', 181),
 ('archaeological', 128),
 ('sites', 126),
 ('molise', 78),
 ('region', 63),
 (';', 56),
 ('site', 51),
 ('di', 49),
 ('fig', 42),
 ('age', 35),
 ('–', 32),
 ('del', 31),
 ('gis', 28),
 ('m.', 28),
 ('e', 26),
 ('de', 22),
 ('italy', 21)]

In [57]:
# now, as you can see, our list is topped by punctuation and contractions!

# to remove punctuation, we can use Python's string library to create a list of punctuation
from string import punctuation
punctuation = list(punctuation)

# and luckily, you can modify your stopwords and punctuation lists like any other list!
# let's add "n't", "'s", and "would"
# to add multiple elements to a list at once, we use extend() rather that append()
stop_words.extend(["n't", "'s", 'would'])

In [58]:
# let's re-run with our new stopwords and punctuation list to see the improved results
filtered_transcript_words = []
for word in transcript_words:
    if word not in stop_words and word not in punctuation:
        filtered_transcript_words.append(word)

transcript_fdist = FreqDist(filtered_transcript_words)
transcript_fdist.most_common(10)

[('archaeological', 128),
 ('sites', 126),
 ('molise', 78),
 ('region', 63),
 ('site', 51),
 ('di', 49),
 ('fig', 42),
 ('age', 35),
 ('–', 32),
 ('del', 31)]

In [59]:
text_list = Text(transcript_words)

# Get the 10 most common words
common_words = [word for word, _ in transcript_fdist.most_common(10)]

# Loop through the common words and get concordance for each
for word in common_words:
    print(f"Concordance for '{word}':")
    text_list.concordance(word, lines=52)
    print("\n")

Concordance for 'archaeological':
Displaying 52 of 128 matches:
19 , 367-385 gis mapping of the archaeological sites in the molise region ( it
 as it is testified by its rich archaeological and architectural heritage ( de
toric times , the most relevant archaeological findings can be chronologically
 and volumes published for each archaeological site , without considering the 
oth the superintendency for the archaeological heritage of the molise region a
emic teams carried out numerous archaeological excavations and explorations in
ise region . as a result , some archaeological contexts can now rely on a rich
tematic study . moreover , many archaeological materials derived from emergenc
dardize and store the myriad of archaeological data from the molise region , f
orkflow : – census of all known archaeological findings of the molise region t
ime , of a detailed and updated archaeological database . – integration of thi
c maps . 369 gis mapping of the archaeological sites in the molise 

## end of nltk##

##Time for pandas


In [15]:
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')

pd.options.display.max_rows = 100

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
litrev_df = pd.read_csv('chatgptgabii.csv', delimiter=",")



litrev_df

Unnamed: 0,Primary_Name,X,Y,Z,Geog_certainty,Occ_Date_U,Occ_Date_L,Occ_Date_Cert
0,Abella,14.608472,40.961516,236,certain,-800,500,High
1,Abellinum,14.830911,40.922125,313,certain,-300,500,Medium
2,Acerrae,14.371233,40.946092,30,certain,-300,500,Medium
3,Aceruntia,15.940567,40.796975,833,certain,-600,500,Low
4,Aecae,15.308894,41.361189,439,certain,-600,500,Medium
...,...,...,...,...,...,...,...,...
352,Visentium,11.874036,42.573358,397,certain,-400,500,Low
353,Volcei,15.379029,40.633914,663,certain,-325,500,High
354,Volsinii,11.986114,42.644852,522,certain,-250,500,High
355,Volturnum,13.941734,41.033657,2,certain,-194,500,Medium


In [32]:
# Check the first few rows of the DataFrame
print(litrev_df.head())

litrev_df = litrev_df[litrev_df["Primary_Name"].notna()]

abstracts_as_text = ""

for i in litrev_df["Primary_Name"]:
    abstracts_as_text += i + "\n"    
    
abstractTokens = word_tokenize(abstracts_as_text.lower())

cleaned_abstractTokens = []

for word in list(abstractTokens):
    if word not in stopwords.words("english") and word.isalpha():
        cleaned_abstractTokens.append(word)

abstracts_df = pd.DataFrame(cleaned_abstractTokens, columns =['uniqueWords'])
        
keywords = abstracts_df["uniqueWords"].value_counts()

if 100000000000 < len(keywords):
    specific_keyword = keywords.iloc[100000000000]
    print("Specific keyword:", specific_keyword)
else:
    print("Index is out of bounds.")

keywords[10]

print(keywords)



  Primary_Name          X          Y    Z Geog_certainty Occ_Date_U  \
0      Abella   14.608472  40.961516  236        certain       -800   
1    Abellinum  14.830911  40.922125  313        certain       -300   
2      Acerrae  14.371233  40.946092   30        certain       -300   
3   Aceruntia   15.940567  40.796975  833        certain       -600   
4        Aecae  15.308894  41.361189  439        certain       -600   

  Occ_Date_L Occ_Date_Cert  
0        500          High  
1        500        Medium  
2        500        Medium  
3        500           Low  
4        500        Medium  
Index is out of bounds.
uniqueWords
monte        39
san          14
di           13
colle        13
santa         8
             ..
croccia       1
presepe       1
cozzo         1
corfinium     1
vulci         1
Name: count, Length: 386, dtype: int64


  keywords[10]
