# 1.&nbsp;Libraries & Paths



In [1]:
# Operating System libraries
import os
import sys
import subprocess
import zipfile

# Scraping and text libraries
import requests
import re

# Date and time libraries
import time
import datetime

# Scientific Python libraries
import numpy as np
import pandas as pd

# Visualization libraries
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns ; sns.set_theme()

# nltk libraries
try:
    import nltk
except:
    _ = subprocess.run(["pip", "install", "nltk"])
    import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt') # for Used to analyze and divide sentences
nltk.download('stopwords') # Download stop words



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
file_link_Google_Drive = "https://drive.google.com/file/d/17Q4Gn0ZrFtrmf3GAj1b-nljFzHlDLaII/view?usp=sharing" # Faeze Google Drive
path1 = "/content/drive/MyDrive/project-IR/cisi.txt" # Faeze Google Drive


In [None]:
os.listdir()

['.config', '.ipynb_checkpoints', 'drive', 'sample_data']

# 1.&nbsp; To download file from Google Drive

In [None]:
file_id = file_link_Google_Drive.split("/file/d/")[1].split("/view")[0]

# Constructing the download link
download_link = f"https://drive.google.com/uc?export=download&id={file_id}"

# Downloading the content
response = requests.get(download_link)

# Ensure the request was successful
if response.status_code == 200:
    # Decoding the content and printing
    file_content = response.text
    print("Downloaded successfully!")
else:
    print("Failed to download the file.")


Downloaded successfully!


## 2.&nbsp;Tokenize

In [None]:
list_line_by_line = file_content.splitlines() # A list of the text, line by line.
sentences = nltk.sent_tokenize(" ".join(list_line_by_line)) # It is used to parse the content of a text file into sentences
words     = nltk.word_tokenize(" ".join(list_line_by_line)) # It is used to parse the content of a text file into words


In [None]:
len(list_line_by_line)

108747

In [None]:
len(sentences)

6680

In [None]:
len(words)

462588

In [None]:
words[:10]

['.I',
 '1',
 '.T',
 '18',
 'Editions',
 'of',
 'the',
 'Dewey',
 'Decimal',
 'Classifications']

# 3.&nbsp;Stopwords removal

In [None]:
# Set up the English stop words list
stop_words = set(stopwords.words('english'))

# Filter out the stop words
filtered_words = [word for word in words if word.lower() not in stop_words]

# Join the words back into a string
filtered_text = " ".join(filtered_words)


In [None]:
type(filtered_text)

str

In [None]:
filtered_text[:1000]

".I 1 .T 18 Editions Dewey Decimal Classifications .A Comaromi , J.P. .W present study history DEWEY Decimal Classification . first edition DDC published 1876 , eighteenth edition 1971 , future editions continue appear needed . spite DDC 's long healthy life , however , full story never told . biographies Dewey briefly describe system , first attempt provide detailed history work spurred growth librarianship country abroad . .X 1 5 1 92 1 1 262 1 1 556 1 1 1004 1 1 1024 1 1 1024 1 1 .I 2 .T Use Made Technical Libraries .A Slater , M. .W report analysis 6300 acts use 104 technical libraries United Kingdom . Library use one aspect wider pattern information use . Information transfer libraries restricted use documents . takes account documents used outside library , still less information transferred orally person person . library acts channel proportion situations information transferred . Taking technical information transfer whole , doubt proportion major one . users technical informat

In [None]:
type(stop_words)

set

In [None]:
len(stop_words)

179

In [None]:
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

# 4.&nbsp;Text to dataframe

`cisi.txt`: provided by Dr. Tanoori

In [None]:
# Define function to parse a single document
def parse_document(lines):
    document = {'I': '', 'T': '', 'A': '', 'W': '', 'X': ''}
    current_key = ''
    for line in lines:
        if line.startswith('.I'):
            document['I'] = line.split(' ')[1].strip()
        elif line.startswith('.T') or line.startswith('.A') or line.startswith('.W') or line.startswith('.X'):
            current_key = line[1]
        else:
            document[current_key] += line.strip() + ' '
    return document

# Read the text file

with open('cisi.txt', 'r') as file:
    lines = file.readlines()

# Group lines by document
documents_lines = []
current_document_lines = []
for line in lines:
    if line.startswith('.I') and current_document_lines:
        documents_lines.append(current_document_lines)
        current_document_lines = []
    current_document_lines.append(line)
if current_document_lines:
    documents_lines.append(current_document_lines)

# Parse each document and append to a list
documents = []
for document_lines in documents_lines:
    documents.append(parse_document(document_lines))

# Convert to DataFrame
df_cisi = (
    pd.DataFrame(documents)
    .rename(columns={
        "I":"id",
        "T":"title",
        "A":"author",
        "W":"abstract",
        "X":"references"
    })
) ; display("df_cisi:", df_cisi.shape, df_cisi.head(1))

# Save to CSV if needed
df_cisi.to_csv('cisi.csv', index=False)


FileNotFoundError: ignored

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 5.&nbsp;Tokenize

In [None]:
def tokenize1(str1):
    list_line_by_line = str1.splitlines() # A list of the text, line by line.
    words = nltk.word_tokenize(" ".join(list_line_by_line)) # It is used to parse the content of a text file into words
    return(words)

# Define the columns you want to apply the function to
columns1 = ["title", "author", "abstract"]

# Apply the function to the specific columns
df_cisi[columns1] = df_cisi[columns1].applymap(tokenize1)

display("df_cisi:", df_cisi.shape, df_cisi.head())


NameError: ignored

# 6.&nbsp;Stopwords removal

`stopwords.txt`: provided by Dr. Tanoori    
Deleted the lines below manually:
```
Onix Text Retrieval Engine
Stop Word List 1
# Freely available stop word list.  This stop word list provides a nice balance between coverage and size.

#

#

#
```



In [None]:
# Define the columns you want to apply the function to
columns1 = ["title", "author", "abstract"]

# Create an empty set to store the stopwords
stopwords = set()
# Open the file
with open('stopwords.txt', 'r') as file:
    # Iterate through each line in the file
    for line in file:
        # Strip leading/trailing whitespace and add the word to the set
        stopwords.add(line.strip())

# Define the stopwords removal function with two parameters
def stopwords_removal(word_list, stopwords_set):
    return [word for word in word_list if word.lower() not in stopwords_set]

# Create a partial function with the stopwords set
stopwords_removal_with_stopwords = partial(stopwords_removal, stopwords_set=stopwords)

# Assuming that the columns you want to apply the function to are stored in a list called columns1
for column in columns1:
    # Apply the partial function to each cell value in the specified column
    df_cisi[column] = df_cisi[column].apply(lambda cell: stopwords_removal_with_stopwords(word_list=cell))

# Display the DataFrame
display("df_cisi:", df_cisi.shape, df_cisi.head(1))


FileNotFoundError: ignored

# Class

# Archive

## Faezeh

In [None]:
nltk.download('punkt') # for Used to analyze and divide sentences
# file_path = '/content/drive/MyDrive/project-IR/cisi.txt'
with open(file_path, "r") as file:
    file_content = file.read().splitlines() #List of file text lines
sentences = nltk.sent_tokenize(" ".join(file_content)) #It is used to parse the content of a text file into sentences
words = nltk.word_tokenize(" ".join(file_content)) #It is used to parse the content of a text file into sentences

print("sentences:", sentences)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


NameError: ignored

## Markdown

0. [Markdown Guide](https://colab.research.google.com/notebooks/markdown_guide.ipynb#scrollTo=Lhfnlq1Surtk)
0. [aaa]()

## Donload a file from Google Drive

Reading a text file directly from Google Drive is a bit more complex than simply opening a local file, as you're dealing with a web-based resource. You will need to download the file first or read it through a web request.    

Here's a step-by-step guide to read a file from a shared Google Drive link:   

1. Make the file public (Anyone with the link): Ensure that the file on Google Drive is shared publicly or with anyone who has the link.   
2. Get the file's ID: Extract the file ID from the link, which is the part between file/d/ and /view.    
3. Download or read the file: Use a library like requests to download the file and read it.    
Here's the code that does this:

In [None]:
import requests

file_link_Google_Drive = "https://drive.google.com/file/d/17Q4Gn0ZrFtrmf3GAj1b-nljFzHlDLaII/view?usp=sharing"
file_id = file_link_Google_Drive.split("/file/d/")[1].split("/view")[0]

# Constructing the download link
download_link = f"https://drive.google.com/uc?export=download&id={file_id}"

# Downloading the content
response = requests.get(download_link)

# Ensure the request was successful
if response.status_code == 200:
    # Decoding the content and printing
    file_content = response.text
    print(file_content)
else:
    print("Failed to download the file.")


.I 1
.T
18 Editions of the Dewey Decimal Classifications
.A
Comaromi, J.P.
.W
   The present study is a history of the DEWEY Decimal
Classification.  The first edition of the DDC was published
in 1876, the eighteenth edition in 1971, and future editions
will continue to appear as needed.  In spite of the DDC's
long and healthy life, however, its full story has never
been told.  There have been biographies of Dewey
that briefly describe his system, but this is the first
attempt to provide a detailed history of the work that
more than any other has spurred the growth of
librarianship in this country and abroad.
.X
1	5	1
92	1	1
262	1	1
556	1	1
1004	1	1
1024	1	1
1024	1	1
.I 2
.T 
Use Made of Technical Libraries
.A 
Slater, M.
.W
This report is an analysis of 6300 acts of use
in 104 technical libraries in the United Kingdom.
Library use is only one aspect of the wider pattern of
information use.  Information transfer in libraries is
restricted to the use of

## list to dataframe

In [None]:
import pandas as pd

# Define your lists
list1 = [1, 2, 3, 4]
list2 = ['A', 'B', 'C', 'D']
list3 = [10, 20, 30, 40]

# Combine the lists into a dictionary
data = {
    'Column1': list1,
    'Column2': list2,
    'Column3': list3
}

# Create a DataFrame
df = pd.DataFrame(data)

# Print the DataFrame
print(df)


   Column1 Column2  Column3
0        1       A       10
1        2       B       20
2        3       C       30
3        4       D       40


## stopwords

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download stop words
nltk.download('stopwords')
nltk.download('punkt')

# Set up the English stop words list
stop_words = set(stopwords.words('english'))

text = "This is a sample text, and it includes some common stop words."

# Tokenize the text
words = word_tokenize(text)

# Filter out the stop words
filtered_words = [word for word in words if word.lower() not in stop_words]

# Join the words back into a string
filtered_text = " ".join(filtered_words)

print("Original text:", text)
print("Filtered text:", filtered_text)


Original text: This is a sample text, and it includes some common stop words.
Filtered text: sample text , includes common stop words .


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
