### Importing necessary Libarary and Loading the data

In [20]:
#Importing Necessary Libraries

import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import pandas as pd
import string
import csv
import sys

In [2]:
# Load the CSV file
csv_filename = "./sentences.csv"
data = pd.read_csv(csv_filename)

In [3]:
data

Unnamed: 0,Sentences
0,Home » Lifestyle » Art » Prakriti-Where the Go...
1,With the sound of nature reverberating in the ...
2,"In addition, the exhibition includes unique pa..."
3,Prakriti-Where the Gods Reside is full of such...
4,The exhibition features traditional and modern...
5,The animal series of the exhibition showcases ...
6,The exhibition also has the Shiva Parvati seri...
7,Prakriti-Where the Gods Reside also has abstra...
8,"In addition, one can witness the finest statue..."
9,Prakriti-Where the Gods Reside has set a prece...


## Data Preprocessing

### Loading sentences csv and preprocessing all the sentences

In [5]:
# Preprocess the content using NLTK
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

preprocessed_content = []

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nandan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nandan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Testing Function for preprocessing(Functional Approach)

In [6]:
def preprocess( sentences):
        ps = PorterStemmer()
        stop_words = set(stopwords.words('english'))
        preprocessed = []
        
        for sent in sentences:
            words = word_tokenize(sent)
            filtered_words = [ps.stem(word) for word in words if word.lower() not in stop_words and word not in string.punctuation]
            processed = " ".join(filtered_words)
            preprocessed.append(processed)
        
        return preprocessed

In [7]:
# Preprocessed question
question = ["What did Nepal export?"]

In [8]:
print(preprocess(question))

['nepal export']


### Class for preprocessing all the sentences(OOPS Approach)

In [9]:
#importing Necessary Library

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string

# Download Necessary daras
nltk.download('punkt')
nltk.download('stopwords')

class TextPreprocessor:
    
    def __init__(self):
        self.ps = PorterStemmer()
        self.stop_words = set(stopwords.words('english'))
    
    def preprocess(self, sentences):
        preprocessed = []
        
        for sent in sentences:
            words = word_tokenize(sent)
            filtered_words = [self.ps.stem(word) for word in words if word.lower() not in self.stop_words and word not in string.punctuation]
            processed = " ".join(filtered_words)
            preprocessed.append(processed)
        
        return preprocessed


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nandan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nandan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Testing the class
sentences = ["I am running in the park."]
preprocessor = TextPreprocessor()
preprocessed_sentences = preprocessor.preprocess(sentences)
print(preprocessed_sentences)

['run park']


### preprocessing all the scraped sentences using the Class

In [11]:
content_column = data['Sentences']

In [13]:
# Convert the column values to a list
sentences = content_column.tolist()

# instantiing a object
preprocessor = TextPreprocessor()

#passing the sentences to the class
preprocessed_sentences = preprocessor.preprocess(sentences)

#output
print(preprocessed_sentences)

['home » lifestyl » art » prakriti-wher god resid explor artist evolut enter bodhisattva galleri pulchowk peac sound run water bird welcom', 'sound natur reverber background come across first section ‘ prakriti-wher god resid ’ featur statu filigre work preciou semi-preci stone bird artist pradip shakya', 'addit exhibit includ uniqu paint artist puspa lal dangol paint bird use feather canva', 'prakriti-wher god resid full interest artwork', 'exhibit featur tradit modern artwork experienc artist creat uniqu blend give art lover wholesom experi', 'anim seri exhibit showcas work artist lok chitrakar udaya charan shrestha meena dangol puspa lal dangol pradip shakya', 'exhibit also shiva parvati seri display work udaya charan samundra man singh shrestha', 'prakriti-wher god resid also abstract paint theme pancha buddha five element raj prakash tuladhar ram prakash shrestha rajan sangache ritesh shahi', 'addit one wit finest statu differ god goddess made metal artist aneel shakya rajan shaky

### Saving Preprocessed data

In [18]:
# Create a DataFrame with preprocessed sentences
preprocessed_data = pd.DataFrame({'sentences': preprocessed_sentences})

# Save the preprocessed data to a CSV file
preprocessed_data.to_csv('preprocessed_content.csv', index=False)  

In [19]:
preprocess_df = pd.read_csv('./preprocessed_content.csv')
preprocess_df

Unnamed: 0,sentences
0,home » lifestyl » art » prakriti-wher god resi...
1,sound natur reverber background come across fi...
2,addit exhibit includ uniqu paint artist puspa ...
3,prakriti-wher god resid full interest artwork
4,exhibit featur tradit modern artwork experienc...
5,anim seri exhibit showcas work artist lok chit...
6,exhibit also shiva parvati seri display work u...
7,prakriti-wher god resid also abstract paint th...
8,addit one wit finest statu differ god goddess ...
9,prakriti-wher god resid set preced curat tradi...
