In [1]:
import csv
import logging
import string
import sqlite3

In [2]:
# Route INFO-and-above records to word_task.log as tab-separated
# "timestamp / level / message" lines.
logging.basicConfig(
    filename='word_task.log',
    encoding="utf8",
    level=logging.INFO,
    format="%(asctime)s \t %(levelname)s \t %(message)s",
)

In [3]:
class words:
    """Helper bound to one vocabulary text file (one word per line).

    Besides reading the file, the class offers small utilities that work on
    plain lists of words: counting, cleaning, zipping several lists together
    and storing rows in an SQLite table.  Methods log their progress; the ones
    wrapped in ``try`` (text_reader, occurance_counter,
    starting_occurance_counter, dataset_zipper, sqlite_database) log the error
    and return ``None`` instead of raising.
    """

    def __init__(self, fileName):
        """Remember the dataset file name; the file is not opened here."""
        self.fileName = fileName
        logging.info(f"Object Created for dataset {fileName}!!!")

    def text_reader(self):
        """Read the dataset file and return its words as a list.

        Every non-empty line becomes one entry, taken literally (no CSV
        quoting rules are applied).  A word ending in ``_`` has the
        underscores removed from both of its ends.

        Returns:
            list[str] | None: words in file order, or ``None`` when reading
            fails (the error is logged).
        """
        temp = []
        logging.info(f"Reading Dataset: {self.fileName}")
        try:
            # Read-only mode is sufficient; "r+" needlessly required write access.
            with open(self.fileName, "r", encoding="utf8") as f:
                for line in f:
                    word = line.rstrip("\n")
                    if not word:
                        # A blank line holds no word; skip it instead of
                        # letting one empty row abort the whole read.
                        continue
                    if word[-1] == "_":
                        word = word.strip("_")
                    temp.append(word)
            logging.info(f"Reading Dataset {self.fileName} is successful....!!")
            return temp
        except Exception as e:
            logging.warning("Problem Occured while reading the data...!!")
            # The message needs a placeholder for the extra argument.
            logging.exception("Error: %s", e)
            return None

    def occurance_counter(self, data_list):
        """Count how often each word occurs.

        Args:
            data_list: iterable of hashable words.

        Returns:
            list[tuple] | None: ``(word, count)`` tuples sorted ascending, or
            ``None`` when counting fails (the error is logged).
        """
        try:
            logging.info(f"Counting Occurance of each word in the dataset {self.fileName}")
            # Single pass with a dict instead of list.count() per element,
            # which was quadratic in the size of the vocabulary.
            tally = {}
            for word in data_list:
                tally[word] = tally.get(word, 0) + 1
            return sorted(tally.items())
        except Exception as e:
            logging.warning("Problem occurred in occurance_counter() method")
            logging.exception(e)
            return None

    def starting_occurance_counter(self, data_list):
        """Count the words starting with each lowercase ASCII letter.

        Args:
            data_list: iterable of strings.

        Returns:
            list[tuple] | None: 26 ``(letter, count)`` tuples in ``a``..``z``
            order (letters with no words get 0), or ``None`` on failure.
        """
        logging.info(f"Counting no. of words starting with same alphabet for dataset {self.fileName}")
        try:
            # One pass over the data instead of one pass per letter.
            counts = dict.fromkeys(string.ascii_lowercase, 0)
            for word in data_list:
                first = word[:1]  # "" for an empty string, so no IndexError
                if first in counts:
                    counts[first] += 1
            return list(counts.items())
        except Exception as e:
            logging.warning("Problem occurred in starting_occurance_counter() method")
            logging.exception(e)
            return None

    def word_extracter(self, data_list):
        """Return the entries with all digits and punctuation removed.

        Entries that become empty are dropped.  The input list is left
        untouched; a new list is returned.

        Args:
            data_list: iterable of strings.

        Returns:
            list[str]: the cleaned, non-empty entries in their original order.
        """
        logging.info(f"Extracting words without digits and punctuation from {self.fileName}")
        # Deleting characters through a translation table avoids rewriting
        # the caller's list element by element.
        unwanted = str.maketrans("", "", string.digits + string.punctuation)
        cleaned = [entry.translate(unwanted) for entry in data_list]
        return self.null_remover(cleaned)

    def null_remover(self, data_list):
        """Return a new list without the empty strings of ``data_list``."""
        return [entry for entry in data_list if entry != '']

    def dataset_zipper(self, *args):
        """Combine several datasets row-wise into a list of tuples.

        The i-th tuple holds the i-th record of every dataset; as with
        ``zip``, the result is as long as the shortest dataset.

        Returns:
            list[tuple] | None: the zipped rows, or ``None`` on failure.
        """
        try:
            # Log only how many datasets there are; formatting ``args`` itself
            # would dump every record of every dataset into the log file.
            logging.info(f"Zipping {len(args)} datasets into one........")
            zipped = list(zip(*args))
            logging.info("Datasets zipped into one dataset successful")
            return zipped
        except Exception as e:
            logging.warning("Problem occurred while zipping datasets")
            logging.exception(e)
            return None

    def sqlite_database(self, data, db_name=None, table_name=None):
        """Store 5-column rows in an SQLite file and read them back.

        A file named ``<db_name>.db`` is opened (created if missing), a table
        with the text columns ``col1``..``col5`` is created, every row of
        ``data`` is inserted, and the table contents are returned.

        Args:
            data: iterable of 5-item sequences.
            db_name: database name without the ``.db`` suffix; asked for
                interactively when omitted.
            table_name: name of the table to create; asked for interactively
                when omitted.

        Returns:
            list[tuple] | None: all rows of the table, or ``None`` when
            anything fails, e.g. the table already exists (the error is
            logged).
        """
        logging.info("Creating SQLite database")
        db = None
        try:
            if db_name is None:
                db_name = input("Enter name of the database: ")
            if table_name is None:
                table_name = input("Enter table name: ")
            # Identifiers cannot be bound as parameters, so quote the name to
            # keep user input from being interpreted as SQL.
            quoted_table = '"' + table_name.replace('"', '""') + '"'

            db = sqlite3.connect(db_name + ".db")
            logging.info(f"Database {db_name} created successfully!!!")

            cur = db.cursor()
            cur.execute(f"create table {quoted_table}(col1 text, col2 text, col3 text, col4 text, col5 text)")
            logging.info(f"Table {table_name} created successfully!!!")

            cur.executemany(f"insert into {quoted_table} values(?,?,?,?,?)", data)
            db.commit()
            cur.execute(f"select * from {quoted_table}")
            rows = cur.fetchall()
            logging.info("Data inserted into database successfully..!!!!!")
            return rows

        except Exception as e:
            logging.warning("Problem occured while database creation")
            logging.exception("Error: %s", e)
            return None
        finally:
            # Release the connection on the error path as well.
            if db is not None:
                db.close()

    def __repr__(self):
        """Return a short description naming the dataset file."""
        return (f"This is the object of file {self.fileName}")
        

In [4]:
# Create the helper for 'vocab.enron.txt'. The constructor only stores the
# file name and logs the creation; the dataset file is not opened yet.
obj1 = words("vocab.enron.txt")

In [5]:
# Create the helper for 'vocab.kos.txt'. The constructor only stores the
# file name and logs the creation; the dataset file is not opened yet.
obj2 = words("vocab.kos.txt")

In [6]:
# Create the helper for 'vocab.nips.txt'. The constructor only stores the
# file name and logs the creation; the dataset file is not opened yet.
obj3 = words("vocab.nips.txt")

In [7]:
# Create the helper for 'vocab.nytimes.txt'. The constructor only stores the
# file name and logs the creation; the dataset file is not opened yet.
obj4 = words("vocab.nytimes.txt")

In [8]:
# Create the helper for 'vocab.pubmed.txt'. The constructor only stores the
# file name and logs the creation; the dataset file is not opened yet.
obj5 = words("vocab.pubmed.txt")

In [9]:
# Load 'vocab.enron.txt' into a list of words. text_reader() logs the error
# and returns None (instead of raising) when reading fails.
enron = obj1.text_reader()

In [10]:
# Load 'vocab.kos.txt' into a list of words. text_reader() logs the error
# and returns None (instead of raising) when reading fails.
kos = obj2.text_reader()

In [11]:
# Load 'vocab.nips.txt' into a list of words. text_reader() logs the error
# and returns None (instead of raising) when reading fails.
nips = obj3.text_reader()

In [12]:
# Load 'vocab.nytimes.txt' into a list of words. text_reader() logs the error
# and returns None (instead of raising) when reading fails.
nytimes = obj4.text_reader()

In [13]:
# Load 'vocab.pubmed.txt' into a list of words. text_reader() logs the error
# and returns None (instead of raising) when reading fails.
pubmed = obj5.text_reader()

##### Finding the occurrence count of each word in each dataset

In [None]:
# Sorted list of (word, occurrence count) tuples for the enron word list.
occur_enron = obj1.occurance_counter(enron)

In [None]:
# Print every (word, count) tuple of the enron result, one per line.
for pair in occur_enron:
    print(pair)

In [None]:
# Sorted list of (word, occurrence count) tuples for the kos word list.
occur_kos = obj2.occurance_counter(kos)

In [None]:
# Print every (word, count) tuple of the kos result, one per line.
for pair in occur_kos:
    print(pair)

In [None]:
# Sorted list of (word, occurrence count) tuples for the nips word list.
occur_nips = obj3.occurance_counter(nips)

In [None]:
# Print every (word, count) tuple of the nips result, one per line.
for pair in occur_nips:
    print(pair)

In [None]:
# Sorted list of (word, occurrence count) tuples for the nytimes word list.
occur_nytimes = obj4.occurance_counter(nytimes)

In [None]:
# Print every (word, count) tuple of the nytimes result, one per line.
for pair in occur_nytimes:
    print(pair)

In [None]:
# Sorted list of (word, occurrence count) tuples for the pubmed word list.
occur_pubmed = obj5.occurance_counter(pubmed)

In [None]:
# Print every (word, count) tuple of the pubmed result, one per line.
for pair in occur_pubmed:
    print(pair)

##### Counting the words that start with each letter of the alphabet

In [None]:
# List of (letter, count) tuples: how many enron words start with each
# lowercase ASCII letter.
start_enron = obj1.starting_occurance_counter(enron)

In [None]:
# Print every (letter, count) tuple of the enron result, one per line.
for letter_count in start_enron:
    print(letter_count)

In [None]:
# List of (letter, count) tuples: how many kos words start with each
# lowercase ASCII letter.
start_kos = obj2.starting_occurance_counter(kos)

In [None]:
# Print every (letter, count) tuple of the kos result, one per line.
for letter_count in start_kos:
    print(letter_count)

In [None]:
# List of (letter, count) tuples: how many nips words start with each
# lowercase ASCII letter.
start_nips = obj3.starting_occurance_counter(nips)

In [None]:
# Print every (letter, count) tuple of the nips result, one per line.
for letter_count in start_nips:
    print(letter_count)

In [None]:
# List of (letter, count) tuples: how many nytimes words start with each
# lowercase ASCII letter.
start_nytimes = obj4.starting_occurance_counter(nytimes)

In [None]:
# Print every (letter, count) tuple of the nytimes result, one per line.
for letter_count in start_nytimes:
    print(letter_count)

In [None]:
# List of (letter, count) tuples: how many pubmed words start with each
# lowercase ASCII letter.
start_pubmed = obj5.starting_occurance_counter(pubmed)

In [None]:
# Print every (letter, count) tuple of the pubmed result, one per line.
for letter_count in start_pubmed:
    print(letter_count)

##### Extracting only the words from 'vocab.pubmed.txt' after removing all punctuation and digits

In [None]:
# pubmed entries with digits and punctuation removed; entries that end up
# empty are dropped from the returned list.
extract_pubmed = obj5.word_extracter(pubmed)

In [None]:
# Print every cleaned pubmed word, one per line.
for cleaned_word in extract_pubmed:
    print(cleaned_word)

##### Zipping all five datasets into one and creating an SQLite database for it

In [14]:
# Combine the five word lists row-wise into 5-tuples (enron, kos, nips,
# nytimes, pubmed); zip() stops at the shortest list.
zipped = obj1.dataset_zipper(enron, kos, nips,nytimes, pubmed)

In [15]:
# Preview the first 20 zipped rows. Slicing (rather than indexing 0..19)
# keeps the cell from raising IndexError when fewer than 20 rows exist.
for row in zipped[:20]:
    print(row, "\r")

('aaa', 'aarp', 'a2i', 'aah', '>=') 
('aaas', 'abandon', 'aaa', 'aahed', '>>') 
('aactive', 'abandoned', 'aaai', 'aaron', '>>>') 
('aadvantage', 'abandoning', 'aapo', 'aback', '>/=') 
('aaker', 'abb', 'aat', 'abacus', '->') 
('aap', 'abc', 'aazhang', 'abajo', '--') 
('aapg', 'abcs', 'abandonment', 'abalone', '-->') 
('aaron', 'abdullah', 'abbott', 'abandon', '-/-') 
('aarp', 'ability', 'abbreviated', 'abandoned', '-/+') 
('aas', 'aboard', 'abcde', 'abandoning', '/-') 
('aau', 'abortion', 'abe', 'abandonment', '/+-') 
('ab1890', 'abortions', 'abeles', 'abandono', '..') 
('ab1x', 'abraham', 'abi', 'abarnard', '...') 
('ab31x', 'abrams', 'abilistic', 'abashed', '+-') 
('aba', 'abroad', 'abilities', 'abate', '+/') 
('abacus', 'absence', 'ability', 'abated', '+/--') 
('abag', 'absent', 'abl', 'abatement', '+/?') 
('abalone', 'absentee', 'able', 'abating', '+/+') 
('abandon', 'absolute', 'ables', 'abbey', '++') 
('abandoned', 'absolutely', 'ablex', 'abbot', '+++') 


In [16]:
# Store the zipped rows in an SQLite table and read them back. The call
# prompts for the database name and the table name, and returns None
# (after logging the error) if anything fails.
database = obj1.sqlite_database(zipped)

Enter name of the database: sqllite_task
Enter table name: bag_of_words


In [17]:
# Here is your database (first 10 rows). Slicing avoids an IndexError when
# the table holds fewer than 10 rows.
for row in database[:10]:
    print(row)

('aaa', 'aarp', 'a2i', 'aah', '>=')
('aaas', 'abandon', 'aaa', 'aahed', '>>')
('aactive', 'abandoned', 'aaai', 'aaron', '>>>')
('aadvantage', 'abandoning', 'aapo', 'aback', '>/=')
('aaker', 'abb', 'aat', 'abacus', '->')
('aap', 'abc', 'aazhang', 'abajo', '--')
('aapg', 'abcs', 'abandonment', 'abalone', '-->')
('aaron', 'abdullah', 'abbott', 'abandon', '-/-')
('aarp', 'ability', 'abbreviated', 'abandoned', '-/+')
('aas', 'aboard', 'abcde', 'abandoning', '/-')
