In [1]:
import os
import spacy
import pprint
import pandas as pd
import speech_recognition as sr
from scrape import get_entire_web_google_results

# PART 01

## SPEECH 2 TEXT

In [2]:
def Speech2text(audio_file):
    """Transcribe an audio file using ``Recognizer.recognize_google``.

    Parameters
    ----------
    audio_file
        Passed straight to ``sr.AudioFile``.

    Returns
    -------
    str
        The recognized text, or the sentinel string ``"Error"`` when the
        recognition call raises an ordinary exception.

    Notes
    -----
    Opening and reading the audio happens outside the ``try`` block, so
    failures there (e.g. a missing file) propagate to the caller.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        audio = recognizer.record(source)
    try:
        return recognizer.recognize_google(audio)
    except Exception:
        # Deliberately `Exception`, not a bare `except:` -- a bare clause also
        # swallows KeyboardInterrupt/SystemExit, which made it impossible to
        # interrupt the kernel while the recognition request was in flight.
        # The "Error" sentinel is kept so existing callers behave the same.
        return "Error"

#### test samples

# PART 02

In [3]:
# Sample run: transcribe one WAV file and display the returned text.
audio_file = 'data/reference/speech2text/welcome.wav'
Speech2text(audio_file)

"good morning everyone respected teachers and parents and students a special thanks to the chief guest of the day mention his flash surname it's an honour to be traced by the presence of such an accomplice after an entrepreneur"

## ABBREVIATION EXTRACTION

In [4]:
def spacy_ner_ruler(path = 'data/reference/Abbreviations.xlsx'):
    """Build a spaCy pipeline with an entity ruler fed from an Excel sheet.

    The sheet at ``path`` is read with ``pd.read_excel`` and must provide the
    columns ``Abbreviation``, ``Description`` and ``Category``. One ruler
    pattern is registered per row, using the category as the entity label and
    the abbreviation as the pattern.

    Returns
    -------
    tuple
        ``(nlp, Abb2Desc)``: the loaded ``en_core_web_sm`` pipeline with the
        ``entity_ruler`` pipe added, and a dict mapping each abbreviation to
        its description.
    """
    table = pd.read_excel(path)
    pipeline = spacy.load('en_core_web_sm')
    entity_ruler = pipeline.add_pipe('entity_ruler')

    abbreviations = table['Abbreviation'].tolist()
    descriptions = table['Description'].tolist()
    categories = table['Category'].tolist()

    # One pattern dict per spreadsheet row, in row order.
    entity_ruler.add_patterns([
        {'label': category, 'pattern': abbreviation, 'description': description}
        for abbreviation, description, category
        in zip(abbreviations, descriptions, categories)
    ])

    return pipeline, dict(zip(abbreviations, descriptions))

# Build the pipeline and the abbreviation lookup once; inference_abb reads
# both of these names as globals.
nlp_ruler, Abb2Desc = spacy_ner_ruler()

In [5]:
def inference_abb(text):
    """Return the known abbreviations detected in ``text`` as a DataFrame.

    ``text`` is run through the global ``nlp_ruler`` pipeline; every entity
    whose exact text is a key of the global ``Abb2Desc`` mapping yields one
    row. Rows follow entity order and repeated matches are kept.

    Returns
    -------
    pandas.DataFrame
        Columns ``Abbreviation`` and ``Description``.
    """
    entities = nlp_ruler(text).ents
    # Entities that are not keys of the lookup table are dropped.
    found = [ent.text for ent in entities if ent.text in Abb2Desc]

    return pd.DataFrame({
        'Abbreviation': found,
        'Description': [Abb2Desc[abbr] for abbr in found],
    })

In [6]:
# Sample run: look up abbreviations in a short sentence.
text = 'dnindis ALU ratio is 1.5'
inference_abb(text)

Unnamed: 0,Abbreviation,Description
0,A/D,Analog-to-Digital


# PART 03

## Scraping

In [None]:
def scraping_references(search_item):
    """Search for ``search_item`` and return the hits as a list of dicts.

    The query is forwarded to ``get_entire_web_google_results``; from the
    frame it returns, only the ``title``, ``description`` and ``DOI`` columns
    are kept. Each row becomes one dict keyed by those column names.
    """
    kept_columns = ['title', 'description', 'DOI']
    results = get_entire_web_google_results(search_item)
    return results[kept_columns].to_dict('records')

In [None]:
# Sample run: pass a whole paragraph (including its trailing citation) as the
# search item and pretty-print the returned records.
text = '''A computer network is a collection of interconnected devices that can communicate and exchange data with each other. The main purpose of a computer network is to facilitate communication and resource sharing between different devices, such as computers, servers, printers, and storage devices. Computer networks can be classified into different types based on their size and geographical coverage, such as local area networks (LANs), wide area networks (WANs), metropolitan area networks (MANs), and personal area networks (PANs). Computer networks have revolutionized the way we communicate and share information, enabling faster and more efficient collaboration between individuals and organizations. They are also essential for the functioning of the Internet, which is the largest and most widely used computer network in the world. References:“Tanenbaum, A. S., Wetherall, D. (2011). Computer Networks (5th ed.). Prentice Hall.”'''
response = scraping_references(text)
pprint.pprint(response)