In [1]:
# !pip install pycocotools

from pycocotools.coco import COCO # COCO python library
import numpy as np
import matplotlib.pyplot as plt
import pylab

import random
import string

import cv2
import os
from pickle import dump, load
import json

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Input, Dropout, Attention
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications.xception import Xception, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.utils import to_categorical

from tensorflow.keras.layers import add

from tensorflow.keras.models import Model, load_model

# small library for seeing the progress of loops.
from tqdm.notebook import tqdm

pylab.rcParams['figure.figsize'] = (8.0, 10.0)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jatin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the COCO 2017 training instance annotations; `coco` is used below for
# category and image-id lookups.
# NOTE(review): absolute, machine-specific path. It points at a different project
# folder ("Sem Project") than the captions file loaded in the next cell
# ("Project Blind") and mixes "\" with "/" — confirm both paths are intended.
coco=COCO(r"D:\SEM 5\Sem Project\annotations/instances_train2017.json")

loading annotations into memory...
Done (t=10.29s)
creating index...
index created!


In [3]:
# Load the COCO 2017 training caption annotations; `coco_caps` is used below to
# fetch the caption records of each selected image.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
annFile = r"D:\SEM 5\Project Blind\annotations\captions_train2017.json"
coco_caps=COCO(annFile)

loading annotations into memory...
Done (t=0.50s)
creating index...
index created!


In [4]:
# Placeholder for the image-URL -> captions mapping; it is re-created from scratch
# in the cell that builds the caption dataset.
dataset = dict()

In [5]:
# All category records known to the instance annotations, and the distinct
# supercategory names those records carry.
cats = coco.loadCats(coco.getCatIds())
maincategories = list({category['supercategory'] for category in cats})

print("Number of main categories: ", len(maincategories))
print("List of main categories: ", maincategories)

Number of main categories:  12
List of main categories:  ['appliance', 'sports', 'furniture', 'animal', 'outdoor', 'food', 'electronic', 'accessory', 'kitchen', 'indoor', 'vehicle', 'person']


In [6]:
# Category names, in the same order as the records in `cats`.
subcategories = [record['name'] for record in cats]

print("Number of sub categories: ", len(subcategories))
print("List of sub categories: ", subcategories)

Number of sub categories:  80
List of sub categories:  ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']


In [7]:
catIds = coco.getCatIds(catNms=subcategories)

# Map every sub-category name to the id found at the same position in catIds.
# NOTE(review): this relies on getCatIds returning ids in the order of `subcategories`.
subcategories_Ids = {
    name: catIds[position] for position, name in enumerate(subcategories)
}

print("Sub categories with IDs :",subcategories_Ids)

Sub categories with IDs : {'person': 1, 'bicycle': 2, 'car': 3, 'motorcycle': 4, 'airplane': 5, 'bus': 6, 'train': 7, 'truck': 8, 'boat': 9, 'traffic light': 10, 'fire hydrant': 11, 'stop sign': 13, 'parking meter': 14, 'bench': 15, 'bird': 16, 'cat': 17, 'dog': 18, 'horse': 19, 'sheep': 20, 'cow': 21, 'elephant': 22, 'bear': 23, 'zebra': 24, 'giraffe': 25, 'backpack': 27, 'umbrella': 28, 'handbag': 31, 'tie': 32, 'suitcase': 33, 'frisbee': 34, 'skis': 35, 'snowboard': 36, 'sports ball': 37, 'kite': 38, 'baseball bat': 39, 'baseball glove': 40, 'skateboard': 41, 'surfboard': 42, 'tennis racket': 43, 'bottle': 44, 'wine glass': 46, 'cup': 47, 'fork': 48, 'knife': 49, 'spoon': 50, 'bowl': 51, 'banana': 52, 'apple': 53, 'sandwich': 54, 'orange': 55, 'broccoli': 56, 'carrot': 57, 'hot dog': 58, 'pizza': 59, 'donut': 60, 'cake': 61, 'chair': 62, 'couch': 63, 'potted plant': 64, 'bed': 65, 'dining table': 67, 'toilet': 70, 'tv': 72, 'laptop': 73, 'mouse': 74, 'remote': 75, 'keyboard': 76, 'c

In [8]:
# For each sub-category, keep a fresh list of the image ids that coco.getImgIds
# returns for that category id.
subcategories_imageIds = {}
for position, category_id in enumerate(catIds):
    subcategories_imageIds[subcategories[position]] = list(
        coco.getImgIds(catIds=category_id)
    )

print("Sub categories with Image IDs :",len(subcategories_imageIds))

Sub categories with Image IDs : 80


In [9]:
# Image ids of the two categories used for training, concatenated; an image that
# contains both categories is listed twice in train_cats.
train_cats = [
    *subcategories_imageIds['bicycle'],
    *subcategories_imageIds['airplane'],
]
# NOTE(review): getImgIds is presumably what removes those duplicates — confirm.
imgIdss = coco.getImgIds(imgIds=train_cats)
print("Total Images: ", len(imgIdss))

Total Images:  6221


In [10]:
# Build `dataset`: image URL -> list of cleaned captions wrapped in <start>/<end> tags.
dataset = dict()
imgcaptions = []

# One translation table, shared by all captions, that deletes ASCII punctuation.
strip_punctuation = str.maketrans('', '', string.punctuation)

for image_id in imgIdss:
    image_info = coco.loadImgs(image_id)[0]
    caption_records = coco_caps.loadAnns(coco_caps.getAnnIds(imgIds=image_info['id']))

    imgcaptions = []
    for record in caption_records:
        text = record['caption'].translate(strip_punctuation)

        # NOTE(review): '-' is part of string.punctuation, so it was already deleted
        # on the line above and this replacement never matches ("well-known" becomes
        # "wellknown"). Kept as-is: changing it would alter the vocabulary that the
        # tokenizer, and therefore the saved model, depend on.
        text = text.replace("-", " ")

        # Lower-case every word and collapse runs of whitespace to single spaces.
        normalized = " ".join(map(str.lower, text.split()))

        # The <start>/<end> tags mark the sentence boundaries for the LSTM decoder.
        imgcaptions.append('<start> ' + normalized + ' <end>')

    dataset[image_info['coco_url']] = imgcaptions


print("Length of Dataset: ",len(dataset))
print(dataset['http://images.cocodataset.org/train2017/000000047084.jpg'])

Length of Dataset:  6221
['<start> a jumbo jet plane connected to a boarding deck <end>', '<start> a large blue passenger plane sits on the tarmac at the airport <end>', '<start> a blue commercial airplane parked at a jet way <end>', '<start> a large airplane that is sitting out on the runway <end>', '<start> a blue plane at the airport being offloaded <end>']


In [11]:
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt


from itertools import chain
# Every cleaned caption of every image, as one flat list, used to fit the vocabulary.
flatten_list = [
    caption_text
    for image_captions in dataset.values()
    for caption_text in image_captions
]
tokenizer = Tokenizer(oov_token='<oov>')
tokenizer.fit_on_texts(flatten_list)

# NOTE(review): hard-coded; presumably the padded sequence length the saved model
# was trained with — confirm before changing.
max_length = 46

# Previously trained captioning model, loaded from a path relative to the notebook.
model = load_model('models/mymodel7_0.h5')



def extract_features(filename, model):
    """Load an image, scale it to [-1, 1] and run it through ``model``.

    Args:
        filename: Path (or file object) accepted by ``Image.open``.
        model: Object exposing ``predict``; it receives a batch of shape
            ``(1, 299, 299, 3)``.

    Returns:
        Whatever ``model.predict`` returns for the single-image batch.

    Raises:
        OSError: If the image cannot be opened or decoded. The hint is printed
            first; previously execution fell through to an unbound ``image``.
    """
    try:
        # The context manager closes the file handle. convert("RGB") yields exactly
        # three channels for grayscale, palette and RGBA inputs alike, where the old
        # ``shape[2] == 4`` check crashed on 2-D (grayscale/palette) arrays.
        with Image.open(filename) as source_image:
            image = source_image.convert("RGB").resize((299, 299))
    except OSError:
        print("ERROR: Couldn't open image! Make sure the image path and extension is correct")
        raise

    image = np.array(image)
    image = np.expand_dims(image, axis=0)
    # Map pixel values from [0, 255] to [-1, 1].
    image = image / 127.5 - 1.0
    return model.predict(image)

def word_for_id(integer, tokenizer):
    """Return the vocabulary word whose index is ``integer``, or ``None``.

    Args:
        integer: Word index to look up.
        tokenizer: Object with a ``word_index`` mapping (word -> index) and,
            optionally, the reverse ``index_word`` mapping (index -> word).

    Returns:
        The matching word, or ``None`` when no word has that index.
    """
    # This runs once per generated word, so prefer the O(1) reverse map when the
    # tokenizer provides one instead of scanning the whole vocabulary.
    index_word = getattr(tokenizer, "index_word", None)
    if index_word:
        return index_word.get(integer)

    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None


# Lazily built Xception feature extractor, shared by all caption_generator calls.
_XCEPTION_ENCODER = None


def _get_xception_encoder():
    """Build the Xception feature extractor on first use and reuse it afterwards."""
    global _XCEPTION_ENCODER
    if _XCEPTION_ENCODER is None:
        _XCEPTION_ENCODER = Xception(include_top=False, pooling="avg")
    return _XCEPTION_ENCODER


def caption_generator(img):
    """Greedily decode a caption for the image at ``img`` with the loaded model.

    NOTE: a later cell defines ``caption_generator(img_pil)``, which replaces this
    name once that cell has run.

    Args:
        img: Image path passed on to ``extract_features``.

    Returns:
        The generated text, beginning with ``'start'`` and ending with ``'end'``
        when the model emits that token within ``max_length`` steps.
    """
    # The encoder used to be rebuilt on every call; it is now created once.
    photo = extract_features(img, _get_xception_encoder())
    in_text = 'start'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        pred = model.predict([photo, sequence], verbose=0)
        word = word_for_id(np.argmax(pred), tokenizer)

        # Stop when the predicted index has no word in the vocabulary.
        if word is None:
            break
        in_text += ' ' + word

        if word == 'end':
            break
    return in_text

In [12]:
# Silence Python warnings and log records at WARNING level and below. This is done
# before `transformers` is imported, so the statement order here matters.
# NOTE(review): the next cell repeats these five lines verbatim, so the pipeline is
# constructed twice when the notebook is run top to bottom.
import warnings,logging
warnings.simplefilter('ignore')
logging.disable(logging.WARNING)
from transformers import pipeline
# No model is named, so the library's default for the 'image-to-text' task is used.
caption = pipeline('image-to-text')


In [13]:
# Silence Python warnings and log records at WARNING level and below before
# `transformers` is imported (statement order matters).
# NOTE(review): identical to the previous cell; one of the two copies is redundant.
import warnings,logging
warnings.simplefilter('ignore')
logging.disable(logging.WARNING)
from transformers import pipeline
# No model is named, so the library's default for the 'image-to-text' task is used.
caption = pipeline('image-to-text')
def caption_generator(img_pil):
    """Return the first caption the ``caption`` pipeline produces for ``img_pil``.

    Args:
        img_pil: Image object handed unchanged to the ``caption`` pipeline.

    Returns:
        The ``'generated_text'`` of the first result that has one; otherwise the
        first value of the first non-empty result (the previous behaviour); or
        ``None`` when the pipeline yields nothing usable.
    """
    for result in caption(img_pil) or []:
        # Read the caption by key instead of relying on dict ordering.
        if "generated_text" in result:
            return result["generated_text"]
        # Fallback for results without that key: first value, as before.
        for value in result.values():
            return value
    return None


In [14]:
# Phrases a user might say to end the conversation.
# NOTE(review): nothing in the visible cells reads this list — confirm it is still needed.
End_Phrases = [
    "Goodbye",
    "Exit chat",
    "Stop chatting",
    "End conversation",
    "Close chat",
    "Finish chat",
    "Thank you",
    "I'm done",
    "Logout",
    "See you later",
    "Bye for now",
    "Shut down",
    "Quit chat",
    "Terminate conversation",
    "Farewell",
    "I'm finished",
    "Conclude chat",
    "Log off",
    "Shutdown chatbot",
    "Wrap it up",
    "Cease conversation",
    "End the session",
    "Sign out",
    "Thanks, bye",
    "Time to go",
]

In [None]:
import speech_recognition as sr
import pyttsx3
import pywhatkit
from nltk.corpus import stopwords
from nltk import word_tokenize
import datetime
import requests,json
from urllib.request import urlopen
import cv2
from PIL import Image
import numpy as np



# Text-to-speech engine shared by engine_talk() and the camera loop.
engine = pyttsx3.init()
voices = engine.getProperty("voices")
# Use the second installed voice as before, but keep the engine's default voice
# when fewer than two are installed instead of failing with IndexError.
if len(voices) > 1:
    engine.setProperty("voice" , voices[1].id)

# NOTE(review): not read by any of the visible cells.
greeting=["hello","hi","whats up"]

def engine_talk(text):
    """Print ``text`` as the computer's line, then queue it on ``engine`` and run the engine."""
    transcript_line = f"Computer  =>  {text}"
    print(transcript_line)
    engine.say(text)
    engine.runAndWait()

def speak():
    """Listen on the microphone once and return the recognised command.

    Returns:
        The lower-cased transcript when it contains the wake word ``"alexa"``;
        ``None`` when the wake word is missing or recognition fails (the failure
        is printed).
    """
    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        print("Please start speaking...")
        recognizer.adjust_for_ambient_noise(source)
        try:
            audio = recognizer.listen(source)
            command = recognizer.recognize_google(audio).lower()
        except sr.UnknownValueError:
            print("Sorry, I couldn't understand what you said.")
            return None
        except sr.RequestError as e:
            print(f"Request error: {e}")
            return None

        print(f"You  =>  {command}")
        # Anything said without the wake word is ignored.
        return command if "alexa" in command else None
def weather():
    """Fetch the current weather for the city that ipinfo.io reports for this machine.

    Returns:
        ``[description, temperature]`` read from the ``"current"`` section of the
        weatherstack response, or ``None`` (after printing the reason) when the
        lookup fails.
    """
    # SECURITY(review): this API key is committed in the notebook. Set the
    # WEATHERSTACK_API_KEY environment variable to override it, and rotate the
    # embedded key. (`os` is imported in the notebook's first cell.)
    api_key = os.environ.get('WEATHERSTACK_API_KEY', '7082eafcabdc1749014c8a9c2b298ffd')

    try:
        # Close the geolocation response and never wait indefinitely on the network.
        with urlopen('http://ipinfo.io/json', timeout=10) as response:
            city_name = json.load(response)['city']
        print(city_name)

        # `params=` URL-encodes the city name (for example names containing spaces).
        response = requests.get(
            'http://api.weatherstack.com/current',
            params={'access_key': api_key, 'query': city_name},
            timeout=10,
        )
    except (OSError, ValueError, KeyError) as error:
        print(f'Error: Unable to fetch weather data. {error}')
        return None

    if response.status_code != 200:
        print(f'Error: Unable to fetch weather data. Status code: {response.status_code}')
        return None

    try:
        data = json.loads(response.text)
        # A failed lookup can still arrive with status 200 but without a "current"
        # section; report it instead of raising KeyError.
        current = data.get('current')
        if not current:
            print(f"Error: Unable to fetch weather data. Response: {data.get('error', data)}")
            return None
        return [current['weather_descriptions'][0], current['temperature']]
    except (ValueError, LookupError, AttributeError) as error:
        print(f'Error: Unable to fetch weather data. {error}')
        return None


_MONTH_NAMES = (
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
)


def _format_clock_time(now):
    """Render ``now`` on a 12-hour clock, e.g. ``12:05 pm`` or ``3:07 pm``."""
    # 0 and 12 o'clock are both spoken as "12"; 12:00-23:59 is "pm".
    hour = now.hour % 12 or 12
    suffix = "pm" if now.hour >= 12 else "am"
    return f"{hour}:{now.minute:02d} {suffix}"


def _describe_camera_feed(camera_index=0, caption_every=25, max_failed_reads=50):
    """Show the camera feed and speak a caption every ``caption_every`` frames until 'q' is pressed."""
    capt = cv2.VideoCapture(camera_index)
    if not capt.isOpened():
        capt.release()
        engine_talk("I could not open the camera")
        return

    description = ""
    counter = 0
    failed_reads = 0
    try:
        while True:
            # Read a frame from the camera
            ret, frame = capt.read()

            # Give up after repeated failed reads instead of spinning forever.
            if not ret:
                failed_reads += 1
                if failed_reads >= max_failed_reads:
                    engine_talk("I lost the camera feed")
                    break
                continue
            failed_reads = 0

            if counter % caption_every == 0:
                img_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                caption_text = caption_generator(img_pil)
                # Keep the previous description when no caption was produced.
                if caption_text:
                    description = "start " + caption_text + " end"
                    engine_talk(description)
            counter += 1

            cv2.putText(frame, description, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

            # Display the frame
            cv2.imshow('Mobile Camera', frame)

            # Exit loop if 'q' is pressed
            if cv2.waitKey(1) & 0xFF == ord('q'):
                print("Stop ________________________:)()()((()()()))")
                engine.stop()
                break
    finally:
        # Release the camera and close the window even if captioning raised.
        capt.release()
        cv2.destroyAllWindows()


def run_alexa():
    """Listen for one voice command and act on it.

    Commands are matched by substring, in this order: play, date, time,
    weather/temperature, start (camera captioning), quit/exit/bye.

    Returns:
        ``0`` when the user asked to quit, ``1`` otherwise (keep listening).
    """
    command = speak()
    if command is None:
        engine_talk("Cant hear you! Please Repeat" )
        return 1

    if "play" in command:
        # Drop filler words so only the thing to play remains.
        ignored_words = set(stopwords.words('english')) | {"alexa", "play", "hello"}
        command = ' '.join(
            word for word in word_tokenize(command) if word not in ignored_words
        )
        print("You said:", command)
        pywhatkit.playonyt(command)
        engine_talk("Playing " + command)

    elif "date" in command:
        today = datetime.datetime.now().date()
        engine_talk(f"Today date is {today.day} {_MONTH_NAMES[today.month - 1]} {today.year}")

    elif "time" in command:
        # Noon/midnight used to be spoken as "0:MM am" and minutes were not padded.
        engine_talk(f"The Current Time is {_format_clock_time(datetime.datetime.now())}")

    elif "weather" in command or "temperature" in command:
        report = weather()
        if report is None:
            # weather() returns None on failure; unpacking it used to raise TypeError.
            engine_talk("Sorry, I could not get the weather right now")
        else:
            cond, temp = report
            print(f"Climate is {cond} in your city with temperature {temp}")
            engine_talk(f"Climate is {cond} in your city with temperature {temp}")

    elif "start" in command:
        _describe_camera_feed()

    elif "quit" in command or "exit" in command or "bye" in command:
        engine_talk("Thank You its a lovely interaction")
        return 0

    else:
        print("I Could not hear you properly")
    return 1

engine_talk("Hello Dear! I am Alexa ! How can I help you")

# Keep handling voice commands until run_alexa() returns a falsy value (0 = quit).
while True:
    cycle = run_alexa()
    print("cycle   ", cycle)
    if not cycle:
        break

Computer  =>  Hello Dear! I am Alexa ! How can I help you
Please start speaking...
Sorry, I couldn't understand what you said.
Computer  =>  Cant hear you! Please Repeat
cycle    1
Please start speaking...
Sorry, I couldn't understand what you said.
Computer  =>  Cant hear you! Please Repeat
cycle    1
Please start speaking...
You  =>  the output
Computer  =>  Cant hear you! Please Repeat
cycle    1
Please start speaking...
You  =>  start the camera
Computer  =>  Cant hear you! Please Repeat
cycle    1
Please start speaking...
You  =>  alexa start the camera
Computer  =>  start a man with glasses and a beard  end
Computer  =>  start a man with glasses and a scarf  end
Computer  =>  start a man with glasses looking at himself in the mirror  end
Computer  =>  start two men in glasses are looking at a mirror  end
Computer  =>  start two men in glasses with a reflection of a man in a mirror  end
Computer  =>  start two men in a mirror with a cell phone  end
Computer  =>  start two men in a

In [48]:
# Open camera 0 and release it straight away.
# NOTE(review): presumably a manual clean-up after an interrupted camera loop, and
# the execution count (48) shows it was run out of order — confirm it is still needed.
capt = cv2.VideoCapture(0) 
capt.release()
# cv2.destroyAllWindows()