## Reading the data

In [60]:
import nltk
import re
import string

def read_data(file):
    with open(file, "r", encoding="utf-8") as f:
        data = f.read()
    return data

data = read_data('data.txt')
data = data.lower()
data = data.translate(str.maketrans('', '', string.punctuation))
lines = data.split('\n')
print(lines)

['robotics design construction and use of machines robots to perform tasks done traditionally by human beings', 'robots are widely used in such industries as automobile manufacture to perform simple repetitive tasks', 'and in industries where work must be performed in environments hazardous to humans many aspects of robotics involve artificial intelligence', 'robots may be equipped with the equivalent of human senses such as vision touch and the ability to sense temperature', 'some are even capable of simple decision making and current robotics research is geared toward devising', 'robots with a degree of selfsufficiency that will permit mobility and decisionmaking in an unstructured environment', 'today’s industrial robots do not resemble human beings a robot in human form is called an android']


## Extracting english verbs

In [61]:
def extracted_english_word(file_name):
    words = read_data(file_name)
    words = words.translate(str.maketrans('', '', string.punctuation))
    words = words.split('\n')
    
    vectorizer = TfidfVectorizer()
    matrix  = vectorizer.fit_transform(words)
    words = vectorizer.get_feature_names_out()
    words_list = []
    for i in range(len(words)):
        words_list.append(words[i])
    return words_list
    

## Stop Words

In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

verbs = extracted_english_word('verbs.txt')
adjectives = extracted_english_word('adjectives.txt')
adverbs = extracted_english_word('adverbs.txt')
#print(adverbs)

addtional_words = ['used', 'human', 'wide', 'vision', 'ability', 'repetitive', 'performed', 'environments', 'permit', 'sense', 'beings', 'use', 'must', 'may', 'work', 'simple' 'traditionally', 'called', 'perform','done', 'develop','difficult', 'without', 'form', 'variety','unfeasible', 'part', 'seen', 'speech', 'study', 'tasks', 'training','improve', 'needed']
stop_words = stopwords.words('english')
useless_words = stop_words + addtional_words + verbs + adjectives + adverbs
#print("stop_words\n", useless_words)

## Cleaning & Modeling the data with TF-IDFVectorizer

In [67]:
#vectorizer = TfidfVectorizer(stop_words = useless_words, ngram_range=(2,2),)
vectorizer = TfidfVectorizer(stop_words = useless_words, ngram_range=(2,2), max_features=20)
matrix  = vectorizer.fit_transform(lines)
features = vectorizer.get_feature_names_out()
print("Feature Names \n", features)

Feature Names 
 ['artificial intelligence' 'many aspects' 'mobility decisionmaking'
 'research geared' 'resemble robot' 'robot android' 'robotics design'
 'robotics involve' 'robotics research' 'robots degree' 'robots equipped'
 'robots resemble' 'robots traditionally' 'robots widely'
 'selfsufficiency mobility' 'senses touch' 'simple decision'
 'today industrial' 'touch temperature' 'toward devising']


In [68]:
print("Size of the Matrix \n", matrix.shape)

Size of the Matrix 
 (7, 20)


In [69]:
import numpy as np
from numpy import unravel_index

def top_features(number_of_features):
    # number of rews to take the max feature onetime only
    rows = len(data)
    list_features = []
    for i in range(number_of_features):
        max_value_index = unravel_index(data.argmax(), data.shape)
        rew = max_value_index[0]
        culomn = max_value_index[1]
        for j in range(rows):
            # making all columns of the chosen feature equal zero
            data[j][culomn] = 0.0
        list_features.append(features[culomn])
    return list_features

# Top features selected

In [70]:
data = matrix.toarray()
vector = top_features(5)
print(vector)

['robots widely', 'robotics design', 'robots traditionally', 'artificial intelligence', 'many aspects']


In [71]:
import pandas as pd
pd.DataFrame(matrix.toarray(), columns = vectorizer.get_feature_names_out())

Unnamed: 0,artificial intelligence,many aspects,mobility decisionmaking,research geared,resemble robot,robot android,robotics design,robotics involve,robotics research,robots degree,robots equipped,robots resemble,robots traditionally,robots widely,selfsufficiency mobility,senses touch,simple decision,today industrial,touch temperature,toward devising
0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.57735,0.57735,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.57735,0.0
4,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.5
5,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0
