## Reading the data

In [87]:
import nltk
import re
import string

def read_data(file):
    """Return the entire contents of ``file`` as one string, decoded as UTF-8."""
    with open(file, mode="r", encoding="utf-8") as handle:
        return handle.read()

# Load the corpus, lower-case it, and delete every ASCII punctuation character.
punctuation_table = str.maketrans('', '', string.punctuation)
data = read_data('data.txt').lower().translate(punctuation_table)

# Each line of the file is treated as one document.
lines = data.split('\n')
print(lines)

['machine learning ml is the study of computer algorithms that can improve automatically through experience and by the use of data', ' it is seen as a part of artificial intelligence', 'machine learning algorithms build a model based on sample data known as training data in order to make predictions or', 'decisions without being explicitly programmed to do so machine learning algorithms are used in a wide variety of applications', 'such as in medicine email filtering speech recognition and computer vision', 'where it is difficult or unfeasible to develop conventional algorithms to perform the needed tasks']


## Stop Words

In [95]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

# Corpus-specific words to discard on top of NLTK's English stop-word list.
addtional_words = [
    'used', 'use', 'wide', 'vision', 'perform', 'develop', 'difficult',
    'without', 'known', 'variety', 'unfeasible', 'part', 'seen', 'speech',
    'study', 'tasks', 'training', 'improve', 'needed',
]
stop_words = stopwords.words('english')
useless_words = [*stop_words, *addtional_words]
#print("stop_words\n", stop_words)

## Cleaning & Modeling the data with TfidfVectorizer

In [100]:
# Unigrams and bigrams with the custom stop words removed. max_features caps the
# vocabulary at 20 terms; drop that option to keep the full vocabulary.
vectorizer_options = dict(
    stop_words=useless_words,
    ngram_range=(1, 2),
    max_features=20,
)
vectorizer = TfidfVectorizer(**vectorizer_options)
matrix = vectorizer.fit_transform(lines)
features = vectorizer.get_feature_names_out()
print("Feature Names \n", features)

Feature Names 
 ['algorithms' 'computer' 'data' 'filtering' 'filtering recognition'
 'intelligence' 'learning' 'learning algorithms' 'learning ml' 'machine'
 'machine learning' 'make' 'make predictions' 'medicine' 'order'
 'predictions' 'programmed' 'programmed machine' 'recognition'
 'recognition computer']


In [101]:
# The TF-IDF matrix has one row per document (line) and one column per feature.
n_documents, n_features = matrix.shape
print("Size of the Matrix \n", (n_documents, n_features))

Size of the Matrix 
 (6, 20)


In [102]:
import numpy as np
from numpy import unravel_index

def top_features(number_of_features, scores=None, feature_names=None):
    """Return the names of the highest-scoring features, best first.

    A feature's score is the largest value found anywhere in its column.
    Features are picked greedily: take the column holding the current global
    maximum, record its name, exclude that column, and repeat. Ties are
    resolved by row-major position of the maximum, i.e. the first row that
    reaches the value wins, then the lowest column index.

    Parameters
    ----------
    number_of_features : int
        How many feature names to return. If it exceeds the number of columns,
        every column is returned exactly once (no duplicates).
    scores : 2-D array-like or sparse matrix, optional
        Documents-by-features score matrix. Defaults to the notebook-level
        ``data`` array. The input is never modified; a private copy is used.
    feature_names : sequence, optional
        Name of each column of ``scores``. Defaults to the notebook-level
        ``features`` array.

    Returns
    -------
    list
        Feature names ordered from highest to lowest column maximum.

    Raises
    ------
    ValueError
        If ``scores`` is not two-dimensional.
    """
    if scores is None:
        scores = data  # dense TF-IDF array built in the notebook
    if feature_names is None:
        feature_names = features
    if hasattr(scores, "toarray"):
        # Accept scipy sparse matrices directly.
        scores = scores.toarray()

    # Private float copy: the caller's matrix stays intact, so calling this
    # function repeatedly always yields the same answer.
    remaining = np.array(scores, dtype=float)
    if remaining.ndim != 2:
        raise ValueError("scores must be a 2-D (documents x features) array")
    if remaining.size == 0:
        return []

    # Never pick more features than there are columns.
    picks = min(number_of_features, remaining.shape[1])
    list_features = []
    for _ in range(picks):
        _, column = np.unravel_index(remaining.argmax(), remaining.shape)
        list_features.append(feature_names[column])
        # Mask the chosen column with -inf (not 0) so it can never be selected
        # again, even when every remaining score is zero.
        remaining[:, column] = -np.inf
    return list_features

In [103]:
# Densify the sparse TF-IDF matrix; top_features() reads it via the global `data`.
data = matrix.toarray()
top_n = 10
vector = top_features(top_n)
print(vector)

['intelligence', 'algorithms', 'data', 'learning ml', 'programmed', 'programmed machine', 'filtering', 'filtering recognition', 'medicine', 'recognition']


In [93]:
import pandas as pd

# Tabular view of the TF-IDF scores: one row per document, one column per feature.
# NOTE(review): the saved output of this cell appears stale. Its execution count
# (93) precedes the vectorizer cell (100) and it lists columns outside the 20
# retained features; re-run the notebook top to bottom to refresh it.
tfidf_table = pd.DataFrame(matrix.toarray(), columns=vectorizer.get_feature_names_out())
tfidf_table

Unnamed: 0,algorithms,algorithms applications,algorithms automatically,algorithms build,algorithms perform,applications,artificial,artificial intelligence,automatically,automatically experience,...,order,order make,perform,predictions,programmed,programmed machine,recognition,recognition computer,sample,sample data
0,0.170306,0.0,0.287069,0.0,0.0,0.0,0.0,0.0,0.287069,0.287069,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.57735,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.128994,0.0,0.0,0.217433,0.0,0.0,0.0,0.0,0.0,0.0,...,0.217433,0.217433,0.0,0.217433,0.0,0.0,0.0,0.0,0.217433,0.217433
3,0.183414,0.309163,0.0,0.0,0.0,0.309163,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.309163,0.309163,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.33957,0.33957,0.0,0.0
5,0.235391,0.0,0.0,0.0,0.396777,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.396777,0.0,0.0,0.0,0.0,0.0,0.0,0.0
