In [1]:
import pandas as pd
import string
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split


In [2]:
paragraph = """
Machine learning is fascinating.
It allows computers to learn from data.
The more data, the better the learning.
Deep learning is a subset of machine learning.
Neural networks are at the core of deep learning.
Artificial intelligence is evolving rapidly.
Data science combines domain expertise with programming skills.
Big data plays a  CRUCIAL role in  MODERN analytics.
Natural language processing is a key part of AI.
Predictive modeling helps in forecasting future trends."""

In [3]:

def tokenize(text):
    tokens=[]
    word=""
    for char in text:
        if char in string.whitespace or char in string.punctuation:
            if word:
                tokens.append(word)
                word=""
        else:
            word+=char
    if word:
        tokens.append(word)
    return tokens

tokens=tokenize(paragraph)
lower_tokens=[word.lower() for word in tokens]
print("Tokens",tokens)
print("Lower Tokens",lower_tokens)

Tokens ['Machine', 'learning', 'is', 'fascinating', 'It', 'allows', 'computers', 'to', 'learn', 'from', 'data', 'The', 'more', 'data', 'the', 'better', 'the', 'learning', 'Deep', 'learning', 'is', 'a', 'subset', 'of', 'machine', 'learning', 'Neural', 'networks', 'are', 'at', 'the', 'core', 'of', 'deep', 'learning', 'Artificial', 'intelligence', 'is', 'evolving', 'rapidly', 'Data', 'science', 'combines', 'domain', 'expertise', 'with', 'programming', 'skills', 'Big', 'data', 'plays', 'a', 'CRUCIAL', 'role', 'in', 'MODERN', 'analytics', 'Natural', 'language', 'processing', 'is', 'a', 'key', 'part', 'of', 'AI', 'Predictive', 'modeling', 'helps', 'in', 'forecasting', 'future', 'trends']
Lower Tokens ['machine', 'learning', 'is', 'fascinating', 'it', 'allows', 'computers', 'to', 'learn', 'from', 'data', 'the', 'more', 'data', 'the', 'better', 'the', 'learning', 'deep', 'learning', 'is', 'a', 'subset', 'of', 'machine', 'learning', 'neural', 'networks', 'are', 'at', 'the', 'core', 'of', 'deep'

In [4]:
def lemmatize(token):
    lemmas={"learning":"learn",
            "computers":"computer",
            "data":"datum",
            "networks":"network"}
    
    return lemmas.get(token,token)

lemmatized_tokens=[lemmatize(token) for token in lower_tokens]
print("lemmatized tokens",lemmatized_tokens)


lemmatized tokens ['machine', 'learn', 'is', 'fascinating', 'it', 'allows', 'computer', 'to', 'learn', 'from', 'datum', 'the', 'more', 'datum', 'the', 'better', 'the', 'learn', 'deep', 'learn', 'is', 'a', 'subset', 'of', 'machine', 'learn', 'neural', 'network', 'are', 'at', 'the', 'core', 'of', 'deep', 'learn', 'artificial', 'intelligence', 'is', 'evolving', 'rapidly', 'datum', 'science', 'combines', 'domain', 'expertise', 'with', 'programming', 'skills', 'big', 'datum', 'plays', 'a', 'crucial', 'role', 'in', 'modern', 'analytics', 'natural', 'language', 'processing', 'is', 'a', 'key', 'part', 'of', 'ai', 'predictive', 'modeling', 'helps', 'in', 'forecasting', 'future', 'trends']


In [5]:
def stem(token):
    suffixes=["ing","ed","s"]
    for suffix in suffixes:
        if token.endswith(suffix):
            return token[:-len(suffix)]
    return token

stemmed_tokens=[stem(token) for token in lemmatized_tokens]
print("stemmed_tokens",stemmed_tokens)

stemmed_tokens ['machine', 'learn', 'i', 'fascinat', 'it', 'allow', 'computer', 'to', 'learn', 'from', 'datum', 'the', 'more', 'datum', 'the', 'better', 'the', 'learn', 'deep', 'learn', 'i', 'a', 'subset', 'of', 'machine', 'learn', 'neural', 'network', 'are', 'at', 'the', 'core', 'of', 'deep', 'learn', 'artificial', 'intelligence', 'i', 'evolv', 'rapidly', 'datum', 'science', 'combine', 'domain', 'expertise', 'with', 'programm', 'skill', 'big', 'datum', 'play', 'a', 'crucial', 'role', 'in', 'modern', 'analytic', 'natural', 'language', 'process', 'i', 'a', 'key', 'part', 'of', 'ai', 'predictive', 'model', 'help', 'in', 'forecast', 'future', 'trend']


In [6]:
stop_words=["is","to","the","from","and","are","at","of","a"]
filtered_tokens=[token for token in stemmed_tokens if token not in stop_words]
print("filtered tokens",filtered_tokens)

filtered tokens ['machine', 'learn', 'i', 'fascinat', 'it', 'allow', 'computer', 'learn', 'datum', 'more', 'datum', 'better', 'learn', 'deep', 'learn', 'i', 'subset', 'machine', 'learn', 'neural', 'network', 'core', 'deep', 'learn', 'artificial', 'intelligence', 'i', 'evolv', 'rapidly', 'datum', 'science', 'combine', 'domain', 'expertise', 'with', 'programm', 'skill', 'big', 'datum', 'play', 'crucial', 'role', 'in', 'modern', 'analytic', 'natural', 'language', 'process', 'i', 'key', 'part', 'ai', 'predictive', 'model', 'help', 'in', 'forecast', 'future', 'trend']


In [7]:
def create_word_vector(tokens,vector_size=50):
    vocab=set(tokens)
    word_vectors={word:torch.rand(vector_size) for word in vocab}
    
    return word_vectors
word_vectors=create_word_vector(filtered_tokens)


In [8]:
word_to_index={word:idx for idx ,word in enumerate(filtered_tokens)}
print("word to index",word_to_index)

word to index {'machine': 17, 'learn': 23, 'i': 48, 'fascinat': 3, 'it': 4, 'allow': 5, 'computer': 6, 'datum': 38, 'more': 9, 'better': 11, 'deep': 22, 'subset': 16, 'neural': 19, 'network': 20, 'core': 21, 'artificial': 24, 'intelligence': 25, 'evolv': 27, 'rapidly': 28, 'science': 30, 'combine': 31, 'domain': 32, 'expertise': 33, 'with': 34, 'programm': 35, 'skill': 36, 'big': 37, 'play': 39, 'crucial': 40, 'role': 41, 'in': 55, 'modern': 43, 'analytic': 44, 'natural': 45, 'language': 46, 'process': 47, 'key': 49, 'part': 50, 'ai': 51, 'predictive': 52, 'model': 53, 'help': 54, 'forecast': 56, 'future': 57, 'trend': 58}
