# Build a text completion language model from scratch

In [1]:
import random

In [2]:
from collections import defaultdict
graph = defaultdict(list) #  initializing it as an empty list for each key. This can be useful when you are building a graph-like structure where you have nodes (represented by keys) and each node has a list of connected nodes.

''''
Above line creates a defaultdict named graph. 
The list here is the default_factory, which means that if you try to access a key that doesn't exist in graph, it will create an empty list as the default value associated with that key. 
In other words, it ensures that every key in the defaultdict always has a default value of an empty list.
'''

"'\nAbove line creates a defaultdict named graph. \nThe list here is the default_factory, which means that if you try to access a key that doesn't exist in graph, it will create an empty list as the default value associated with that key. \nIn other words, it ensures that every key in the defaultdict always has a default value of an empty list.\n"

In [3]:
''' 
In this example, the graph defaultdict is being used to store a graph-like structure where each key represents a node, and the associated value (a list) represents the neighbors of that node. 
If a key is accessed for the first time, it automatically creates an empty list as the default value.
This can be very handy when dealing with graph algorithms or any situation where you want a default value for keys that haven't been encountered before.

'''
# Accessing a nonexistent key
print(graph['a'])

[]


In [4]:
# Adding values to an existing key
graph['b'].append(1)
graph['b'].append(2)
graph['a'].append("manash")
graph['a'].append("Mondal")



In [5]:
# accessing the values
print(graph['b'])
print(graph)


[1, 2]
defaultdict(<class 'list'>, {'a': ['manash', 'Mondal'], 'b': [1, 2]})


In [6]:
tokens = ["I", "try", "to", "learn", "something", "new", "every", "day"] # list of tokens

In [7]:
for i, token in enumerate(tokens):
    print(i, token)

0 I
1 try
2 to
3 learn
4 something
5 new
6 every
7 day


In [8]:
print(random.choice(tokens)) # random.choice() function to pick a random element from a list
print(random.choice(tokens)) # random.choice() function to pick a random element from a list
print(random.choice(tokens)) # random.choice() function to pick a random element from a list

something
every
every


In [9]:
t_graph = defaultdict(list) # initializing the defaultdict

In [10]:
print(t_graph)
print(t_graph["world"])

defaultdict(<class 'list'>, {})
[]


In [11]:
from string import punctuation # importing the punctuation symbols from the string module

##### The underscore _ before a function or method name in Python is a convention that indicates that the function is intended for internal use and is not part of the public API. It's a way of signaling to other developers that this function or method is considered "private" and should not be relied upon or called directly from outside the class or module.

In [13]:
class MyClass:
    def __init__(self):
        # ...
        pass

    def _internal_function(self):
        # This is considered internal and should not be called from outside the class
        pass

    def public_function(self):
        # This is part of the public API and can be called from outside the class
        pass


'''' 
By convention, names with a single leading underscore (e.g., _internal_function) are considered internal, and developers are encouraged not to use them directly. 
However, it's important to note that Python doesn't enforce the privacy of these names, and they can still be accessed from outside the class or module if desired. 
The use of a single leading underscore is more of a suggestion to other developers rather than a strict access control mechanism.
'''

In [14]:
str.maketrans("", "", punctuation + "1234567890")

''' 
# First Argument (x): Characters to be replaced.
In str.maketrans(x, ...), the first argument x specifies characters in the input string that should be replaced. 
However, in your case, the value is an empty string "". This means there are no specific characters you want to replace.

# Second Argument (y): Characters to be replaced with.
In str.maketrans(..., y, ...), the second argument y specifies the characters that the corresponding characters in the first argument should be replaced with. Here, the second argument is also an empty string "", meaning that there are no replacement characters.
So, the combination of str.maketrans("", "") in the first two arguments essentially says, "Do not replace any characters."

# Third Argument (z): Characters to be removed or mapped.
In str.maketrans(..., ..., z), the third argument z specifies characters that should be either removed or mapped to None (removed). In your case, it's punctuation + "1234567890", meaning that any occurrence of characters in punctuation and digits will be removed from the input string.

'''



''' 
Here's a breakdown:

First and second arguments (x and y): No specific replacements are defined.
Third argument (z): Characters specified in punctuation and digits (1234567890) are to be removed.

So, the purpose of this translation table is to remove characters from the input string, specifically punctuation and digits. 
It's used later with the translate() method to clean up the text.


'''

{33: None,
 34: None,
 35: None,
 36: None,
 37: None,
 38: None,
 39: None,
 40: None,
 41: None,
 42: None,
 43: None,
 44: None,
 45: None,
 46: None,
 47: None,
 58: None,
 59: None,
 60: None,
 61: None,
 62: None,
 63: None,
 64: None,
 91: None,
 92: None,
 93: None,
 94: None,
 95: None,
 96: None,
 123: None,
 124: None,
 125: None,
 126: None,
 49: None,
 50: None,
 51: None,
 52: None,
 53: None,
 54: None,
 55: None,
 56: None,
 57: None,
 48: None}

In [22]:
from string import punctuation

text = "Hello, world!?? 123"
translation_table = str.maketrans("", "", punctuation + "1234567890")

result = text.translate(translation_table)
print("Text var: ", text, end="\n\n")
print("--- After translate/replace -----", end="\n\n")
print("Result var:",result)

Text var:  Hello, world!?? 123

--- After translate/replace -----

Result var: Hello world 


In [23]:
# Replace 'a' with '1', 'b' with '2', and 'c' with '3'
translation_table = str.maketrans('abc', '123')

text = "abc"
result = text.translate(translation_table)
print(result)  # Output: "123"

123


In [25]:
''' 
example_text = "Hello, world!\n123"
result = _tokenize(None, example_text)
print(result)
# Output: ['Hello', 'world']

'''

' \nexample_text = "Hello, world!\n123"\nresult = _tokenize(None, example_text)\nprint(result)\n# Output: [\'Hello\', \'world\']\n\n'

In [24]:
''' 
Suppose you have the text "I love programming. Programming is fun!".

After tokenization, tokens might be ['I', 'love', 'programming', 'Programming', 'is', 'fun'].

In the loop, when i = 0 (first iteration), it considers the token 'I' and appends 'love' to the list associated with 'I' in the self.graph.

Similarly, in subsequent iterations, it adds the next token to the list associated with the current token.

After the loop, the self.graph dictionary would have information like:


{
    'I': ['love'],
    'love': ['programming'],
    'programming': ['Programming'],
    'Programming': ['is'],
    'is': ['fun']
}

'''

' \nSuppose you have the text "I love programming. Programming is fun!".\n\nAfter tokenization, tokens might be [\'I\', \'love\', \'programming\', \'Programming\', \'is\', \'fun\'].\n\nIn the loop, when i = 0 (first iteration), it considers the token \'I\' and appends \'love\' to the list associated with \'I\' in the self.graph.\n\nSimilarly, in subsequent iterations, it adds the next token to the list associated with the current token.\n\nAfter the loop, the self.graph dictionary would have information like:\n\n\n{\n    \'I\': [\'love\'],\n    \'love\': [\'programming\'],\n    \'programming\': [\'Programming\'],\n    \'Programming\': [\'is\'],\n    \'is\': [\'fun\']\n}\n\n'

In [62]:
# Python code​​​​​​‌​‌​‌​‌‌​​​​​​​‌​​‌​‌‌​​​ below
# Use print("messages...") to debug your solution.
import random
from string import punctuation
from collections import defaultdict

show_expected_result = False
show_hints = False


class MarkovChain:
    def __init__(self):
        self.graph = defaultdict(list)

    def _tokenize(self, text):
        return (
            text.translate(str.maketrans("", "", punctuation + "1234567890"))
            .replace("\n", " ")
            .split(" ")
        )

    def train(self, text):
        tokens = self._tokenize(text)
        print("Tokens: ", tokens)
        for i, token in enumerate(tokens):
            
            if (len(tokens) - 1) == i:
                break
            self.graph[token].append(tokens[i + 1])
        print("Graph: ", self.graph, end="\n\n")   

    def generate(self, prompt, length=10):
        # get the lask token from the prompt
        current = self._tokenize(prompt)[-1]
        # initialize the output
        output = prompt
        print("Word Search first starting word: ", current)
        print("Sample Output: ", output, end="\n\n")
        
        for i in range(length):
            print("Iteration: ", i, end="\n")
            print("--------------------------------------------------", end="\n")
            
            print(f"Current option ## {current} : {self.graph[current]}", end="\n")
            
            # look up the options in the graph dictionary
            options = self.graph.get(current, [])
            # print(options)
            if not options:
                continue
            # use random.choice method to pick a current option
            current = random.choice(options)
            
            print("Random Chosen word: ", current)
        
            # add the random choice to the output string
            output += " " + current
            print("Final Output of this iteration: ", output, end="\n\n")
            
            
        return output
text = """Andrey Markov was born on 14 June 1856 in Russia. 
He attended the St. Petersburg Grammar School, where some teachers saw him as a rebellious student. In his academics he performed poorly in most subjects other than mathematics. Later in life he attended Saint Petersburg Imperial University (now Saint Petersburg State University). Among his teachers were Yulian Sokhotski (differential calculus, higher algebra), Konstantin Posse (analytic geometry), Yegor Zolotarev (integral calculus), Pafnuty Chebyshev (number theory and probability theory), Aleksandr Korkin (ordinary and partial differential equations), Mikhail Okatov (mechanism theory), Osip Somov (mechanics), and Nikolai Budajev (descriptive and higher geometry). He completed his studies at the university and was later asked if he would like to stay and have a career as a Mathematician. He later taught at high schools and continued his own mathematical studies. In this time he found a practical use for his mathematical skills. He figured out that he could use chains to model the alliteration of vowels and consonants in Russian literature. He also contributed to many other mathematical aspects in his time. He died at age 66 on 20 July 1922.
Torvalds was born in Helsinki, Finland, the son of journalists Anna and Nils Torvalds,[7] the grandson of statistician Leo Törnqvist and of poet Ole Torvalds, and the great-grandson of journalist and soldier Toivo Karanko. His parents were campus radicals at the University of Helsinki in the 1960s. His family belongs to the Swedish-speaking minority in Finland. He was named after Linus Pauling, the Nobel Prize-winning American chemist, although in the book Rebel Code: Linux and the Open Source Revolution, he is quoted as saying, "I think I was named equally for Linus the Peanuts cartoon character", noting that this made him "half Nobel Prize-winning chemist and half blanket-carrying cartoon character".[8]

Torvalds attended the University of Helsinki from 1988 to 1996,[9] graduating with a master's degree in computer science from the NODES research group.[10] His academic career was interrupted after his first year of study when he joined the Finnish Navy Nyland Brigade in the summer of 1989, selecting the 11-month officer training program to fulfill the mandatory military service of Finland. He gained the rank of second lieutenant, with the role of an artillery observer.[11] He bought computer science professor Andrew Tanenbaum's book Operating Systems: Design and Implementation, in which Tanenbaum describes MINIX, an educational stripped-down version of Unix. In 1990, Torvalds resumed his university studies, and was exposed to Unix for the first time in the form of a DEC MicroVAX running ULTRIX.[12] His MSc thesis was titled Linux: A Portable Operating System.[13]

His interest in computers began with a VIC-20[14] at the age of 11 in 1981. He started programming for it in BASIC, then later by directly accessing the 6502 CPU in machine code (he did not utilize assembly language).[15] He then purchased a Sinclair QL, which he modified extensively, especially its operating system. "Because it was so hard to get software for it in Finland", he wrote his own assembler and editor "(in addition to Pac-Man graphics libraries)"[16] for the QL, and a few games.[17][18] He wrote a Pac-Man clone, Cool Man. On 5 January 1991[19] he purchased an Intel 80386-based clone of IBM PC[20] before receiving his MINIX copy, which in turn enabled him to begin work on Linux.

Linux
Main article: History of Linux
The first Linux prototypes were publicly released in late 1991.[8][21] Version 1.0 was released on 14 March 1994.[22]

Torvalds first encountered the GNU Project in 1991 when another Swedish-speaking computer science student, Lars Wirzenius, took him to the University of Technology to listen to free software guru Richard Stallman's speech.[citation needed] Torvalds used Stallman's GNU General Public License version 2 (GPLv2) for his Linux kernel.

After a visit to Transmeta in late 1996,[23] Torvalds accepted a position at the company in California, where he worked from February 1997 to June 2003. He then moved to the Open Source Development Labs, which has since merged with the Free Standards Group to become the Linux Foundation, under whose auspices he continues to work. In June 2004, Torvalds and his family moved to Dunthorpe, Oregon[24] to be closer to the OSDL's headquarters in Beaverton.

From 1997 to 1999, he was involved in 86open, helping select the standard binary format for Linux and Unix. In 1999, he was named by the MIT Technology Review TR100 as one of the world's top 100 innovators under age 35.[25]

In 1999, Red Hat and VA Linux, both leading developers of Linux-based software, presented Torvalds with stock options in gratitude for his creation.[26] That year both companies went public and Torvalds's share value briefly shot up to about US$20 million.[27][28]

His personal mascot is a penguin nicknamed Tux,[29] which has been widely adopted by the Linux community as the Linux kernel's mascot.[30]

Although Torvalds believes "open source is the only right way to do software", he also has said that he uses the "best tool for the job", even if that includes proprietary software.[31] He was criticized for his use and alleged advocacy of the proprietary BitKeeper software for version control in the Linux kernel. He subsequently wrote a free-software replacement for it called Git.

In 2008, Torvalds stated that he used the Fedora Linux distribution because it had fairly good support for the PowerPC processor architecture, which he favored at the time.[32] He confirmed this in a 2012 interview.[33] He has also posted updates about his choice of desktop environment, often in response to perceived feature regressions.

The Linux Foundation currently sponsors Torvalds so he can work full-time on improving Linux.[34]

Torvalds is known for vocally disagreeing with other developers on the Linux kernel mailing list.[35] Calling himself a "really unpleasant person", he explained, "I'd like to be a nice person and curse less and encourage people to grow rather than telling them they are idiots. I'm sorry—I tried, it's just not in me."[36][37] His attitude, which he considers necessary for making his points clear, has drawn criticism from Intel programmer Sage Sharp and systemd developer Lennart Poettering, among others.[38][failed verification][39]

On Sunday, 16 September 2018, the Linux kernel Code of Conflict was suddenly replaced by a new Code of Conduct based on the Contributor Covenant. Shortly thereafter, in the release notes for Linux 4.19-rc4, Torvalds apologized for his behavior, calling his personal attacks of the past "unprofessional and uncalled for" and announced a period of "time off" to "get some assistance on how to understand people's emotions and respond appropriately". It soon transpired that these events followed The New Yorker approaching Torvalds with a series of questions critical of his conduct.[40][41][42] Following the release of Linux 4.19 on 22 October 2018, Torvalds returned to maintaining the kernel.[43]
"""
chain = MarkovChain()
chain.train(text)
# print(chain.graph)
sample_prompt = "He was"
print("Sample prompt: ",sample_prompt, end="\n\n")
print(chain.generate(sample_prompt)) # sample_prompt is the prompt for the generated text


Tokens:  ['Andrey', 'Markov', 'was', 'born', 'on', '', 'June', '', 'in', 'Russia', '', 'He', 'attended', 'the', 'St', 'Petersburg', 'Grammar', 'School', 'where', 'some', 'teachers', 'saw', 'him', 'as', 'a', 'rebellious', 'student', 'In', 'his', 'academics', 'he', 'performed', 'poorly', 'in', 'most', 'subjects', 'other', 'than', 'mathematics', 'Later', 'in', 'life', 'he', 'attended', 'Saint', 'Petersburg', 'Imperial', 'University', 'now', 'Saint', 'Petersburg', 'State', 'University', 'Among', 'his', 'teachers', 'were', 'Yulian', 'Sokhotski', 'differential', 'calculus', 'higher', 'algebra', 'Konstantin', 'Posse', 'analytic', 'geometry', 'Yegor', 'Zolotarev', 'integral', 'calculus', 'Pafnuty', 'Chebyshev', 'number', 'theory', 'and', 'probability', 'theory', 'Aleksandr', 'Korkin', 'ordinary', 'and', 'partial', 'differential', 'equations', 'Mikhail', 'Okatov', 'mechanism', 'theory', 'Osip', 'Somov', 'mechanics', 'and', 'Nikolai', 'Budajev', 'descriptive', 'and', 'higher', 'geometry', 'He', 

In [48]:
# Example dictionary
my_dict = {
    'name': 'John',
    'age': 25,
    'city': 'New York',
    'country': 'USA'
}

# Specify the key you want to print values for
key_to_print = 'city'

# Check if the key exists in the dictionary
if key_to_print in my_dict:
    # Print the key and its values in the specified format
    print(f"{key_to_print} : {my_dict[key_to_print]}")
else:
    print(f"The key '{key_to_print}' does not exist in the dictionary.")

city : New York
