# Preprocessing

In [None]:
import pandas as pd
import numpy as np
!pip install pickle5
import pickle5 as pickle
import datetime
import os
!pip install nltk
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
!pip install num2words
from num2words import num2words
import re
import sys
!{sys.executable} -m pip install spacy==2.2.4
!{sys.executable} -m spacy download en
import spacy

from google.colab import drive

Collecting pickle5
[?25l  Downloading https://files.pythonhosted.org/packages/f7/4c/5c4dd0462c8d3a6bc4af500a6af240763c2ebd1efdc736fc2c946d44b70a/pickle5-0.0.11.tar.gz (132kB)
[K     |██▌                             | 10kB 10.8MB/s eta 0:00:01[K     |█████                           | 20kB 15.4MB/s eta 0:00:01[K     |███████▍                        | 30kB 10.1MB/s eta 0:00:01[K     |██████████                      | 40kB 8.8MB/s eta 0:00:01[K     |████████████▍                   | 51kB 5.3MB/s eta 0:00:01[K     |██████████████▉                 | 61kB 5.9MB/s eta 0:00:01[K     |█████████████████▍              | 71kB 5.7MB/s eta 0:00:01[K     |███████████████████▉            | 81kB 5.9MB/s eta 0:00:01[K     |██████████████████████▎         | 92kB 6.5MB/s eta 0:00:01[K     |████████████████████████▉       | 102kB 6.1MB/s eta 0:00:01[K     |███████████████████████████▎    | 112kB 6.1MB/s eta 0:00:01[K     |█████████████████████████████▊  | 122kB 6.1MB/s eta 0:00:01

## Files and Folders

In [None]:
# Mount Google Drive into the Colab file system at /content/drive.
# force_remount=True re-mounts an already mounted drive; use it after new data
# has been uploaded.
drive.mount('/content/drive', force_remount=True)  # use force_remount=True param after upload of new data
# Debug helper (disabled): list the contents of the project folder on Drive.
# !ls "/content/drive/My Drive/Master/2 - FSS 2021/Information Retrieval/IR Projekt/"

# Name of the project folder inside "My Drive"; every path below is derived from it.
ir_project_drive_folder = "IR Projekt"
full_ir_project_drive_folder = "/content/drive/My Drive/{}/data/wikipedia".format(ir_project_drive_folder)

# Alternative relative data root (disabled) - presumably for running outside Colab; verify before enabling.
#full_ir_project_drive_folder = '../data/wikipedia'
# Input data is read from <root>/raw, preprocessed output goes to <root>/no-pron/preprocessed.
raw_folder = full_ir_project_drive_folder + '/raw'
preprocessed_folder = full_ir_project_drive_folder + '/no-pron/preprocessed'

# qa filenames (raw JSON inputs)
qa_wikipedia_verified_dev_filename = raw_folder + '/qa/verified-wikipedia-dev.json'
qa_wikipedia_dev_filename = raw_folder + '/qa/wikipedia-dev.json'
qa_wikipedia_test_without_answers_filename = raw_folder + '/qa/wikipedia-test-without-answers.json'
qa_wikipedia_train_filename = raw_folder + '/qa/wikipedia-train.json'

# evidence files (raw pickle input and its preprocessed counterpart)
wikipedia_evidence_file = raw_folder + '/wikipedia_evidence_dict.pkl'
preprocessed_wikipedia_evidence_file = preprocessed_folder + '/preprocessed_wikipedia_evidence_dict.pkl'

# qa files (preprocessed pickle outputs, one per raw qa JSON file above)
preprocessed_qa_wikipedia_verified_dev_filename = preprocessed_folder + '/qa/verified-wikipedia-dev.pkl'
preprocessed_qa_wikipedia_dev_filename = preprocessed_folder + '/qa/wikipedia-dev.pkl'
preprocessed_qa_wikipedia_test_without_answers_filename = preprocessed_folder + '/qa/wikipedia-test-without-answers.pkl'
preprocessed_qa_wikipedia_train_filename = preprocessed_folder + '/qa/wikipedia-train.pkl'

Mounted at /content/drive


In [None]:
def save_as_pickle(obj, filename):
    """
    save an object in a pickle file dump, creating the target directory if needed
    :param obj: object to dump
    :param filename: target file; a bare file name without a directory part is
        written relative to the current working directory
    :return: None
    """
    directory = os.path.dirname(filename)
    # dirname() is '' for a bare file name and os.makedirs('') raises, so only
    # create directories when there is a directory component. exist_ok=True
    # replaces the separate exists() check and its check-then-create race.
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(filename, 'wb') as file:
        pickle.dump(obj, file, protocol=pickle.HIGHEST_PROTOCOL)


def load_pickle(filename):
    """
    read a pickle file and return the object stored in it
    :param filename: path of the pickle file to read
    :return: the unpickled object
    """
    with open(filename, 'rb') as source:
        loaded = pickle.load(source)
    return loaded

## Preprocessing steps:


* Stop Words Removal
* Stemming (walked -> walk)
* Lemmatization (was, is -> be)
* Remove special characters
* Case folding
* Transform numbers into words (currently disabled in `preprocess`)



### Single functions

In [None]:
def case_folding(text):
  """
  convert a text to lower case
  :param text: input string
  :return: the lower-cased copy of the input
  """
  lowered = text.lower()
  return lowered

In [None]:
def remove_special_characters(text):
  """
  replace every run of non-word characters (punctuation, whitespace, ...)
  with a single space; word characters (letters, digits, underscore) are kept
  :param text: input string
  :return: string in which words are separated by exactly one space; a leading
      or trailing run of non-word characters is left as a single space
  """
  # A raw string avoids the invalid-escape warning that '\W' triggers in a
  # normal literal. Replacing each special character by a space and then
  # collapsing the resulting runs is the same as substituting '\W+' once.
  return re.sub(r'\W+', ' ', text)

In [None]:
def remove_stop_words(text):
  """
  drop all English stop words (NLTK stop word list) from a text
  :param text: input string; it is split on whitespace and compared
      case-sensitively against the stop word list
  :return: the remaining words joined by single spaces
  """
  words = text.split()
  if not words:
    # nothing to filter, so the stop word corpus does not need to be loaded
    return ''
  # Build the lookup once per call: the original re-evaluated
  # stopwords.words('english') for every single word and searched a list.
  stop_words = set(stopwords.words('english'))
  return ' '.join(word for word in words if word not in stop_words)

In [None]:
# spaCy English pipeline, loaded once for the whole notebook with the
# 'parser' and 'ner' components disabled.
nlp = spacy.load('en', disable=['parser', 'ner'])
def lemmatize_words(text):
  """
  replace every token of a text by its spaCy lemma
  :param text: input string
  :return: the lemmas joined by single spaces; tokens whose lemma is the
      pronoun placeholder '-PRON-' are left out
  """
  kept_lemmas = []
  for token in nlp(text):
    lemma = token.lemma_
    if str(lemma) == '-PRON-':
      continue
    kept_lemmas.append(lemma)
  return " ".join(kept_lemmas)

In [None]:
# Single Porter stemmer instance shared by all calls.
porter_stemmer = nltk.stem.porter.PorterStemmer()
def stem_words(words):
  """
  stem every whitespace-separated word of a text with the Porter stemmer
  :param words: input string (not a list, despite the name)
  :return: the stemmed words joined by single spaces
  """
  stems = map(porter_stemmer.stem, words.split())
  return ' '.join(stems)

In [None]:
def transform_numbers(words):
  """
  spell out purely numeric words via num2words
  :param words: iterable of word strings
  :return: list in which every word made up only of decimal digits is replaced
      by its num2words representation; all other words are kept unchanged
  """
  # isdecimal() instead of isdigit(): isdigit() is also true for characters
  # such as superscript digits ('²') that cannot be converted to a number.
  # The explicit int() hands num2words a real number rather than a string.
  return [num2words(int(word)) if word.isdecimal() else word for word in words]

### Main Preprocessing Function

In [None]:
def preprocess(text):
  """
  Run the full preprocessing pipeline on a raw text.

  The steps are applied in this order: case folding, special character
  removal, stop word removal, lemmatization, stemming. The number-to-word
  transformation (``transform_numbers``) is not part of the pipeline.

  :param text: raw input string
  :return: preprocessed string
  """
  pipeline = (
      case_folding,
      remove_special_characters,
      remove_stop_words,
      lemmatize_words,
      stem_words,
  )
  for step in pipeline:
    text = step(text)
  return text

In [None]:
example_doc = """
examples 
Allo Allo! He is a BBC television British sitcom that was broadcast on BBC One from 1982 to 1992, comprising eighty-five episodes. The story is set in a small-town café in German-occupied France during the Second World War. It is a parody of another BBC programme, the wartime drama Secret Army. Allo, Allo! was created by David Croft, who also wrote the theme music, and Jeremy Lloyd. Lloyd and Croft wrote the first six series. The remaining series were written by Lloyd and Paul Adam.  Main plot   Set during the Second World War,Allo Allo! tells the fictitious story of René Artois,  a café owner in the town of Nouvion, France. Military from the Axis powers have occupied the town  and stolen all of its valuable artefacts. These include a painting of The Fallen Madonna by Van Klomp (usually referred to as The Fallen Madonna with the Big Boobies). Two officers, Colonel Kurt von Strohm and Captain Hans Geering, have decided to keep the paintings for themselves after the war, and they coerce René into hiding them in his café. Hitler also wants the paintings, and sends Herr Flick of the Gestapo to the town to find them. Flick, in turn, conspires to keep them. The paintings are duplicated by a forger, get mixed up, lost, found and are put in knackwurst sausages, and hidden in the cellar of Café René.  Other valuable artefacts include a painting of the Cracked Vase with the big daisies by Van Gogh; the first cuckoo clock ever made; and some silver.  At the same time, the café is being used as a safe house for two brave but clueless British airmen, Fairfax and Carstairs. René is forced to work with the French Resistance, led by Michelle Dubois, who threaten to shoot him for serving Germans in his café. The far-fetched plans of the Resistance to get the airmen back to Britain repeatedly fail. These are some of the main running gags of the series.   
As part of these plans, the Resistance have placed a radio in the bedroom of René's mother-in-law, Madame Fanny La Fan, as this is the only room nobody enters unless they have to. This secret device for communication between London and the Resistance (codename "Nighthawk") is hidden under the bed, and incoming messages are signalled by light bulbs concealed in the bed-knobs – leading the mother-in-law to cry "Ze flashing knobs!". René answers with "'Allo, 'allo, zis is Night'awk, are you receiving me?", hence the title of the show ("allô" is the normal French way of greeting someone over a remote communication system).   The Resistance is also "helped" by Officer Crabtree, a British spy posing as a French policeman, sent to France because he can speak French. However, he does not speak it very well, especially the vowels, resulting in frequent malapropisms. For example, whenever he says "Good morning", it comes out as "Good moaning".  René is also trying to keep his affairs with his waitresses secret from his wife, Edith, who regularly sings in the café. But she is such an appallingly bad singer (which she does not realise herself) that visitors to her café often put cheese in their ears to block the sound. In addition, the communist resistance is plotting against René for serving Germans and for working with the Gaullist Resistance. However, the communist resistance only blow things up for money. The only reason they do not shoot René is that their leader, Denise Laroque, is in love with him, a fact he has to hide from both his wife and the waitresses, Yvette Carte-Blanche, Maria Recamier and Mimi Labonq. Furthermore, the seemingly gay German Lieutenant Gruber is also continually flirting with René and finding him in embarrassing situations. These situations are more humorous because René is not stereotypically attractive, is not considered a hero, and is often forced (against his will) by his wife to undertake missions and secret operations. 
Once, Edith memorably points a gun at René to stop him running away to hide with his cousin; when interrupted by the five German officers, he explains that his wife had been proposing to him.  In one early episode, René is arrested and shot by a German firing squad for blowing up a railway line, on the orders of General Erich von Klinkerhoffen, a ruthless general from Berlin, but the German officers put dummy bullets in the firing squad's rifles. Although René survives, he has to spend the entire series posing as his own twin brother, who is also called René. René's will bequeaths the café to Edith; so to get Café René back – or put "his fingers back in his own till", as he puts it – René tries to convince Edith to marry him again. Meanwhile, Edith is wooed by the Italian Captain Bertorelli and by Monsieur Alfonse, the undertaker who is torn between his love for Edith and his admiration for René, whom he considers a true hero of France.  These few plot devices provide the basic storyline throughout the entire series, upon which are hung classic farce set-ups, physical comedy and visual gags, ridiculous fake accents, a large amount of sexual innuendo, and a fast-paced running string of broad cultural clichés. Each episode builds on previous ones, requiring viewers to follow the series to understand the plot fully. The series revolved around individual story arcs spread across several episodes, where typically a far-fetched scheme by the Resistance to repatriate the British airmen would become intertwined with the Gestapo's attempts to recover the missing paintings and the German officers' corrupt activities, which would culminate with the three groups' plans frustrating one another and leaving them in an even worse situation than before. At the start of each subsequent episode, René summarises the plot to date for the audience (breaking the 4th wall); a gag based on the "As you remember..." device commonly used in serials. 
In reruns, some local TV stations have shuffled the episodes, making these plot synopses useful.  Characters   Most of the characters have a catchphrase, gimmick, or saying, which became easily recognisable throughout the series.  Main characters   * René François Artois (Gorden Kaye) – The local café proprietor who, whilst trying to remain impartial, has been dragged into the war by both sides. The Germans are threatening to shoot him if he does not secretly hide stolen valuables; the Resistance is using his café as a safe-house for shot-down British airmen; and on top of that, he is trying to keep his passionate love affairs with the café waitresses secret from his wife. Whenever his wife Edith catches him in the arms of another woman, René invariably responds with the phrase "You stupid woman! Can you not see that..." followed by a convoluted explanation, which Edith always believes, leading to an apology from her. René does not care much for his mother-in-law, often referring to her as a "silly old bat!" Each episode starts with scenery, costumes or props from the end of the previous episode, and (usually) René opens each episode with a monologue to the camera starting "You may be wondering why..." and proceeding to describe the situation he finds himself in, and to summarise the previous episode. Rene's only love is for his waitress Yvette Carte Blanche. * Edith Melba Artois (née La Fan) (Carmen Silvera) – René's wife, and the café's resident cabaret performer. However, her singing is so bad and tuneless (as René is once heard to comment: "my wife, who cannot carry a tune in a bucket...") that locals stick cheese in their ears to block out the noise. Whilst René views her with disdain, she is also the subject of much romantic wooing by the local undertaker Monsieur Alfonse and the Italian Captain Alberto Bertorelli. Whenever she finds René embracing one of the waitresses, she usually asks "René! What are you doing 'olding that servant girl in your arms?". 
Many fans have observed that whilst René is constantly fêted as the 'hero of the resistance' it is invariably Edith who comes up with the solution to any problem.  * Madame Fanny La Fan (Rose Hill) – Edith's mother. She lives in an attic above the café, lying in bed wearing white nighties and a goffered cap. This is also where the British airmen and the radio (complete with "Ze flashing knobs!") are hidden. When she wants attention she bangs her walking stick on the floor, and cries out "Will nobody 'ear the cries of a poor old woman?" She is partial to a glass of gin, and occasionally fills in for her daughter as part of the café cabaret, although her singing is just as bad (if not worse). She also hates the Germans, as evidenced by saying "The Germans, I spit on zem", after which she actually spits, usually to the side of the bed.  * Yvette Carte-Blanche (Vicki Michelle) – A waitress at the café. She is passionately in love with René, and wants to elope with him to Geneva in Switzerland, but cannot do so because René has to stay loyal to his wife. She is also responsible for 'entertaining' the German officers, upstairs at the café with wet celery and a flying helmet, and sometimes with an egg whisk. In her intimate moments with René, she throws her arms around him and rumbles an elongated, deep growl of "Ooooooh, René." Often clinched in the kitchen, "I was just 'anging up ze knockwurst when I remembered all ze 'appy times we 'ad in 'ere." Rene Artois is her One True Love. * Maria Recamier (Francesca Gonshaw) (series 1 to 3) – Another waitress, who is also in love with René. She has no idea that René loves Yvette, and also believes that René should run away with her. She is short statured and many of the jokes play on her small size. "May I get you something...that is not on a high shelf?" When she speaks she has the tendency to spit when she rolls her "r"s. 
She disguises herself as a Red Cross parcel after sneaking into the British POW camp's mail room in the episode 'Camp Dance', but unfortunately gets sent back to Switzerland after failing to put enough stamps on.  * Mimi Labonq  (Sue Hodge) (series 4 to 9) – The replacement waitress for Maria, she is also short in stature. She is a member of the Resistance with a bloodthirsty hatred of the Germans, and a secret mission to kill the "German swines", often after wooing them – her address on her card is "straight up the stairs, first on the left past the linen cupboard". She also has a bit of a fancy for René. **It is heavily hinted that the waitresses supplement their income by prostitution with the Germans, and Yvette frequently entices Colonel von Strohm with the promise of using "the flying helmet and the wet celery". Precisely how these are to be used is never made clear. * Michelle "of the Résistance" Dubois (Kirsten Cooke) – Leader of the local "French Charles de Gaulle (the one with the big 'ooter) Résistance", she devises elaborate plans to help British airmen escape, and to blow up German ammunition trains and lorries. Whenever she concocts a plan, she gathers everyone around and issues her instruction, "Listen very carefully, I shall say 'zis only once!", followed by the vital information. Whenever her plans are thwarted by someone's stupidity, she rebukes that person, "You fool!" She pretends to fall in love with René, but only to stop him leaving the Resistance. Michelle is also the only French character in the series who speaks English (see Languages below). * Monsieur Roger LeClerc (Jack Haig) (series 1 to 5) – The Resistance's elderly forger, 'master' of disguises (all of them unconvincing), and café piano player. He is responsible for delivering to the café various goods such as batteries, bombs, and radio equipment. He does this in a variety of disguises, ranging from an onion seller to a lost mountain hiker. 
On delivery LeClerc always says, "It is I, LeClerc", accompanied by a raising of his glasses, presumably to reveal his identity. He seems to think that he is good at disguise even though he is always instantly recognisable (René once remarks: "Ze man of a thousand faces, every one ze same!") He is also the childhood sweetheart of Madame Fanny, and often pops up out of her bed whenever Fanny says "Ze flashing knobs!", only to be pushed back down. He is an escaped convict.  * Monsieur Ernest LeClerc (Derek Royle, (series 6); Robin Parkinson, (series 7 to 9)) – This character was introduced to the series after the sudden death of Jack Haig (the actor who played Roger LeClerc), as his brother. He has many of the same characteristics, and is also a former childhood sweetheart of Madame Fanny. The character switch was explained by claiming that Roger had tried to get his brother out of prison, but ended up taking his place. When Royle died after only one series, the series' producers chose to replace him with a different actor playing the same character. * Monsieur Alfonse (Kenneth Connor) – "Alfonse, the half Belgian undertaker, 'swiftly and with style'." He is in love with Madame Edith, often wooing her with flowers and the prospect of living above the mortuary. He has a serious heart condition, causing his "dicky ticker" to go into overdrive when he glimpses the women's underclothing. He often helps the Resistance. When he hears that René is to remarry Madame Edith he challenges him to a duel, from which René eventually escapes, disguised as a woman. René is thereafter proclaimed by Alfonse as the Hero of the Resistance – "The bravest transvestite in all France". Alfonse is later due to officiate at the remarriage of René and Edith himself, in his capacity as Deputy Mayor, only to faint due to his "dicky ticker". He has set up a tunnel from his mortuary to the British POW camp to smuggle in the airmen. 
Funds for the Resistance, borrowed from him, are forged by LeClerc on their return – unfortunately leading Alfonse to feel generous and spend the money on a party at the café. * Major-General Erich von Klinkerhoffen (Hilary Minster) – A ruthless commander. He always threatens to have French peasants shot when the Resistance attacks the Germans. He occupies a rather grand château, where he is wooed by the serving girls as part of a Resistance mission to steal the knockwurst. He is later implicated in a plot to blow up Hitler, based on a conversation, misheard by the Gestapo, of a plan for a birthday party, with Hitler's painting at the head of the table, and the "blowing-up" is actually the sound of bursting cases of balloons. Von Klinkerhoffen is almost universally disliked by von Strohm and the other Germans – who unsuccessfully plot his assassination on more than one occasion. * Colonel Kurt von Strohm (Richard Marner) – The corrupt German town commandant. He is kept occupied by hiding valuable local paintings and antiques, which he intends to sell after the war. He frequently visits the café, where the waitresses provide him with much entertainment. He always gets René to do his dirty work, threatening him with the line "Othervise I vill have you shot!", with Captain Hans Geering agreeing in a high pitched voice, "He vould, he did it before!" Overweight, bumbling and greedy, the Colonel often promises René a cut of the profits but is quick to take them away. He exclaims with a big smug smile, "Ve are vinning ze var. I am a German officer and I can shoot anyone I like!". In episode 1 of series 8, Colonel von Strohm refers to himself as Kurt. * Lieutenant Hubert Gruber (Guy Siner) – A German officer on leave from the Russian front, with a crush on René. He is also responsible for forging certain pieces of art. He has the effeminate manner of a stereotypical homosexual, and owns a "little tank" (which we later find out to be called Hubert Jr. 
- possibly a reference to Hubert Selby Jr), driven by the unseen Clarence. Gruber is in charge of the firing squad that shoots René (unbeknownst to Gruber, with blanks) and feels terribly guilty about the incident. He takes over from Captain Geering as the Colonel's assistant after the captain is captured by the British. * Captain Hans Geering (Sam Kelly) (series 1 to 4, series 7) – Original assistant to Colonel von Strohm, he has various characteristics that run counter to the stereotype of a Nazi officer (for example, he is not at all shocked to discover that his uniform is being made by a Jewish tailor). He frequently visits the café. He is mistaken for a British airman in 'Camp Dance' and sent to Britain. He returns briefly in one episode of series 7, having accidentally become a trusted member of British intelligence. He is astonished to discover that his friends René and Edith are actually the mysterious 'Nighthawk', but is happy in his new life, having become a naturalised Englishman. He is notable for his odd pronunciation of 'colonel' which usually sounds like "Colon-Nell". Whenever the Germans have to salute the Führer, Geering often heralds him by saying "-tler!" instead of the full salute, which when spoken quickly sounds like "clop!". (In a 2007 BBC special, Kelly says about his character that "Hans was just too lazy to say the whole sentence". Rumours that the actor refused to give the regular salute are false.  Indeed, in the second series episode "Herr Flick's Revenge" and the third series episodes "Flight of Fancy", "Pretty Maids All in a Row" and "The Great Un-Escape", the Hans Geering character gives the full "Heil Hitler" salute. Kelly also went on to play Hitler himself in Stalag Luft in 1993.) * Captain Alberto Bertorelli (Gavin Richards, series 4 to 6; Roger Kitter, series 7) – An Italian who has come to the local town as Benito Mussolini has joined the war. 
He has an eye for the ladies and is known as a womaniser, often using the phrase "Da Beautiful-a Liedee I kiss-a de 'and-a". When saluting the Führer, Bertorelli instead says "Heil-a Mussolini", and when things go wrong he always says "What a mistake-a to make-a!" His Italian troops are unprofessional and always run away. In greeting, he kisses everybody except Gruber whom he knows about and so shakes his hand. Famously asked about his medals: "The first row are for service in Abyssinia. The second row are for service in North Africa". The last row? "They are for servicing Fiats!" Later he puts on a big feast for Madame Edith and is seen to put some of the olive oil on his hair. * Herr Otto Flick (Richard Gibson, series 1 to 8; David Janson, series 9) – The local Gestapo officer who tries to show as little emotion as possible. He dresses in a long leather double-breasted coat over a pinstriped suit, with a wide-brimmed leather hat, leather gloves and octagonal steel-rimmed glasses. In the episode "Watch the Birdie" (series 5, episode 9), he is shown wearing an SS uniform with the insignia of Sturmbannführer, and he may actually hold that rank, given that he is the godson of Heinrich Himmler (whose telephone number, according to Herr Flick, is "Berlin 1"). He fancies the equally blonde Helga Geerhart, whom he plans to marry after the war. He has a considerably exaggerated limp, and frequently uses the word 'Gestapo' as an adjective: "My powerful Gestapo binoculars", "My Gestapo staff car" etc. In one episode he answers the phone by announcing himself as "Flick, the Gestapo"; after a short period, he is forced to explain to the caller that he said 'Flick, the Gestapo' and not 'Fick (German for "f*ck") the Gestapo'. Herr Flick often hits von Smallhausen on the head with his cane, saying "Wrong!" When Herr Flick wants Helga to kiss him, he will say in a stern manner "You may kiss me now!" 
In the episode "Pigeon Post", it is revealed that he has the same taste in undergarments as Helga. Inviting Helga to the Gestapo dance, he explains their song: "You put your left boot in! You take your left boot out! You do a lot of shouting and you shake your fists about! You light a little smokie and you burn down ze town! Zat's vot it's all about! Ahh...Himmler, Himmler, Himmler...". When Richard Gibson decided not to return for the final series, the change of actors was explained by Herr Flick having had plastic surgery to avoid capture by approaching Allied forces, rather than being a straightforward recast – such as when Roger Kitter replaced Gavin Richards as Captain Bertorelli. * Private Helga Geerhart (Kim Hartman) – The Colonel's secretary, and lover of Herr Flick. She is well-built, and known for a tendency to take off her clothes for tenuous reasons, showcasing a vast range of erotic lingerie. This could be seen as a parody of Jane, a British comic strip character popular during World War II, who was always losing her clothes and constantly being captured or found in lingerie. She says, "When he's like this I always find it's best to strip off and ask questions later". When inquiring how they are going to be together after the war, he says: "I will take you for long walks on a short lead". Helga's attempts to seduce Herr Flick usually have no effect. Typically these include a particularly vigorous kiss. When announcing visitors to the Colonel's office, Helga always yells at the top of her voice, for example, "GENERAL ERICH VON KLINKERHOFFEN!" and "GO A-VAY!" Helga was a Lance Corporal for several episodes. * Herr Engelbert von Smallhausen (John Louis Mansi) (series 2 to 9) – Herr Flick's assistant. Dressed exactly like Herr Flick but only half as tall. He also copies his exaggerated limp. He often suggests stupid plans and ideas, only for them to be put down by Herr Flick. 
He once delivers an out-of-date ransom note from the Resistance, saying "It vas tied around a brick and thrown at my head – I have only just regained consciousness!" He once turns up at Herr Flick's dungeon banging on the door without success. The door then explodes and he enters over the wreckage to say apologetically "I forgot my key!"; Herr Flick then berates him for the misuse of Gestapo dynamite. He is often sent on spying missions or to eavesdrop on the radio, but he likes listening to Tommy Handley. When hypnotised in one episode, he reveals that his real name is Bobby Cedric von Smallhausen. * Officer (Captain) Crabtree (Arthur Bostrom) (series 2 to 9) – A British spy posing as a French police officer. Unfortunately, his French is weak and he is invariably unable to use the correct vowel sounds, which means that sometimes he is quite incomprehensible, most famously in his usual greeting "Good moaning!" (which he is even heard to use at night). Despite this, the Germans never seem to suspect him. To quote a notable example: "I was pissing by the door when I heard two shats. You are holding in your hind a smoking goon. You are clearly the guilty potty!" Another, during an air raid, is: "They have had a direct hot on the pimps!" "The pimps?" "The pimps! The pimps in the pimping station! No water is being pimped through the poops!" To repair the airmen's air balloon: "You must get your hands on girl's knockers. At least farty, maybe fifty." And: "I am mauving in a ginger fashion becerrs my poloceman's pints are full of dinamote!" He then unbuttons his flies and slowly pulls out several large knockwurst in front of the watching café patrons. After Crabtree is introduced in the series, Yvette frequently announces him as "That idiot British officer who thinks he can speak French". He says, "I admit my Fronch cod be butter." 
Another example comes when Officer Crabtree mistakes Captain Alberto Bertorelli for a German officer, addressing him with a raised hand and: "Hole Hotler!" (as taken from script) instead of "Heil Hitler!" When Captain Bertorelli points out he is actually Italian, Officer Crabtree responds with: "Hael Missuloni!" instead of "Hail Mussolini!" When Flick and von Smallhausen (in disguise as "vinkle" salesmen) comment on his strange accent, he tells them he's from "Nipples" (Naples). When they can't understand the answer, he frustratingly states "You know...See Nipples and do!" * RAF Flight Lieutenants Fairfax and Carstairs (John D. Collins and Nicholas Frankau) (series 1–7, series 9) – Two British airmen who are trying to get back to the United Kingdom, their plane having been shot down. Emerging from where they are hiding, they say "Hello!" with an exaggerated upper-class English accent. When talking to one another, Fairfax or Carstairs always start with the words "I say, Fairfax/Carstairs...". On discovery of the tunnel to the British POW camp, all the café staff are trapped there, including the Resistance and the hostage German officers, who then all have to adopt exaggerated RP accents as POWs, with large moustaches and flying helmets. On inspection by the German camp guards, they stand to attention saying clichés like "Toodle pip! Good Show! Bang on! Old fruit!" Humour is also derived from the French not being able to understand what the British airmen are saying, and vice versa, even though all the lines in the show are spoken in English. One of the minor characters of the show remarks on Fairfax and Carstairs' origin being from 402 Squadron, although in reality this was a Canadian RAF Squadron, whereas both characters are from England.  Recurring characters   * General Leopold von Flockenstuffen (Ken Morley) (series 5–7) – A German general, whose sexuality is similar to that of Gruber. 
At one point he has to take over command of the district when von Klinkerhoffen is considered to have gone completely mad. * Denise Laroque (Moira Foot) (series 5) – Original leader of the Communist resistance and childhood sweetheart of René. * Louise (Carole Ashby) (series 5–9) – Later leader of the Communist resistance, she is also in love with René. * Henriette (Phoebe Scholfield) (series 1–2 & 5–6) – Michelle's assistant in the Resistance. Often appears alongside Michelle during attempts to save the British airmen. * Corporal Caponi (John Banks) (series 5–6) – Captain Bertorelli's second-in-command of the Italian troops stationed in Nouvion. * Private Elsa Bigstern (Louise Gold) (series 7) is Helga's replacement when she leaves for a course. She is a masculine type with red hair and a booming voice. She is young, keen and eager, to the point where she stuns General von Klinkerhoffen and Colonel von Strohm. She starts a relationship with Herr Flick, but seems the more dominant of the two, much to his disgust. When Helga returns, Elsa disappears without explanation. * Dr LeConte (David Rowlands) (series 8–9) The local doctor in Nouvion, with asthma and in a worse state than anyone else in the town. * Clarence – Lt. Gruber's tank driver. Drives Gruber's little tank quite often for him, but is never actually seen in the flesh. Gruber often gives him orders to drive the tank, and is sometimes mentioned by him while in René's café. * Madame Lennard - Aside from Clarence, Madame Lennard is the most frequently-mentioned character. She works as a milliner and dressmaker. She models a see-through nightdress for Edith when she plans to remarry René, and René, who was looking through the keyhole, enjoys watching her as she is "well-stacked". 
By series eight, Dr Le Conte does pregnancy tests for both the newly widowed Madame Lennard and Yvette; but the frogs he used for the pregnancy test jumped into each other's jars, meaning Yvette was not pregnant with René's child (as she had believed), meaning that Madame Lennard was pregnant out of wedlock. * The Fallen Madonna (With The Big Boobies) by Van Klomp – A valuable portrait whose location and authenticity is a key concern to other characters, the original changing hands frequently, as well as various fake copies. Other antiques (such as a painting referred to as The Cracked Vase with the Big Daisies by Van Gogh, essentially one of the Sunflower paintings) occasionally crop up, but The Fallen Madonna often recurs throughout all of the series, often hidden in sausages or other guises. No one ever knows who has the original. Once Herr Flick manages to get hold of three copies and comments "I have three paintings with six big boobies!"  The late Lord Bath was a big fan of Allo 'Allo!, and in 1992 created an exhibition in his ancestral home Longleat. In return the BBC made a copy of the painting of the Fallen Madonna, which may still be seen today.  Character table   Key  *A dark grey cell indicates the character was not in that Series. *(a) indicates that the character returned in a one episode cameo. *The table shows only characters written in with new scenes, not appearances in archive footage.  Languages   It could have been tricky to represent to the audience the (perhaps) four different languages (French, German, Italian and English) spoken by the characters. The programme uses the device of representing each language with English spoken in a theatrical foreign accent.  
For example, an exchange between French-speaking characters, conducted in English with a French accent, is totally incomprehensible to the British airmen until Michelle (the only French character who speaks English) switches to Bertie Wooster-esque "top hole, old chap" style banter in an upper-class English accent. The British undercover officer Crabtree, in the permanent disguise of a French-speaking gendarme, speaks abominable French. His (presumed) mangling of French vowels is represented by similarly distorted English, most famously his customary greeting catchphrase of "Good moaning"; many of his distortions come out as innuendoes, such as "I was pissing by the door, and I thought I would drip in".  The Germans, generally, speak in a more guttural way than the French. Bertorelli, the Italian captain, speaks in a nasal tone, generally adding an "-a" at the end of certain words: for instance in his catchphrase, "What a mistake-a to make-a!". Other examples included "We drop-a the bolls", "I kiss-a your hand-a". In spite of the difficulties in communicating with the British characters, the French, Germans, and Italians all understand each other perfectly, the implication apparently being that they all understand French (and Bertorelli understands German spoken when no French are present) which they use when talking to one another, but in which their own accents remain evident.  When one particular plan calls for Herr Flick and von Smallhausen to impersonate British airmen, a gramophone record is used to learn the 'nuances' of English. This essentially consists of the non-word sounds suitably voiced with the signature 'upper-class English accent' employed in the programme. Within the scope of the on-screen action, it is a surprisingly effective masquerade.  In one episode, René is actually forced to speak German. His voice is noticeably more high-pitched, which may be a gag concerning the way the Germans talk.  
The last few series introduced a new gag, where Colonel von Strohm and Lieutenant Gruber are put in situations where they have to speak in a strange manner. In one episode the two try to learn Spanish, which is basically "German" with high-pitched voices and mangled consonants. In another they are forced to wear "suicide teeth" – large bulky dentures containing poison – making them garble their speech to avoid releasing the poison. In yet another, von Strohm and Gruber are posing as Frenchmen, and are forced to speak French. This comes out as another set of non-words sounding like "Woffel woffel, woffel woffel". A further episode features a Swedish art dealer inspecting The Fallen Madonna, who pronounces "Heil Hitler!" as "Oil Jesus!"  Episodes   After the pilot aired in December 1982, a full-length first series of seven episodes was commissioned and aired from September 1984 onwards. Series two, three and four followed annually, with six episodes each.  Series five was commissioned with a view to syndicating the show in America.  As a result, it aired as a single long series of twenty-six episodes between September 1988 and February 1989. The attempts to air the show in America failed (although the series later became popular on PBS), and so series six had only eight episodes commissioned, which aired from September 1989 onwards.  On 25 January 1990, Gorden Kaye suffered serious head injuries in a car crash brought on by gale-force winds. This delayed the start of the seventh series, which consisted of ten episodes airing from January 1991 onwards. Series 8 (7 episodes) followed in January 1992, and the ninth and final series of six episodes aired later that year from September onwards.  Two Christmas specials were also made. The first was a 45-minute episode, which followed Series 2 in 1985, and the second was also a 45-minute episode, screened at Christmas 1991, preceding Series 8.  
In 1994, two years after the series ended, the BBC broadcast The Best of 'Allo 'Allo!, a compilation of clips from the series, linked by new scenes featuring Gorden Kaye and Carmen Silvera, in which René and Edith reminisce about the events of the war.  On 22 March 2007, a one-off special episode entitled The Return of 'Allo 'Allo! was filmed in Manchester, and was broadcast on 28 April 2007 at 9 pm on BBC 2. The storyline involves René writing his memoirs after the war, and the events from the final episode in 1992 have been overlooked. The new scenes were interspersed with clips from the original series and new interviews. The actors who reprised their roles were: Gorden Kaye, Vicki Michelle, Sue Hodge, Kirsten Cooke, Arthur Bostrom, Guy Siner, Robin Parkinson, John D. Collins and Nicholas Frankau. In addition, Richard Gibson and Sam Kelly are interviewed, although they are not reprising their respective roles. The only main characters who did not appear in the reunion at all (where the actor or actress who played the character originally was then alive) were Private Helga Geerhart (played by Kim Hartman) and Herr Engelbert von Smallhausen (played by John Louis Mansi). Jeremy Lloyd wrote the new material.    End credits   At the end of the each show, the end credits begin with a short vignette shot of each of the main characters with the actor's name displayed below. The shots are not always actual clips from the episode but usually re-enactments of a specific shot or action for each character from that episode. Being an ensemble show, the actor credits are given in the order of their first spoken line for that particular episode. Because every episode begins with René recapping the plot to camera thus far, Gorden Kaye is always first (even if he is not the first seen on screen, such as the start of episode 26 "The Sausages in the Trousers" where Mimi and Edith are first seen, but René has the first line). 
Gorden Kaye was credited first in all but one of the episodes, where he was credited second behind Carmen Silvera.  Cultural references   The show's premise was not to make fun of the war but to spoof war-based film and TV dramas, and in particular a BBC1 drama Secret Army, which ran from 1977 to 1979 and dealt with the activities of a Belgian "escape line" that returned allied pilots to Britain, working from a Brussels café and later restaurant. Many of the elements and characters are directly taken from Secret Army, such as the café owner having an affair in the restaurant under the nose of his wife, a bed-ridden woman in a room above who knocks on the floor for attention, a pianist who is also the forger, and the enmity between the Gestapo and the German military. Many storylines for  'Allo 'Allo also derive directly from episodes of Secret Army, such as the valuable paintings and the accompanying forgeries, which both the Germans and the Resistance are seeking to obtain in an episode from the second series of Secret Army. Some actors from Secret Army also appear in 'Allo 'Allo!: Richard Marner, Guy Siner, John D. Collins, Hilary Minster and David Beckett. Inspiration was also drawn from patriotic black-and-white British melodramas of the 1940s.  The French village setting is reminiscent of 1972s Clochemerle, whilst Rene's intermediary role between the Germans and the Resistance reflects a comic version of Rick from Casablanca (as well as directly matching the proprietor of the café in Secret Army).  Two of the BBC's earlier wartime-based comedies – Dad's Army and It Ain't Half Hot Mum – were also written by David Croft in partnership with Jimmy Perry. Several actors from Allo 'Allo! also appeared in these series: Carmen Silvera, Rose Hill, Jack Haig, Joy Allen, Michael Stainton, Robert Aldous, John Leeson, John D. Collins and Robin Parkinson in Dad's Army, and Robin Parkinson, Gorden Kaye, John D. Collins, Iain Rattray and Eric Dodson in It Ain't Half Hot Mum.  
The Shelburne Escape and Evasion Line (Operation Bonaparte) of the Second World War (Comet Line) has some similarities to this series. More than 300 airmen and agents escaped through this line.  Music   Having a café cabaret in the plot, music was often performed on the show. This usually took place with Madame Edith singing, and either Lt. Gruber or LeClerc at the piano. Occasionally, Gruber sang and played piano at the same time. Characters could also be seen whistling or humming tunes at certain points.  Theme tune   David Croft and Roy Moore composed the theme tune performed at the start and end of each episode. It features a French-style melody performed on an accordion. The title is London Calling, but according to Guy Siner the first lyrics are: Allo 'Allo, we meet again,And just as before...  Carmen Silvera sang the full song and was released on LP in the 1980s.   Other music   The café cabaret music usually took the form of 1930s film and show tunes – reminiscent of the way period songs were also used in Secret Army.  Most popular was "Louise" from the film Innocents in Paris (1953), which featured a number of times and was even sung in the "broken-French" language of Crabtree, who pronounced the title "Loo-woes". Gruber sang a number such as "Can't Help Lovin' Dat Man" from Show Boat or "(I Got a Woman Crazy for Me) She's Funny That Way" by Neil Monet and Richard A. Whiting. He gazed at René in a slightly lustful manner, replacing lyrics such as "woman" and "she" with "boy" and "he". He caused a particular sensation with his straight version of Noël Coward's "Mad About the Boy".  Naturally the "La Marseillaise" and the German National Anthem "Deutschlandlied" featured from time to time, for example where several French peasants sang La Marsellaise to celebrate the expected bombing of the Germans, but the singers flawlessly and without hesitation switch to Das Lied der Deutschen when the Germans come past. 
Helga also sometimes stripped to a rather raunchy version of the latter tune.  Captain Bertorelli could be seen singing "'O Sole Mio (It's Now or Never)"; and the British airmen in a prisoner of war camp could be seen singing "Hitler Has Only Got One Ball".  In 1986, Gorden Kaye and Vicki Michelle released a version of the hit song Je t'aime... moi non plus. The characters of Yvette and René could be heard talking and canoodling in a comic manner whilst the familiar musical Je t′aime melody played in the background. The song got to number fifty-seven in the UK Singles Chart.   Stage show   The show gave rise to a successful touring stage-show featuring most of the TV cast. This ran from 1986 to 1992 and included three London stage runs as well as international tours.  In January 1990 Gorden Kaye suffered serious head injuries in a car accident. As a result, his understudy, John Larson, played the part in a London Palladium production. Kaye still has a dent in his forehead from a piece of wood that smashed through the car window. He wanted to end the television show after his accident, but was convinced by Jeremy Lloyd to continue.  In Australia Gorden Kaye's part was played by Australian comedian/impressionist Max Gillies (later, Gorden Kaye repaid the favour when he took over Max Gillies' role in another play in Australia, when Max Gillies was unable to take part).  The show was last performed for a summer season at Bournemouth's Pier Theatre in 1996.  In 2007 Gorden Kaye, Sue Hodge and Guy Siner reprised their roles in a production of the stage show in Brisbane, Australia. They were joined by Steven Tandy as Colonel von Strohm and Jason Gann as Herr Flick.   A new touring show, based on the 1992 tour written by David Croft and Jeremy Lloyd, opened at the Gordon Craig Theatre in Stevenage, Hertfordshire on 29 August 2008 before going on a national tour in 2009.  Vicki Michelle is reprising her role as Yvette Carte-Blanche. 
The Cast also included Jeffrey Holland playing Rene Artois and his wife Judy Buxton playing Michelle. Other cast members included Robin Sebastian as Gruber, James Rossman as Herr Flick, Nell Jerram as Private Helga Geerhart and Claire Andreadis as Mimi Labonq.  The theatrical version is also frequently performed by amateur theatre companies in the UK and elsewhere.  Locations   Although the French town of Nouvion in which the series is set indeed exists, all filming was done in Norfolk.   From 1982-1987 all interior scenes were filmed in front of a live studio audience at the BBC Television Centre studios in London. From 1988 production moved to Elstree Studios in the BBC's Studio D. With hopes for a US syndication deal the BBC planned to make 26 new episodes of the sitcom and so bigger space was needed for the production. Even though the US syndication deal did not go ahead as planned, production remained at Elstree Studios for the remaining episodes of the show which ended in 1992. With more space to play with, the outside set of Café Rene became a semi-permanent structure in the former ATV Garage building.      DVD releases   Australian and New Zealand releases   In Australia, Roadshow Entertainment, under licence from the BBC began releasing the series on DVD in 2006, on a semi-annual basis. To date, all series have been released on DVD with only "The Return of 'Allo 'Allo!" TV special remaining.  UK releases   Universal Playback, under licence from the BBC, began releasing the series on DVD in 2002. In the UK six box sets with series 1–9 have been released, as well as a complete box set.  The original UK releases have episode titles superimposed over the openings of the episodes (series 1–4). The 2013 re-release of the complete series box set omits the majority, but not all of these superimposed titles. The American releases have no on-screen episode titles, reflecting the way that the shows were originally transmitted.  
North American releases   In January 2004, BBC Worldwide began releasing the show themselves onto DVD in North America, beginning with Series 1. The releases have continued on a somewhat irregular basis (approximately twice-yearly).  * Note: The Best of 'Allo 'Allo! is included as an extra on the series nine DVDs.
"""

# Run the sample document through preprocess() and show the raw text and the
# processed text, separated by a single blank line.
preprocessed_example = preprocess(example_doc)
print(example_doc, "", preprocessed_example, sep="\n")


melanie molitor mom tennis world 1


melani molitor mom tenni world 1


## Apply Preprocessing to Data

### Wikipedia Evidence

In [None]:
import multiprocessing as mp


# Load the Wikipedia evidence collection via load_pickle. Below it is used as
# a mapping: its .items() are enumerated and every value is handed to
# preprocess().
documents_dict = load_pickle(wikipedia_evidence_file)


def preprocess_with_key(key_value):
    """Preprocess one enumerated dictionary item and keep its key attached.

    Args:
        key_value: A pair ``(index, (key, value))`` as produced by
            ``enumerate(mapping.items())``.

    Returns:
        A ``(key, preprocess(value))`` tuple, so the caller can rebuild a
        mapping even when results come back out of order.
    """
    position, item = key_value
    doc_key, doc_text = item

    # Progress log: one timestamped line for every 1000th item.
    is_progress_tick = position % 1000 == 0
    if is_progress_tick:
        print(position, datetime.datetime.now())

    processed_text = preprocess(doc_text)
    return doc_key, processed_text


# Spread the documents over one worker process per CPU core. Results arrive in
# arbitrary order, which is fine because every result carries its own key.
with mp.Pool(processes=mp.cpu_count()) as worker_pool:
    keyed_results = worker_pool.imap_unordered(
        preprocess_with_key,
        enumerate(documents_dict.items()),
        chunksize=10,
    )
    # The result iterator is lazy, so it has to be drained while the pool is
    # still alive. Entries whose preprocessed text is falsy (e.g. empty) are
    # dropped.
    preprocessed_documents_dict = {
        doc_key: doc_text for doc_key, doc_text in keyed_results if doc_text
    }


0 2021-05-02 17:03:34.183092
1000 2021-05-02 17:04:18.072637
2000 2021-05-02 17:04:56.761132
3000 2021-05-02 17:05:38.677236
4000 2021-05-02 17:06:22.539016
5000 2021-05-02 17:07:01.915552
6000 2021-05-02 17:07:43.277680
7000 2021-05-02 17:08:18.152408
8000 2021-05-02 17:09:00.993457
9000 2021-05-02 17:09:40.233415
10000 2021-05-02 17:10:15.820553
11000 2021-05-02 17:10:53.613171
12000 2021-05-02 17:11:28.792987
13000 2021-05-02 17:12:05.520814
14000 2021-05-02 17:12:45.030672
15000 2021-05-02 17:13:23.296752
16000 2021-05-02 17:14:04.572937
17000 2021-05-02 17:14:42.455192
18000 2021-05-02 17:15:20.757213
19000 2021-05-02 17:16:02.251539
20000 2021-05-02 17:16:42.854163
21000 2021-05-02 17:17:24.589846
22000 2021-05-02 17:18:02.921418
23000 2021-05-02 17:18:40.697818
24000 2021-05-02 17:19:17.749195
25000 2021-05-02 17:19:54.301245
26000 2021-05-02 17:20:39.336642
27000 2021-05-02 17:21:17.487271
28000 2021-05-02 17:22:01.755831
29000 2021-05-02 17:22:39.108972
30000 2021-05-02 17:23:

In [None]:
# Persist the preprocessed documents (key -> preprocessed text; entries with a
# falsy preprocessing result were already filtered out) via save_as_pickle.
save_as_pickle(preprocessed_documents_dict, preprocessed_wikipedia_evidence_file)

### Wikipedia QA

In [None]:
import json

def preprocess_qa_file(input_file, output_file):
    """Add a preprocessed copy of every question in a QA JSON file and save it.

    The JSON document's top-level ``'Data'`` entry is read, each element gets
    a new ``'Question_preprocessed'`` key holding ``preprocess`` applied to
    its ``'Question'`` value, and the augmented data is handed to
    ``save_as_pickle``. The number of entries is printed as a progress hint.

    Args:
        input_file: Path of the JSON file to read.
        output_file: Destination passed on to ``save_as_pickle``.
    """
    # JSON is UTF-8 by specification; decode it explicitly instead of relying
    # on the platform's locale encoding, which can mis-decode or fail on
    # non-ASCII questions.
    with open(input_file, encoding="utf-8") as json_file:
        qa_data = json.load(json_file)['Data']

    print(len(qa_data))
    for question_dict in qa_data:
        question_dict['Question_preprocessed'] = preprocess(question_dict['Question'])

    save_as_pickle(qa_data, output_file)

In [None]:
# Preprocess every QA split; each pair is (raw JSON input, pickle output) and
# the splits are handled in the listed order.
qa_file_pairs = (
    (qa_wikipedia_verified_dev_filename, preprocessed_qa_wikipedia_verified_dev_filename),
    (qa_wikipedia_dev_filename, preprocessed_qa_wikipedia_dev_filename),
    (qa_wikipedia_test_without_answers_filename, preprocessed_qa_wikipedia_test_without_answers_filename),
    (qa_wikipedia_train_filename, preprocessed_qa_wikipedia_train_filename),
)
for raw_path, preprocessed_path in qa_file_pairs:
    preprocess_qa_file(raw_path, preprocessed_path)

318
7993
7701
61888
