# **FINAL PROJECT for the AI Developer Internship at Technocolabs Softwares**

## Task 1: Data Preprocessing by Naaima BEN KADOUR  

# Preprocessing of the dataset question_bank.csv

### Import all the modules 

In [57]:
import numpy as np
import pandas as pd
import re
import nltk
import spacy
import string
from nltk.corpus import stopwords
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import wordnet


### Read the dataset using pandas

In [58]:
# Silence pandas' chained-assignment (SettingWithCopy) warning for the session.
pd.options.mode.chained_assignment = None

# Read at most the first 836 rows of the question bank.
full_df = pd.read_csv("question_bank.csv", nrows=836)
# Work on an explicit copy of the question column so that the columns added
# during preprocessing never write into (a slice of) full_df.
df = full_df[["ClarifyingQuestion"]].copy()
# String-cast version of the questions.
df["Text"] = df["ClarifyingQuestion"].astype(str)
full_df.head(10)

Unnamed: 0,qrel,ClarifyingQuestion
0,q_149,Which color blocks?
1,q_436,After you remove the one green block there are...
2,q_111,in any square west of the red blocks?
3,q_653,Should I destory east or west puyrple?
4,q_170,Where exactly am I placing the blue blocks?
5,q_479,What purple blocks? There are none only blue.
6,q_321,How do i place in the bottom if the purple are...
7,q_236,Where on the map do I place the yellow block?
8,q_293,Should the two purple blocks I'm setting be pl...
9,q_965,How many blue blocks am I adding to the first ...


### Start the preprocessing by lowercasing all the text in the ClarifyingQuestion column

In [59]:
# Lowercase the string-cast "Text" column rather than the raw column, so that a
# missing question cannot reach the later string-only steps as a float NaN.
df["lowercase"] = df["Text"].str.lower()
df.head()

Unnamed: 0,ClarifyingQuestion,Text,lowercase
0,Which color blocks?,Which color blocks?,which color blocks?
1,After you remove the one green block there are...,After you remove the one green block there are...,after you remove the one green block there are...
2,in any square west of the red blocks?,in any square west of the red blocks?,in any square west of the red blocks?
3,Should I destory east or west puyrple?,Should I destory east or west puyrple?,should i destory east or west puyrple?
4,Where exactly am I placing the blue blocks?,Where exactly am I placing the blue blocks?,where exactly am i placing the blue blocks?


### Eliminate the punctuation from the data

In [60]:
PUNCT_TO_REMOVE = string.punctuation
# Build the deletion table once instead of on every call.
_PUNCT_TABLE = str.maketrans('', '', PUNCT_TO_REMOVE)
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` removed.

    The argument is converted with ``str()`` first, the same way
    ``remove_stopwords`` treats its input, so a non-string cell does not
    raise ``AttributeError``.
    """
    return str(text).translate(_PUNCT_TABLE)

# Strip punctuation from each lowercased question.
df["pnctRemoved"] = df["lowercase"].apply(remove_punctuation)
df.head()

Unnamed: 0,ClarifyingQuestion,Text,lowercase,pnctRemoved
0,Which color blocks?,Which color blocks?,which color blocks?,which color blocks
1,After you remove the one green block there are...,After you remove the one green block there are...,after you remove the one green block there are...,after you remove the one green block there are...
2,in any square west of the red blocks?,in any square west of the red blocks?,in any square west of the red blocks?,in any square west of the red blocks
3,Should I destory east or west puyrple?,Should I destory east or west puyrple?,should i destory east or west puyrple?,should i destory east or west puyrple
4,Where exactly am I placing the blue blocks?,Where exactly am I placing the blue blocks?,where exactly am i placing the blue blocks?,where exactly am i placing the blue blocks


### Remove the stopwords from the data

In [61]:
# Download the NLTK stopwords corpus that stopwords.words() reads from.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [62]:
# Display the English stopword list used for filtering.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [63]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """Drop every whitespace-separated token of ``text`` found in STOPWORDS.

    The input is converted with ``str()``; the surviving tokens are joined
    back together with single spaces.
    """
    kept = []
    for token in str(text).split():
        if token in STOPWORDS:
            continue
        kept.append(token)
    return " ".join(kept)

# Filter the stopwords out of each punctuation-free question.
df["text_wo_stop"] = df["pnctRemoved"].apply(remove_stopwords)
df.head()

Unnamed: 0,ClarifyingQuestion,Text,lowercase,pnctRemoved,text_wo_stop
0,Which color blocks?,Which color blocks?,which color blocks?,which color blocks,color blocks
1,After you remove the one green block there are...,After you remove the one green block there are...,after you remove the one green block there are...,after you remove the one green block there are...,remove one green block green blocks green bloc...
2,in any square west of the red blocks?,in any square west of the red blocks?,in any square west of the red blocks?,in any square west of the red blocks,square west red blocks
3,Should I destory east or west puyrple?,Should I destory east or west puyrple?,should i destory east or west puyrple?,should i destory east or west puyrple,destory east west puyrple
4,Where exactly am I placing the blue blocks?,Where exactly am I placing the blue blocks?,where exactly am i placing the blue blocks?,where exactly am i placing the blue blocks,exactly placing blue blocks


### Lemmatization (reducing inflected words to their base form)

In [71]:
# WordNetLemmatizer looks words up in the WordNet corpus, so fetch it here too;
# without it a fresh environment raises LookupError on the first lemmatize call.
nltk.download('wordnet')
nltk.download('omw-1.4')
# Tagger model used by nltk.pos_tag.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [72]:
# Lemmatization

lemmatizer = WordNetLemmatizer()
# First letter of the tag returned by nltk.pos_tag -> WordNet part of speech.
wordnet_map = {
    "N": wordnet.NOUN,
    "V": wordnet.VERB,
    "J": wordnet.ADJ,
    "R": wordnet.ADV,
}
def lemmatize_words(text):
    """Lemmatize each whitespace-separated word of ``text``.

    Every word is POS-tagged with ``nltk.pos_tag``; the first letter of its
    tag selects the WordNet part of speech through ``wordnet_map`` (noun when
    the letter is not in the map). The lemmas are joined with single spaces.
    """
    lemmas = []
    for token, tag in nltk.pos_tag(text.split()):
        wn_pos = wordnet_map.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, wn_pos))
    return " ".join(lemmas)

# Lemmatize each stopword-free question.
df["text_lemmatized"] = df["text_wo_stop"].apply(lemmatize_words)
df.head()

Unnamed: 0,ClarifyingQuestion,Text,lowercase,pnctRemoved,text_wo_stop,text_stemmed,text_lemmatized
0,Which color blocks?,Which color blocks?,which color blocks?,which color blocks,color blocks,color block,color block
1,After you remove the one green block there are...,After you remove the one green block there are...,after you remove the one green block there are...,after you remove the one green block there are...,remove one green block green blocks green bloc...,remov one green block green block green block ...,remove one green block green block green block...
2,in any square west of the red blocks?,in any square west of the red blocks?,in any square west of the red blocks?,in any square west of the red blocks,square west red blocks,squar west red block,square west red block
3,Should I destory east or west puyrple?,Should I destory east or west puyrple?,should i destory east or west puyrple?,should i destory east or west puyrple,destory east west puyrple,destori east west puyrpl,destory east west puyrple
4,Where exactly am I placing the blue blocks?,Where exactly am I placing the blue blocks?,where exactly am i placing the blue blocks?,where exactly am i placing the blue blocks,exactly placing blue blocks,exactli place blue block,exactly place blue block


In [73]:
# No cell in this notebook creates "text_stemmed" any more, so on a fresh run the
# column is absent; errors="ignore" drops it when present and is a no-op otherwise.
df.drop(["text_stemmed"], axis=1, inplace=True, errors="ignore")

In [76]:
# Show the dataframe with all the preprocessing columns.
df

Unnamed: 0,ClarifyingQuestion,Text,lowercase,pnctRemoved,text_wo_stop,text_lemmatized
0,Which color blocks?,Which color blocks?,which color blocks?,which color blocks,color blocks,color block
1,After you remove the one green block there are...,After you remove the one green block there are...,after you remove the one green block there are...,after you remove the one green block there are...,remove one green block green blocks green bloc...,remove one green block green block green block...
2,in any square west of the red blocks?,in any square west of the red blocks?,in any square west of the red blocks?,in any square west of the red blocks,square west red blocks,square west red block
3,Should I destory east or west puyrple?,Should I destory east or west puyrple?,should i destory east or west puyrple?,should i destory east or west puyrple,destory east west puyrple,destory east west puyrple
4,Where exactly am I placing the blue blocks?,Where exactly am I placing the blue blocks?,where exactly am i placing the blue blocks?,where exactly am i placing the blue blocks,exactly placing blue blocks,exactly place blue block
...,...,...,...,...,...,...
830,Where should I build the blue blocks after des...,Where should I build the blue blocks after des...,where should i build the blue blocks after des...,where should i build the blue blocks after des...,build blue blocks destroying red blocks replac...,build blue block destroy red block replace red...
831,Would the row of 2 blocks be under the blue st...,Would the row of 2 blocks be under the blue st...,would the row of 2 blocks be under the blue st...,would the row of 2 blocks be under the blue st...,would row 2 blocks blue structure added,would row 2 block blue structure add
832,There are only orange blocks not yellow.,There are only orange blocks not yellow.,there are only orange blocks not yellow.,there are only orange blocks not yellow,orange blocks yellow,orange block yellow
833,Left and right as I face which direction?,Left and right as I face which direction?,left and right as i face which direction?,left and right as i face which direction,left right face direction,left right face direction


### Save the dataframe to a CSV file

In [78]:
# Write every column to cleaned_questionsBank.csv; index=False leaves out the row index.
df.to_csv(r'cleaned_questionsBank.csv', index=False)

# Preprocessing of the dataset clarifying_questions_train.csv


### Read the dataset using pandas

In [83]:
# Silence pandas' chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None

# Read at most the first 6827 rows of the training set.
C_questions = pd.read_csv("clarifying_questions_train.csv", nrows=6827)
C_questions

Unnamed: 0,GameId,ClarifyingQuestion,InitializedWorldPath,InputInstruction,IsInstructionClear,Partition,qrel,qbank
0,CQ-game-1,,initial_world_states/builder-data/34-c135/step-20,Destroy the two blocks over the row that is co...,Yes,train,,
1,CQ-game-10,,initial_world_states/builder-data/12-c139/step-22,Place one yellow block on top of each purple b...,Yes,train,,
2,CQ-game-10,,initial_world_states/builder-data/12-c139/step-22,Place one yellow block on top of each purple b...,Yes,train,,
3,CQ-game-1000,Which color blocks?,initial_world_states/builder-data/23-c135/step-2,Place four blocks to the east of the highest b...,No,train,q_149,"'q_696', 'q_203', 'q_516', 'q_677', 'q_769', '..."
4,CQ-game-1001,,initial_world_states/builder-data/4-c96/step-18,Locate the purple structure and the block on i...,Yes,train,,
...,...,...,...,...,...,...,...,...
6822,CQ-game-992,,initial_world_states/builder-data/32-c126/step-2,Place a yellow block immediately to the west o...,Yes,train,,
6823,CQ-game-993,,initial_world_states/builder-data/2-c116/step-16,Stack three green blocks on top of every green...,Yes,train,,
6824,CQ-game-994,,initial_world_states/builder-data/7-c73/step-4,Place a green block on top of the highest gree...,Yes,train,,
6825,CQ-game-995,Which way should the orange blocks be placed?,initial_world_states/builder-data/11-c97/step-4,Facing North place a column of three green blo...,No,train,q_241,"'q_683', 'q_918', 'q_144', 'q_710', 'q_273', '..."


In [95]:
# astype() returns a new Series rather than converting in place, so the result
# must be assigned back for the string cast to take effect.
C_questions["InputInstruction"] = C_questions["InputInstruction"].astype(str)
C_questions.head()

Unnamed: 0,GameId,ClarifyingQuestion,InitializedWorldPath,InputInstruction,IsInstructionClear,Partition,qrel,qbank
0,CQ-game-1,,initial_world_states/builder-data/34-c135/step-20,Destroy the two blocks over the row that is co...,Yes,train,,
1,CQ-game-10,,initial_world_states/builder-data/12-c139/step-22,Place one yellow block on top of each purple b...,Yes,train,,
2,CQ-game-10,,initial_world_states/builder-data/12-c139/step-22,Place one yellow block on top of each purple b...,Yes,train,,
3,CQ-game-1000,Which color blocks?,initial_world_states/builder-data/23-c135/step-2,Place four blocks to the east of the highest b...,No,train,q_149,"'q_696', 'q_203', 'q_516', 'q_677', 'q_769', '..."
4,CQ-game-1001,,initial_world_states/builder-data/4-c96/step-18,Locate the purple structure and the block on i...,Yes,train,,


### Lowercase

In [97]:
# Store the lowercased instructions in a new column, "II_lowercase".
C_questions["II_lowercase"] = C_questions["InputInstruction"].str.lower()
C_questions.head()

Unnamed: 0,GameId,ClarifyingQuestion,InitializedWorldPath,InputInstruction,IsInstructionClear,Partition,qrel,qbank,lowercase,II_lowercase
0,CQ-game-1,,initial_world_states/builder-data/34-c135/step-20,Destroy the two blocks over the row that is co...,Yes,train,,,destroy the two blocks over the row that is co...,destroy the two blocks over the row that is co...
1,CQ-game-10,,initial_world_states/builder-data/12-c139/step-22,Place one yellow block on top of each purple b...,Yes,train,,,place one yellow block on top of each purple b...,place one yellow block on top of each purple b...
2,CQ-game-10,,initial_world_states/builder-data/12-c139/step-22,Place one yellow block on top of each purple b...,Yes,train,,,place one yellow block on top of each purple b...,place one yellow block on top of each purple b...
3,CQ-game-1000,Which color blocks?,initial_world_states/builder-data/23-c135/step-2,Place four blocks to the east of the highest b...,No,train,q_149,"'q_696', 'q_203', 'q_516', 'q_677', 'q_769', '...",place four blocks to the east of the highest b...,place four blocks to the east of the highest b...
4,CQ-game-1001,,initial_world_states/builder-data/4-c96/step-18,Locate the purple structure and the block on i...,Yes,train,,,locate the purple structure and the block on i...,locate the purple structure and the block on i...


### Remove the punctuation:

In [98]:
PUNCT_TO_REMOVE = string.punctuation
# Build the deletion table once instead of on every call.
_PUNCT_TABLE = str.maketrans('', '', PUNCT_TO_REMOVE)
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` removed.

    The argument is converted with ``str()`` first, the same way
    ``remove_stopwords`` treats its input, so a non-string cell does not
    raise ``AttributeError``.
    """
    return str(text).translate(_PUNCT_TABLE)

# Read from "II_lowercase", the column the lowercase step creates on this
# dataframe; a plain "lowercase" column is never created here and raises
# KeyError when the notebook is run from a clean kernel.
C_questions["pnctRemoved"] = C_questions["II_lowercase"].apply(remove_punctuation)
C_questions.head()

Unnamed: 0,GameId,ClarifyingQuestion,InitializedWorldPath,InputInstruction,IsInstructionClear,Partition,qrel,qbank,lowercase,II_lowercase,pnctRemoved
0,CQ-game-1,,initial_world_states/builder-data/34-c135/step-20,Destroy the two blocks over the row that is co...,Yes,train,,,destroy the two blocks over the row that is co...,destroy the two blocks over the row that is co...,destroy the two blocks over the row that is co...
1,CQ-game-10,,initial_world_states/builder-data/12-c139/step-22,Place one yellow block on top of each purple b...,Yes,train,,,place one yellow block on top of each purple b...,place one yellow block on top of each purple b...,place one yellow block on top of each purple b...
2,CQ-game-10,,initial_world_states/builder-data/12-c139/step-22,Place one yellow block on top of each purple b...,Yes,train,,,place one yellow block on top of each purple b...,place one yellow block on top of each purple b...,place one yellow block on top of each purple b...
3,CQ-game-1000,Which color blocks?,initial_world_states/builder-data/23-c135/step-2,Place four blocks to the east of the highest b...,No,train,q_149,"'q_696', 'q_203', 'q_516', 'q_677', 'q_769', '...",place four blocks to the east of the highest b...,place four blocks to the east of the highest b...,place four blocks to the east of the highest b...
4,CQ-game-1001,,initial_world_states/builder-data/4-c96/step-18,Locate the purple structure and the block on i...,Yes,train,,,locate the purple structure and the block on i...,locate the purple structure and the block on i...,locate the purple structure and the block on i...


### Remove the stopwords:

In [99]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """Return ``text`` rebuilt from the tokens that are not in STOPWORDS.

    The input is converted with ``str()`` and split on whitespace; the kept
    tokens are joined with single spaces.
    """
    tokens = str(text).split()
    return " ".join(token for token in tokens if token not in STOPWORDS)

# Filter the stopwords out of each punctuation-free instruction.
C_questions["text_wo_stop"] = C_questions["pnctRemoved"].apply(remove_stopwords)
C_questions.head()

Unnamed: 0,GameId,ClarifyingQuestion,InitializedWorldPath,InputInstruction,IsInstructionClear,Partition,qrel,qbank,lowercase,II_lowercase,pnctRemoved,text_wo_stop
0,CQ-game-1,,initial_world_states/builder-data/34-c135/step-20,Destroy the two blocks over the row that is co...,Yes,train,,,destroy the two blocks over the row that is co...,destroy the two blocks over the row that is co...,destroy the two blocks over the row that is co...,destroy two blocks row connected column also d...
1,CQ-game-10,,initial_world_states/builder-data/12-c139/step-22,Place one yellow block on top of each purple b...,Yes,train,,,place one yellow block on top of each purple b...,place one yellow block on top of each purple b...,place one yellow block on top of each purple b...,place one yellow block top purple block top row
2,CQ-game-10,,initial_world_states/builder-data/12-c139/step-22,Place one yellow block on top of each purple b...,Yes,train,,,place one yellow block on top of each purple b...,place one yellow block on top of each purple b...,place one yellow block on top of each purple b...,place one yellow block top purple block top row
3,CQ-game-1000,Which color blocks?,initial_world_states/builder-data/23-c135/step-2,Place four blocks to the east of the highest b...,No,train,q_149,"'q_696', 'q_203', 'q_516', 'q_677', 'q_769', '...",place four blocks to the east of the highest b...,place four blocks to the east of the highest b...,place four blocks to the east of the highest b...,place four blocks east highest block horizontally
4,CQ-game-1001,,initial_world_states/builder-data/4-c96/step-18,Locate the purple structure and the block on i...,Yes,train,,,locate the purple structure and the block on i...,locate the purple structure and the block on i...,locate the purple structure and the block on i...,locate purple structure block eastern part pla...


### Lemmatization:

In [100]:
# Lemmatize each stopword-free instruction.
C_questions["text_lemmatized"] = C_questions["text_wo_stop"].apply(lemmatize_words)
C_questions.head()

Unnamed: 0,GameId,ClarifyingQuestion,InitializedWorldPath,InputInstruction,IsInstructionClear,Partition,qrel,qbank,lowercase,II_lowercase,pnctRemoved,text_wo_stop,text_lemmatized
0,CQ-game-1,,initial_world_states/builder-data/34-c135/step-20,Destroy the two blocks over the row that is co...,Yes,train,,,destroy the two blocks over the row that is co...,destroy the two blocks over the row that is co...,destroy the two blocks over the row that is co...,destroy two blocks row connected column also d...,destroy two block row connect column also dest...
1,CQ-game-10,,initial_world_states/builder-data/12-c139/step-22,Place one yellow block on top of each purple b...,Yes,train,,,place one yellow block on top of each purple b...,place one yellow block on top of each purple b...,place one yellow block on top of each purple b...,place one yellow block top purple block top row,place one yellow block top purple block top row
2,CQ-game-10,,initial_world_states/builder-data/12-c139/step-22,Place one yellow block on top of each purple b...,Yes,train,,,place one yellow block on top of each purple b...,place one yellow block on top of each purple b...,place one yellow block on top of each purple b...,place one yellow block top purple block top row,place one yellow block top purple block top row
3,CQ-game-1000,Which color blocks?,initial_world_states/builder-data/23-c135/step-2,Place four blocks to the east of the highest b...,No,train,q_149,"'q_696', 'q_203', 'q_516', 'q_677', 'q_769', '...",place four blocks to the east of the highest b...,place four blocks to the east of the highest b...,place four blocks to the east of the highest b...,place four blocks east highest block horizontally,place four block east high block horizontally
4,CQ-game-1001,,initial_world_states/builder-data/4-c96/step-18,Locate the purple structure and the block on i...,Yes,train,,,locate the purple structure and the block on i...,locate the purple structure and the block on i...,locate the purple structure and the block on i...,locate purple structure block eastern part pla...,locate purple structure block eastern part pla...


### Save and download the preprocessed data:

In [101]:
# Write every column to cleaned_clarifyingQuestionsTrain.csv; index=False leaves out the row index.
C_questions.to_csv(r'cleaned_clarifyingQuestionsTrain.csv', index=False)