In [1]:
import pandas as pd
import re
import string

# 2 Cleaning Text Data

In [2]:
# Load the output of step 1 and discard the 'Unnamed: 0' column
# (presumably the index that the previous step wrote into the CSV -- confirm).
data = pd.read_csv('data_after_step_1.csv').drop(columns=['Unnamed: 0'])

In [3]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,PDF_Page,Chapter,Chapter_Page,Text,Number of Words
0,26,1,1,1-1Chapter 1Aircraft StructuresA Brief History...,19
1,26,1,1,Advances in materials and processes \nused to ...,27
2,26,1,1,Combined with continuous powerplant \n\ndevelo...,13
3,26,1,1,The key discovery that ﬁliftﬂ could be created...,28
4,26,1,1,George Cayley \ndeveloped an efficient cambere...,21


In the cleaning process I removed all kinds of symbols, numbers, and line breaks, and transformed the text to lowercase.

In [4]:
# Patterns are compiled once instead of on every row.
# Punctuation set: '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
_DIGITS_RE = re.compile('[0-9]')
# Any run of spaces and/or whitespace control characters (tab, line breaks, ...).
_WHITESPACE_RE = re.compile('[ \t\n\r\f\v]+')


def clean_text(sentence):
    """Normalise one sentence of the 'Text' column.

    Steps, in order:
      1. convert to lowercase;
      2. delete ASCII punctuation (``string.punctuation``);
      3. delete the digits 0-9;
      4. delete the degree sign, which showed up several times when the
         data was analysed afterwards;
      5. replace every run of spaces / tabs / line breaks with ONE space.

    Parameters
    ----------
    sentence : str
        Raw sentence text.

    Returns
    -------
    str
        The cleaned sentence. Leading/trailing spaces are not stripped.
    """
    cleaned = sentence.lower()
    cleaned = _PUNCTUATION_RE.sub('', cleaned)
    cleaned = _DIGITS_RE.sub('', cleaned)
    # Strip the degree sign BEFORE collapsing whitespace: doing it afterwards
    # turns 'a ° b' into 'a  b' and leaves a double space behind.
    cleaned = cleaned.replace('°', '')
    # Line breaks and tabs become a space rather than nothing, so two words
    # separated only by a line break are not fused into one token.
    return _WHITESPACE_RE.sub(' ', cleaned)


# Single pass over the column instead of one .apply per cleaning step.
data['Text'] = data['Text'].apply(clean_text)

In [5]:
# Check the effect of the cleaning pass on the 'Text' column.
data.head(n=5)

Unnamed: 0,PDF_Page,Chapter,Chapter_Page,Text,Number of Words
0,26,1,1,chapter aircraft structuresa brief history of ...,19
1,26,1,1,advances in materials and processes used to co...,27
2,26,1,1,combined with continuous powerplant developmen...,13
3,26,1,1,the key discovery that ﬁliftﬂ could be created...,28
4,26,1,1,george cayley developed an efficient cambered ...,21


# 3 Removing Stopwords and Lemmatizing

### NLTK (the lemmatizer did not work well)
http://www.nltk.org/api/nltk.stem.html?highlight=lemmatizer

In [6]:
# Disabled NLTK pipeline, kept for reference only: every line in this cell is
# commented out, so nothing here executes.
# Step 1: tokenize each sentence by splitting on single spaces
#data['Raw Sentence tokenize'] = data['Text'].apply(lambda sentence: sentence.split(' '))
# Step 2: remove English stop words
#from nltk.corpus import stopwords
#stop_words = set(stopwords.words('english')) # a set gives average O(1) membership tests
#data['W/O stop words Sentence tokenize'] = data['Raw Sentence tokenize']\
#.apply(lambda sentence: [word for word in sentence if word not in stop_words])
# Step 3: lemmatize the remaining tokens --> http://www.nltk.org/api/nltk.stem.html?highlight=lemmatizer
#from nltk.stem import WordNetLemmatizer
#lemma = WordNetLemmatizer()
#data['W/O SW and Lemm Sentence tokenize'] = data['W/O stop words Sentence tokenize']\
#.apply(lambda sentence: [lemma.lemmatize(word) for word in sentence])

### Spacy
https://spacy.io/usage/linguistic-features

In [7]:
import spacy

# Name of the spaCy pipeline package to load; it must already be installed
# in the kernel's environment.
SPACY_MODEL = 'en_core_web_sm'
nlp = spacy.load(SPACY_MODEL)

In [8]:
# Remove Stop words and Lemmatize sentences
def _lemmas_without_stop_words(sentence):
    """Run `sentence` through the spaCy pipeline and return the lemma of
    every token that spaCy does not flag as a stop word."""
    return [tok.lemma_ for tok in nlp(sentence) if not tok.is_stop]


# One list of lemmas per row, stored in a new 'Tokenize' column.
data['Tokenize'] = data['Text'].apply(_lemmas_without_stop_words)

In [9]:
# Inspect the new 'Tokenize' column next to the cleaned text.
data.head(n=5)

Unnamed: 0,PDF_Page,Chapter,Chapter_Page,Text,Number of Words,Tokenize
0,26,1,1,chapter aircraft structuresa brief history of ...,19,"[chapter, aircraft, structuresa, brief, histor..."
1,26,1,1,advances in materials and processes used to co...,27,"[advance, material, process, construct, aircra..."
2,26,1,1,combined with continuous powerplant developmen...,13,"[combine, continuous, powerplant, development,..."
3,26,1,1,the key discovery that ﬁliftﬂ could be created...,28,"[key, discovery, ﬁliftﬂ, create, pass, air, cu..."
4,26,1,1,george cayley developed an efficient cambered ...,21,"[george, cayley, develop, efficient, camber, a..."


In [10]:
# Persist the result of step 2. The row index is written too (the pandas
# default, made explicit here), so it comes back as an extra unnamed column
# when the file is read again.
# NOTE(review): 'Tokenize' holds Python lists; CSV will store them as their
# string representation -- confirm the next step parses them back.
data.to_csv('data_after_step_2.csv', index=True)

I got better lemmatization results using spaCy, although it took longer to process the data.