## Wikipedia Crawl example

Author: J. Hickman

- This code crawls through wikipedia to get a bunch of text data
- The code lets the user specify search category topics.
  - The more different the topics are, the easier the classification will be.
  - For example, i used (pizza, metallurgy, basketball)
- It then searches wikipedia for articles related to these topics
- Loops over the wikipedia pages and gets the text from the wikipedia pages
- Breaks the text into chunks (based on a user input specifying the number of sentences per chunk)
- Each chunk is cleaned and tagged with a "label" (classification) and a numeric "sentiment score" (regression)
- These cleaned chunks form a corpus of strings with associated tags

```
python -m pip install wikipedia_sections
```

### Import

In [2]:
# conda install -c conda-forge wikipedia
# conda install -c conda-forge wordcloud
# pip install wikipedia_sections

import wikipedia
import nltk
import string 
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.sentiment import SentimentIntensityAnalyzer
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np




In [3]:
# RUN THE FOLLOWING IF YOU HAVEN'T DOWNLOADED THESE BEFORE
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/modeedna/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /Users/modeedna/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/modeedna/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/modeedna/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/modeedna/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Set user parameters 

In [4]:
# PARAMETERS 
label_list=['FIFA World Cup','FIFA Champion','FIFA Ranking']
max_num_pages=25
sentence_per_chunk=5
min_sentence_length=20

# GET STOPWORDS
# from nltk.corpus import stopwords
stop_words=nltk.corpus.stopwords.words('english')

# INITALIZE STEMMER+LEMITZIZER+SIA
sia = SentimentIntensityAnalyzer()
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

### Define text cleaning function

In [5]:
def clean_string(text):
	# #FILTER OUT UNWANTED CHAR
	new_text=""
	# keep=string.printable
	keep=" abcdefghijklmnopqrstuvwxyz0123456789"
	for character in text:
		if character.lower() in keep:
			new_text+=character.lower()
		else: 
			new_text+=" "
	text=new_text
	# print(text)

	# #FILTER OUT UNWANTED WORDS
	new_text=""
	for word in nltk.tokenize.word_tokenize(text):
		if word not in nltk.corpus.stopwords.words('english'):
			#lemmatize 
			tmp=lemmatizer.lemmatize(word)
			# tmp=stemmer.stem(tmp)

			# update word if there is a change
			# if(tmp!=word): print(tmp,word)
			
			word=tmp
			if len(word)>1:
				if word in [".",",","!","?",":",";"]:
					#remove the last space
					new_text=new_text[0:-1]+word+" "
				else: #add a space
					new_text+=word.lower()+" "
	text=new_text.strip()
	return text

#clean_string('the word "pizza" first appeared in a Latin text from the town of Gaeta, then still part of the Byzantine Empire, in 997 AD; the text states that a tenant of certain property is to give the bishop of Gaeta duodecim pizze ("twelve pizzas") every Christmas Day, and another twelve every Easter Sunday.Suggested etymologies include:')


### Preform a wikipedia crawl


In [6]:
#INITIALIZE 
corpus=[]  # list of strings (input variables X)
targets=[] # list of targets (labels or response variables Y)

#--------------------------
# LOOP OVER TOPICS 
#--------------------------
for label in label_list:

	#SEARCH FOR RELEVANT PAGES 
	titles=wikipedia.search(label,results=max_num_pages)
	print("Pages for label =",label,":",titles)

	#LOOP OVER WIKI-PAGES
	for title in titles:
		try:
			print("	",title)
			wiki_page = wikipedia.page(title, auto_suggest=True)

			# LOOP OVER SECTIONS IN ARTICLE AND GET PAGE TEXT
			for section in wiki_page.sections:
				text=wiki_page.section(section); #print(text)

				#BREAK IN TO SENTANCES 
				sentences=nltk.tokenize.sent_tokenize(text)
				counter=0
				text_chunk=''

				#LOOP OVER SENTENCES 
				for sentence in sentences:
					if len(sentence)>min_sentence_length:
						if(counter%sentence_per_chunk==0 and counter!=0):
							# PROCESS COMPLETED CHUNK 
							
							# CLEAN STRING
							text_chunk=clean_string(text_chunk)

							# REMOVE LABEL IF IN STRING (MAKES IT TOO EASY)
							text_chunk=text_chunk.replace(label,"")
							
							# REMOVE ANY DOUBLE SPACES
							text_chunk=' '.join(text_chunk.split()).strip()

							#UPDATE CORPUS 
							corpus.append(text_chunk)

							#UPDATE TARGETS
							score=sia.polarity_scores(text_chunk)
							target=[label,score['compound']]
							targets.append(target)

							#print("TEXT\n",text_chunk,target)

							# RESET CHUNK FOR NEXT ITERATION 
							text_chunk=sentence
						else:
							text_chunk+=sentence
						#print("--------\n", sentence)
						counter+=1

		except:
			print("WARNING: SOMETHING WENT WRONG:", title);  


Pages for label = FIFA World Cup : ['FIFA World Cup', '2018 FIFA World Cup', '2022 FIFA World Cup', '2014 FIFA World Cup', '2010 FIFA World Cup', '2026 FIFA World Cup', '2030 FIFA World Cup', '2006 FIFA World Cup', '2002 FIFA World Cup', 'List of FIFA World Cup finals', '1998 FIFA World Cup', "2023 FIFA Women's World Cup", 'FIFA Club World Cup', 'FIFA World Cup awards', 'FIFA World Cup hosts', '1994 FIFA World Cup', "FIFA Women's World Cup", 'FIFA U-20 World Cup', '2034 FIFA World Cup', '2026 FIFA World Cup qualification', '1990 FIFA World Cup', '1930 FIFA World Cup', "FIFA Men's World Ranking", '1986 FIFA World Cup', 'List of FIFA World Cup anthems and songs']
	 FIFA World Cup
	 2018 FIFA World Cup
	 2022 FIFA World Cup
	 2014 FIFA World Cup
	 2010 FIFA World Cup
	 2026 FIFA World Cup
	 2030 FIFA World Cup
	 2006 FIFA World Cup
	 2002 FIFA World Cup
	 List of FIFA World Cup finals
	 1998 FIFA World Cup
	 2023 FIFA Women's World Cup
	 FIFA Club World Cup
	 FIFA World Cup awards
	 FIFA 

### Save results

In [7]:
#SANITY CHECKS AND PRINT TO FILE 
print("number of text chunks = ",len(corpus))
print("number of targets = ",len(targets))

tmp=[]
for i in range(0,len(corpus)):
    tmp.append([corpus[i],targets[i][0],targets[i][1]])
df=pd.DataFrame(tmp)
df=df.rename(columns={0: "text", 1: "label", 2: "sentiment"})
print(df)
df.to_csv('wiki-crawl-results.csv',index=False)

number of text chunks =  727
number of targets =  727
                                                  text           label  \
0    world first international football match chall...  FIFA World Cup   
1    great britain represented england national ama...  FIFA World Cup   
2    west auckland tournament returned 1911 success...  FIFA World Cup   
3    due success olympic football tournament fifa p...  FIFA World Cup   
4    total 13 nation took part seven south america ...  FIFA World Cup   
..                                                 ...             ...   
722  2022 world cup qualification wale drawn group ...    FIFA Ranking   
723  primary kit long red crest football associatio...    FIFA Ranking   
724  2000 2009 wale played home match millennium st...    FIFA Ranking   
725  ryan giggs scored wale goal match becoming fir...    FIFA Ranking   
726  2014 world cup qualifying campaign saw four ho...    FIFA Ranking   

     sentiment  
0       0.8360  
1       0.8360  
2     

### Extra Code

In [1]:
# #RELOAD FILE AND PRETEND THAT IS OUR STARTING POINT 
# df=pd.read_csv('wiki-crawl-results.csv')  
# #print(df)

# #CONVERT FROM STRING LABELS TO INTEGERS 
# labels=[]; #y1=[]; y2=[]
# y1=[]
# for label in df["label"]:
#     if label not in labels:
#         labels.append(label)
#         print("index =",len(labels)-1,": label =",label)
#     for i in range(0,len(labels)):
#         if(label==labels[i]):
#             y1.append(i)
# y1=np.array(y1)

# # CONVERT DF TO LIST OF STRINGS 
# corpus=df["text"].to_list()
# y2=df["sentiment"].to_numpy()

# print("number of text chunks = ",len(corpus))
# print(len(y1))
# print(corpus[0:3])

# # INITIALIZE COUNT VECTORIZER
# vectorizer=CountVectorizer()   

# # RUN COUNT VECTORIZER ON OUR COURPUS 
# Xs  =  vectorizer.fit_transform(corpus)   
# X=np.array(Xs.todense())

# #CONVERT TO ONE-HOT VECTORS
# maxs=np.max(X,axis=0)
# X=np.ceil(X/maxs)

# # DOUBLE CHECK 
# print(X.shape,y1.shape,y2.shape)