# Import Statements

In [1]:
from bs4 import BeautifulSoup
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
import itertools
import pandas as pd
import requests
import time
import warnings
warnings.filterwarnings("ignore")

# Data Source Selection

## API Query Outcome

In [2]:
# Load the Cameroon/Nigeria event export (2017-09-01 to 2018-09-01, per the file name).
# NOTE(review): absolute, machine-specific path — this cell only runs on the author's machine.
acled_csv_path = 'C:\\Users\\missn\\Documents\\Thinkful\\CSV Datasets\\Unit 7 Capstone Project\\2017-09-01-2018-09-01-Cameroon-Nigeria.csv'
acled = pd.read_csv(acled_csv_path)

In [3]:
# Tally how many rows cite each source, most frequent first.
acled['source'].value_counts()

Vanguard (Lagos)                                                           506
The Sun (Nigeria)                                                          191
The Nation (Nigeria)                                                       113
L'Oeil du Sahel                                                            109
Daily Independent (Nigeria)                                                106
The Guardian (Lagos)                                                        87
This Day (Lagos)                                                            87
Local Source Project                                                        85
Local sources                                                               75
Agence France Presse                                                        72
Nigerian Tribune                                                            56
Camer.be                                                                    52
Daily Trust (Abuja)                                 

**Selections:**
- https://www.vanguardngr.com | <span class="girk">Worked</span> 
- http://sunnewsonline.com/ | <span class="girk">Worked</span>
- http://thenationonlineng.net/ | <span class="girk">Worked</span>
- https://independent.ng/ | <span class="girk">Worked</span>
- https://www.afp.com/en | <span class="burk">Didn't Work</span>
- https://www.tribuneonlineng.com/ | <span class="girk">Worked</span>
- https://www.dailytrust.com.ng/ | <span class="girk">Worked</span>
- https://www.journalducameroun.com/ | <span class="burk">Didn't Work</span>
- https://www.premiumtimesng.com/ | <span class="burk">Didn't Work</span>
- https://www.bareta.news/ | <span class="girk">Worked</span>
- http://www.cameroonpostline.com/ | <span class="girk">Worked</span>
- http://saharareporters.com/ | <span class="burk">Didn't Work</span>


# Site Scraping, Soup Making, & Corpus Creation

## Site Scraping

### Lightweight Ruby Web Service

In [4]:
# Read the link list produced by the scraping web service and flatten it into
# a three-column frame: Search Term, Site, Link.
path_or_buf = 'C:\\Users\\missn\\Documents\\Thinkful\\Python Notebooks\\2 Bootcamp\\links_json.js'
links_raw = pd.read_json(path_or_buf)['links']
column_labels = {'searchTerm': 'Search Term', 'site': 'Site', 'link': 'Link', 'url': 'URL'}
output = (
    links_raw
    .apply(pd.Series)               # expand each record into its own columns
    .rename(columns=column_labels)
    .drop(columns=['URL'])
    [['Search Term', 'Site', 'Link']]
    .dropna()                       # discard records missing any of the three fields
)

In [5]:
# Display the flattened link table (Search Term, Site, Link) for inspection.
output

Unnamed: 0,Search Term,Site,Link
0,Ambazonia,Vanguard,https://www.vanguardngr.com/2018/02/vision-amb...
1,Ambazonia,Vanguard,https://www.vanguardngr.com/2018/02/vision-amb...
2,Ambazonia,Vanguard,https://www.vanguardngr.com/2018/02/declared-a...
3,Ambazonia,Vanguard,https://www.vanguardngr.com/2018/02/declared-a...
4,Ambazonia,Vanguard,https://www.vanguardngr.com/2018/04/france-amb...
5,Ambazonia,Vanguard,https://www.vanguardngr.com/2018/04/france-amb...
6,Ambazonia,Vanguard,https://www.vanguardngr.com/tag/ambazonia/
7,Ambazonia,Vanguard,https://www.vanguardngr.com/tag/ambazonia/
8,Ambazonia,Vanguard,https://www.vanguardngr.com/tag/ambazonia-repu...
9,Ambazonia,Vanguard,https://www.vanguardngr.com/tag/ambazonia-repu...


In [6]:
# Flag every link that repeats an earlier row (the first occurrence stays False).
output.loc[:, 'Link'].duplicated()

0      False
1       True
2      False
3       True
4      False
5       True
6      False
7       True
8      False
9       True
10     False
11      True
12     False
13      True
14     False
15      True
16     False
17      True
18     False
19      True
23     False
24      True
25     False
26      True
27     False
28      True
29     False
30      True
31     False
32      True
       ...  
278    False
279    False
280    False
281    False
282    False
283    False
284    False
285    False
286    False
287    False
288    False
289    False
290    False
291    False
292    False
293    False
294    False
295    False
296    False
297    False
298    False
299    False
300    False
301    False
302    False
303    False
304    False
305    False
306    False
307    False
Name: Link, Length: 290, dtype: bool

In [7]:
# Count, per column, the rows that are exact duplicates of an earlier row.
output[output.duplicated()].count()

Search Term    62
Site           62
Link           62
dtype: int64

In [8]:
# Keep only the first occurrence of each fully identical row.
output_clean = output.loc[~output.duplicated()]

In [9]:
# Remove links containing '#respond' and remember which index labels were dropped.
# NOTE(review): presumably these are comment-form anchors rather than articles — confirm.
is_respond_link = output_clean['Link'].str.contains('#respond')
indexes = output_clean.loc[is_respond_link].index.tolist()
output_clean = output_clean.drop(index=indexes)

In [10]:
# Show the index labels of the '#respond' rows that were removed from output_clean.
print(indexes)

[269, 271, 273, 275, 277, 279, 281, 283, 285, 287, 289, 291, 293, 295, 297, 299, 301, 303, 305, 307]


In [11]:
# Display the link table after duplicate rows and '#respond' links were removed.
output_clean

Unnamed: 0,Search Term,Site,Link
0,Ambazonia,Vanguard,https://www.vanguardngr.com/2018/02/vision-amb...
2,Ambazonia,Vanguard,https://www.vanguardngr.com/2018/02/declared-a...
4,Ambazonia,Vanguard,https://www.vanguardngr.com/2018/04/france-amb...
6,Ambazonia,Vanguard,https://www.vanguardngr.com/tag/ambazonia/
8,Ambazonia,Vanguard,https://www.vanguardngr.com/tag/ambazonia-repu...
10,Ambazonia,Vanguard,https://www.vanguardngr.com/2018/01/dss-preven...
12,Ambazonia,Vanguard,https://www.vanguardngr.com/2018/10/ngo-urges-...
14,Ambazonia,Vanguard,https://www.vanguardngr.com/2018/01/camerounia...
16,Ambazonia,Vanguard,https://www.vanguardngr.com/2018/02/cameroun-b...
18,Ambazonia,Vanguard,https://www.vanguardngr.com/2018/01/cameroon-a...


In [12]:
# List the distinct sites that still have links after cleaning.
pd.unique(output_clean['Site'])

array(['Vanguard', 'Sun News', 'The Nation', 'Tribune', 'Daily Trust',
       'Independent', 'Bareta News', 'Cameroon Post'], dtype=object)

## Soup Making

In [13]:
# Browser-like User-Agent sent with every scraping request.
# The original literal contained a stray "\ " (a leftover line continuation),
# which put a literal backslash into the header value and is an invalid escape
# sequence; the string is now assembled with implicit concatenation instead.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/66.0.3359.139 Safari/537.36'
    )
}

In [14]:
# Download every Vanguard link and keep the text of its "entry-content" div.
# .copy() makes this an independent frame, so adding the 'Document' column
# below is not an assignment on a slice of output_clean.
vanguard = output_clean.loc[output_clean['Site'] == 'Vanguard'].copy()
vanguard_documents_list = []

for link in vanguard['Link']:
    # timeout keeps one unresponsive server from hanging the whole run
    link_response = requests.get(link, headers=headers, timeout=30)
    time.sleep(5)  # wait 5 seconds before issuing the next request
    link_soup = BeautifulSoup(link_response.content, "html.parser")
    # Raises AttributeError if the page has no matching div.
    vanguard_documents_list.append(link_soup.find("div", class_="entry-content").get_text())

# One document per link, in the same row order as the frame.
vanguard['Document'] = vanguard_documents_list

In [15]:
# Download every Sun News link and keep the text of its "content-inner" div.
# .copy() makes this an independent frame, so adding the 'Document' column
# below is not an assignment on a slice of output_clean.
sun_news = output_clean.loc[output_clean['Site'] == 'Sun News'].copy()
sun_news_documents_list = []

for link in sun_news['Link']:
    # timeout keeps one unresponsive server from hanging the whole run
    link_response = requests.get(link, headers=headers, timeout=30)
    time.sleep(5)  # wait 5 seconds before issuing the next request
    link_soup = BeautifulSoup(link_response.content, "html.parser")
    # Raises AttributeError if the page has no matching div.
    sun_news_documents_list.append(link_soup.find("div", class_="content-inner").get_text())

# One document per link, in the same row order as the frame.
sun_news['Document'] = sun_news_documents_list

In [16]:
# Download every The Nation link and keep the text of its "entry-content" div.
# .copy() makes this an independent frame, so adding the 'Document' column
# below is not an assignment on a slice of output_clean.
the_nation = output_clean.loc[output_clean['Site'] == 'The Nation'].copy()
the_nation_documents_list = []

for link in the_nation['Link']:
    # timeout keeps one unresponsive server from hanging the whole run
    link_response = requests.get(link, headers=headers, timeout=30)
    time.sleep(5)  # wait 5 seconds before issuing the next request
    link_soup = BeautifulSoup(link_response.content, "html.parser")
    # Raises AttributeError if the page has no matching div.
    the_nation_documents_list.append(link_soup.find("div", class_="entry-content").get_text())

# One document per link, in the same row order as the frame.
the_nation['Document'] = the_nation_documents_list

In [17]:
# Download every Tribune link and keep the text of its "entry-content clearfix" div.
# .copy() makes this an independent frame, so adding the 'Document' column
# below is not an assignment on a slice of output_clean.
tribune = output_clean.loc[output_clean['Site'] == 'Tribune'].copy()
tribune_documents_list = []

for link in tribune['Link']:
    # timeout keeps one unresponsive server from hanging the whole run
    link_response = requests.get(link, headers=headers, timeout=30)
    time.sleep(5)  # wait 5 seconds before issuing the next request
    link_soup = BeautifulSoup(link_response.content, "html.parser")
    # Raises AttributeError if the page has no matching div.
    tribune_documents_list.append(link_soup.find("div", class_="entry-content clearfix").get_text())

# One document per link, in the same row order as the frame.
tribune['Document'] = tribune_documents_list

In [18]:
# Download every Daily Trust link and keep the text of its "entry-content" div.
# .copy() makes this an independent frame, so adding the 'Document' column
# below is not an assignment on a slice of output_clean.
daily_trust = output_clean.loc[output_clean['Site'] == 'Daily Trust'].copy()
daily_trust_documents_list = []

# Bug fix: the loop previously iterated vanguard['Link'], so Daily Trust rows
# were paired with Vanguard article text (or the column assignment failed when
# the two frames had different lengths).
# NOTE(review): the "entry-content" selector was only ever exercised against
# Vanguard pages because of that bug — confirm it matches Daily Trust's markup.
for link in daily_trust['Link']:
    # timeout keeps one unresponsive server from hanging the whole run
    link_response = requests.get(link, headers=headers, timeout=30)
    time.sleep(5)  # wait 5 seconds before issuing the next request
    link_soup = BeautifulSoup(link_response.content, "html.parser")
    # Raises AttributeError if the page has no matching div.
    daily_trust_documents_list.append(link_soup.find("div", class_="entry-content").get_text())

# One document per link, in the same row order as the frame.
daily_trust['Document'] = daily_trust_documents_list

In [19]:
# Download every Independent link and keep the text of its "td-post-content" div.
# .copy() makes this an independent frame, so adding the 'Document' column
# below is not an assignment on a slice of output_clean.
independent = output_clean.loc[output_clean['Site'] == 'Independent'].copy()
independent_documents_list = []

for link in independent['Link']:
    # timeout keeps one unresponsive server from hanging the whole run
    link_response = requests.get(link, headers=headers, timeout=30)
    time.sleep(5)  # wait 5 seconds before issuing the next request
    link_soup = BeautifulSoup(link_response.content, "html.parser")
    # Raises AttributeError if the page has no matching div.
    independent_documents_list.append(link_soup.find("div", class_="td-post-content").get_text())

# One document per link, in the same row order as the frame.
independent['Document'] = independent_documents_list

In [20]:
# Download every Bareta News link and keep the text of its
# "theiaPostSlider_preloadedSlide" div.
# .copy() makes this an independent frame, so adding the 'Document' column
# below is not an assignment on a slice of output_clean.
bareta_news = output_clean.loc[output_clean['Site'] == 'Bareta News'].copy()
bareta_news_documents_list = []

for link in bareta_news['Link']:
    # timeout keeps one unresponsive server from hanging the whole run
    link_response = requests.get(link, headers=headers, timeout=30)
    time.sleep(5)  # wait 5 seconds before issuing the next request
    link_soup = BeautifulSoup(link_response.content, "html.parser")
    # Raises AttributeError if the page has no matching div.
    bareta_news_documents_list.append(link_soup.find("div", class_="theiaPostSlider_preloadedSlide").get_text())

# One document per link, in the same row order as the frame.
bareta_news['Document'] = bareta_news_documents_list

In [21]:
# Download every Cameroon Post link and keep the text of its "entry-content" div.
# .copy() makes this an independent frame, so adding the 'Document' column
# below is not an assignment on a slice of output_clean.
cameroon_post = output_clean.loc[output_clean['Site'] == 'Cameroon Post'].copy()
cameroon_post_documents_list = []

for link in cameroon_post['Link']:
    # timeout keeps one unresponsive server from hanging the whole run
    link_response = requests.get(link, headers=headers, timeout=30)
    time.sleep(5)  # wait 5 seconds before issuing the next request
    link_soup = BeautifulSoup(link_response.content, "html.parser")
    # Raises AttributeError if the page has no matching div.
    cameroon_post_documents_list.append(link_soup.find("div", class_="entry-content").get_text())

# One document per link, in the same row order as the frame.
cameroon_post['Document'] = cameroon_post_documents_list

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHA

## Corpus Creation

In [35]:
# Stack the per-site 'Document' columns into one Series and attach it to
# output_clean; the assignment aligns on the index labels carried over from
# output_clean, so each text lands on the row of its link.
site_frames = [
    vanguard,
    sun_news,
    the_nation,
    tribune,
    daily_trust,
    independent,
    bareta_news,
    cameroon_post,
]
output_clean['Document'] = pd.concat([frame['Document'] for frame in site_frames])

In [38]:
# Replace every newline character in the scraped text with a single space.
# The pattern is the actual newline character matched literally (regex=False).
# The previous raw string r'\n' only matched newlines when pandas treated the
# pattern as a regular expression; with a literal-match default it would look
# for a backslash followed by 'n' and silently change nothing.
output_clean['Document'] = output_clean['Document'].str.replace('\n', ' ', regex=False)

In [39]:
# Preview the first five rows of the corpus, now including the 'Document' text.
output_clean.head(n=5)

Unnamed: 0,Search Term,Site,Link,Document
0,Ambazonia,Vanguard,https://www.vanguardngr.com/2018/02/vision-amb...,By Emmanuel Unah & Victoria Ojeme This Insigh...
2,Ambazonia,Vanguard,https://www.vanguardngr.com/2018/02/declared-a...,"By Emmanuel Unah, Victoria Ojeme & Dotun Ibiw..."
4,Ambazonia,Vanguard,https://www.vanguardngr.com/2018/04/france-amb...,"By Derric Yuh For the past three weeks, an en..."
6,Ambazonia,Vanguard,https://www.vanguardngr.com/tag/ambazonia/,"In Manyu in particular, schools cannot re-ope..."
8,Ambazonia,Vanguard,https://www.vanguardngr.com/tag/ambazonia-repu...,"In the English speaking part of Cameroun, if..."


In [40]:
# Preview the last five rows of the corpus to check the final site's documents.
output_clean.tail(n=5)

Unnamed: 0,Search Term,Site,Link,Document
298,Boko Haram,Cameroon Post,https://cameroonpostline.com/appointing-atanga...,"Appointing Atanga Nji, Nalova Lyonga Ministe..."
300,Boko Haram,Cameroon Post,https://cameroonpostline.com/released-scnc-fir...,Released SCNC Firebrand Warns gov’t Against ...
302,Boko Haram,Cameroon Post,https://cameroonpostline.com/as-parliament-ope...,As Parliament Opens: House Speaker Ignores M...
304,Boko Haram,Cameroon Post,https://cameroonpostline.com/fame-ndongo-stoke...,Fame Ndongo Stokes Flames Of Anglophone Cris...
306,Boko Haram,Cameroon Post,https://cameroonpostline.com/i-predicted-the-a...,I Predicted The Anglophone Problem In 2008 –...


In [41]:
# Persist the corpus (index included) to a CSV file in the working directory.
path_or_buf = 'Output_Clean.csv'
output_clean.to_csv(path_or_buf=path_or_buf)