In [5]:
from Web_scrapping import extract_internal_links
from Web_scrapping import extract_text_data_from_internal_links
import pandas as pd
from datetime import datetime

In [7]:
import nbformat
from IPython import get_ipython
from IPython.core.interactiveshell import InteractiveShell
import re

def run_notebook(notebook_path):
    """Execute every code cell of another notebook in the current IPython shell.

    Parameters
    ----------
    notebook_path : str or path-like
        Path to the ``.ipynb`` file whose code cells should be run.

    Notes
    -----
    Cells are passed to ``shell.run_cell`` one by one, in file order;
    non-code cells are skipped. Nothing is returned.
    """
    # Notebook files are UTF-8 JSON; relying on the platform default encoding
    # can fail or garble non-ASCII text (e.g. on Windows).
    with open(notebook_path, encoding='utf-8') as f:
        nb = nbformat.read(f, as_version=4)
    shell = InteractiveShell.instance()
    for cell in nb.cells:
        if cell.cell_type == 'code':
            shell.run_cell(cell.source)
# Execute the helper notebooks one after another, in this fixed order.
# Presumably they define the text-cleaning helpers used further down — confirm.
for helper_notebook in (
    'normalizing_amh_lett.ipynb',
    'removing_eng_words.ipynb',
    'removing_punctuation.ipynb',
):
    run_notebook(helper_notebook)

In [6]:

# URL of the homepage
homepage_url = 'https://ethiopiainsider.com/'

# Extract internal links
internal_links = extract_internal_links(homepage_url)

# Extract text data from the internal links
text_data_eth = extract_text_data_from_internal_links(internal_links)

# Current date for the 'Date' column
current_date = datetime.now().strftime('%Y-%m-%d')

# Collect one record per (link, article) pair, pairing items by position.
# Building the frame once from the full list avoids re-copying the whole
# frame on every iteration, which is what pd.concat inside a loop does.
records = [
    {
        'Date': current_date,
        'Website': homepage_url,
        'Article': article,
        'links': link,
    }
    for link, article in zip(internal_links, text_data_eth)
]

# Passing `columns` keeps the four expected columns even when no records exist.
df_eth = pd.DataFrame(records, columns=['Date', 'Website', 'Article', 'links'])

# Display the DataFrame
print(df_eth)

           Date                       Website  \
0    2024-05-25  https://ethiopiainsider.com/   
1    2024-05-25  https://ethiopiainsider.com/   
2    2024-05-25  https://ethiopiainsider.com/   
3    2024-05-25  https://ethiopiainsider.com/   
4    2024-05-25  https://ethiopiainsider.com/   
..          ...                           ...   
183  2024-05-25  https://ethiopiainsider.com/   
184  2024-05-25  https://ethiopiainsider.com/   
185  2024-05-25  https://ethiopiainsider.com/   
186  2024-05-25  https://ethiopiainsider.com/   
187  2024-05-25  https://ethiopiainsider.com/   

                                               Article  \
0                                                        
1                                                        
2                                                        
3                                                        
4                                                        
..                                                 ...   
183  

In [13]:
# Shorten the homepage URL to a site label by removing 'https://' and '.com'.
# regex=False makes both patterns literal: pandas < 2.0 treated the pattern as
# a regular expression by default, where the '.' in '.com' matches any character.
# NOTE(review): the trailing '/' is kept (e.g. 'ethiopiainsider/') — confirm
# that this is the intended label.
df_eth['Website'] = (
    df_eth['Website']
    .str.replace('https://', '', regex=False)
    .str.replace('.com', '', regex=False)
)

# Data Processing

### Dropping Duplicates

In [8]:
# Keep one row per distinct 'Article' text (drop_duplicates keeps the first
# occurrence by default); the original index labels are preserved.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not raise SettingWithCopyWarning on pandas versions without
# copy-on-write.
df_eth = df_eth.drop_duplicates(subset=['Article']).copy()

In [14]:
# Show the current state of df_eth as the cell's output.
df_eth

Unnamed: 0,Date,Website,Article,links
0,2024-05-25,ethiopiainsider/,,https://ethiopiainsider.com/
9,2024-05-25,ethiopiainsider/,". .. , , . ...",https://ethiopiainsider.com/careers/
14,2024-05-25,ethiopiainsider/,የኤርትራው ፕሬዝዳንት ኢሳያስ አፈወርቂ “ጠቅላይ እና ሁሉን ልቆጣጠር ባይ...,https://ethiopiainsider.com/2024/13111/
17,2024-05-25,ethiopiainsider/,የትግራይ ክልል ጊዜያዊ አስተዳደር በአላማጣ ከተማ አቅራቢያ በሚገኙት “ገ...,https://ethiopiainsider.com/2024/13100/
20,2024-05-25,ethiopiainsider/,ጠቅላይ ሚኒስትር አብይ አህመድ ስልጣን ከያዙ ወዲህ ለስድስተኛ ጊዜ የመከ...,https://ethiopiainsider.com/2024/13080/
23,2024-05-25,ethiopiainsider/,የመንግስት የስራ ኃላፊዎች ለህክምና ወደ ውጭ ሀገራት የሚያደርጉት ጉዞ ከ...,https://ethiopiainsider.com/2024/13070/
26,2024-05-25,ethiopiainsider/,ባለፈው ማክሰኞ ግንቦት 6፤ 2016 ለፓርላማ የቀረበው የፌደራል መንግስት...,https://ethiopiainsider.com/2024/13066/
30,2024-05-25,ethiopiainsider/,በተስፋለም ወልደየስ በአዲስ አበባ ፒያሳ አካባቢ ከተገነባው የአድዋ ድል ...,https://ethiopiainsider.com/2024/12740/
33,2024-05-25,ethiopiainsider/,በተስፋለም ወልደየስ በአዲስ አበባ ከተማ ፒያሳ አካባቢ በተለምዶ ዶሮ ማነ...,https://ethiopiainsider.com/2024/12658/
36,2024-05-25,ethiopiainsider/,በተስፋለም ወልደየስ ጊዜው ጠቅላይ ሚኒስትር አብይ አህመድ ወደ ስልጣን የ...,https://ethiopiainsider.com/2023/12050/


### Removing English words

In [15]:
# Run remove_english over each article text and store the results back in 'Article'.
articles_without_english = df_eth['Article'].apply(remove_english)
df_eth['Article'] = articles_without_english

In [16]:
# Show df_eth after remove_english has been applied to the 'Article' column.
df_eth

Unnamed: 0,Date,Website,Article,links
0,2024-05-25,ethiopiainsider/,,https://ethiopiainsider.com/
9,2024-05-25,ethiopiainsider/,". .. , , . ...",https://ethiopiainsider.com/careers/
14,2024-05-25,ethiopiainsider/,የኤርትራው ፕሬዝዳንት ኢሳያስ አፈወርቂ “ጠቅላይ እና ሁሉን ልቆጣጠር ባይ...,https://ethiopiainsider.com/2024/13111/
17,2024-05-25,ethiopiainsider/,የትግራይ ክልል ጊዜያዊ አስተዳደር በአላማጣ ከተማ አቅራቢያ በሚገኙት “ገ...,https://ethiopiainsider.com/2024/13100/
20,2024-05-25,ethiopiainsider/,ጠቅላይ ሚኒስትር አብይ አህመድ ስልጣን ከያዙ ወዲህ ለስድስተኛ ጊዜ የመከ...,https://ethiopiainsider.com/2024/13080/
23,2024-05-25,ethiopiainsider/,የመንግስት የስራ ኃላፊዎች ለህክምና ወደ ውጭ ሀገራት የሚያደርጉት ጉዞ ከ...,https://ethiopiainsider.com/2024/13070/
26,2024-05-25,ethiopiainsider/,ባለፈው ማክሰኞ ግንቦት 6፤ 2016 ለፓርላማ የቀረበው የፌደራል መንግስት...,https://ethiopiainsider.com/2024/13066/
30,2024-05-25,ethiopiainsider/,በተስፋለም ወልደየስ በአዲስ አበባ ፒያሳ አካባቢ ከተገነባው የአድዋ ድል ...,https://ethiopiainsider.com/2024/12740/
33,2024-05-25,ethiopiainsider/,በተስፋለም ወልደየስ በአዲስ አበባ ከተማ ፒያሳ አካባቢ በተለምዶ ዶሮ ማነ...,https://ethiopiainsider.com/2024/12658/
36,2024-05-25,ethiopiainsider/,በተስፋለም ወልደየስ ጊዜው ጠቅላይ ሚኒስትር አብይ አህመድ ወደ ስልጣን የ...,https://ethiopiainsider.com/2023/12050/


### Normalizing Letters

In [18]:
# Run normalize_char_level_missmatch over each article text and store the
# results back in 'Article'.
normalized_articles = df_eth['Article'].apply(normalize_char_level_missmatch)
df_eth['Article'] = normalized_articles

In [19]:
# Show df_eth after normalize_char_level_missmatch has been applied to 'Article'.
df_eth

Unnamed: 0,Date,Website,Article,links
0,2024-05-25,ethiopiainsider/,,https://ethiopiainsider.com/
9,2024-05-25,ethiopiainsider/,". .. , , . ...",https://ethiopiainsider.com/careers/
14,2024-05-25,ethiopiainsider/,የኤርትራው ፕሬዝዳንት ኢሳያስ አፈወርቂ “ጠቅላይ እና ሁሉን ልቆጣጠር ባይ...,https://ethiopiainsider.com/2024/13111/
17,2024-05-25,ethiopiainsider/,የትግራይ ክልል ጊዜያዊ አስተዳደር በአላማጣ ከተማ አቅራቢያ በሚገኙት “ገ...,https://ethiopiainsider.com/2024/13100/
20,2024-05-25,ethiopiainsider/,ጠቅላይ ሚኒስትር አብይ አህመድ ስልጣን ከያዙ ወዲህ ለስድስተኛ ጊዜ የመከ...,https://ethiopiainsider.com/2024/13080/
23,2024-05-25,ethiopiainsider/,የመንግስት የስራ ሀላፊዎች ለህክምና ወደ ውጭ ሀገራት የሚያደርጉት ጉዞ ከ...,https://ethiopiainsider.com/2024/13070/
26,2024-05-25,ethiopiainsider/,ባለፈው ማክሰኞ ግንቦት 6፤ 2016 ለፓርላማ የቀረበው የፌደራል መንግስት...,https://ethiopiainsider.com/2024/13066/
30,2024-05-25,ethiopiainsider/,በተስፋለም ወልደየስ በአዲስ አበባ ፒያሳ አካባቢ ከተገነባው የአድዋ ድል ...,https://ethiopiainsider.com/2024/12740/
33,2024-05-25,ethiopiainsider/,በተስፋለም ወልደየስ በአዲስ አበባ ከተማ ፒያሳ አካባቢ በተለምዶ ዶሮ ማነ...,https://ethiopiainsider.com/2024/12658/
36,2024-05-25,ethiopiainsider/,በተስፋለም ወልደየስ ጊዜው ጠቅላይ ሚኒስትር አብይ አህመድ ወደ ስልጣን የ...,https://ethiopiainsider.com/2023/12050/


### Removing Special Characters

In [24]:
# Apply remove_punc_and_special_chars to each article text and store the
# results back in 'Article'.
# NOTE(review): the recorded run of this cell failed with
# "TypeError: sub() missing 1 required positional argument: 'string'".
# No sub() call appears in this notebook, so the faulty call is presumably
# inside remove_punc_and_special_chars — check that helper's definition.
df_eth['Article'] = df_eth['Article'].apply(remove_punc_and_special_chars)

TypeError: sub() missing 1 required positional argument: 'string'