In [68]:
%matplotlib inline

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

import warnings
warnings.filterwarnings('ignore')

np.random.seed(42) 

# Exploratory Data Analysis

In [56]:
data = pd.read_csv('data/Amharic News Dataset.csv')

data = shuffle(data)
data.head()

Unnamed: 0,headline,category,date,views,article,link
44810,በመዲናዋ የጋራ መኖሪያ ቤቶችን ግንባታ ለማጠናቀቅና ለማስጀመር የ20 ቢሊ...,ፖለቲካ,"September 14, 2017",Unknown,በአዲስ አበባ የተጀመሩ የጋራ መኖሪያ ቤቶችን ግንባታ ለማጠናቀቅና አዳዲስ...,https://waltainfo.com/am/29360/
18329,አሸንዳ፣ ሻደይ፣ አሸንድዬ፣ ሶለል፣ ማሪያና ዓይኒ ዋሪ በዓል በዩኔስኮ አ...,ሀገር አቀፍ ዜና,"Jan 8, 2020",387,አዲስ አበባ፣ ታህሳስ 29፣ 2012 (ኤፍ.ቢ.ሲ) የአሸንዳ፣ ሻደይ፣ አሸ...,https://www.fanabc.com/%e1%8b%a8%e1%8a%a0%e1%8...
17644,ግምታዊ ዋጋቸው ከ31 ነጥብ 6 ሚሊየን ብር በላይ የሆኑ የኮንትሮባንድ ዕ...,ሀገር አቀፍ ዜና,"Mar 4, 2020",634,አዲስ አበባ ፣ የካቲት 25 ፣ 2012 (ኤፍ ቢ ሲ) ባለፉት አምስት ቀና...,https://www.fanabc.com/%e1%8c%8d%e1%88%9d%e1%8...
32773,​ባምላክ ተሰማ የዓለም ዋንጫ ለመምራት እጩ ከሆኑ ዳኞች ውስጥ ተካተተ,ስፖርት,"November 19, 2017",Unknown,ሩሲያ በ2018 ለምታስተናግደው የዓለም ዋንጫ ፊፋ ጨዋታዎችን ሊመሩ የሚች...,https://soccerethiopia.net/football/31318
4139,“መንገድ ለሰው” መርሐ ግብር በየሳምንቱ እንደሚካሄድ ተገለፀ,ሀገር አቀፍ ዜና,"August 7, 2019",32,አዲስ አበባ፤- “መንገድ ለሰው” በሚል በአዲስ አበባ ከተማ ሲካሄድ የነ...,https://www.press.et/Ama/?p=15606


In [33]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51483 entries, 44810 to 15795
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   headline  51470 non-null  object
 1   category  51482 non-null  object
 2   date      51483 non-null  object
 3   views     51483 non-null  object
 4   article   51483 non-null  object
 5   link      51483 non-null  object
dtypes: object(6)
memory usage: 2.7+ MB


In [57]:
data = data.dropna(subset=['article'])

In [58]:
data['link'].value_counts()

https://amharic.voanews.com//a/amb-tibor-nagy-in-addis-ababa-on-sudan-06-14-19/4960197.html                                                                                                                                                                                                                                                 3
https://amharic.voanews.com//a/covid-africa/5535902.html                                                                                                                                                                                                                                                                                    3
https://amharic.voanews.com//a/looking-ahead-Ethiopian-renaissance-dam-weaspire-with-voa-alula-kebede-june-2020/5470822.html                                                                                                                                                                                                                

In [59]:
data.category.unique()

array(['ፖለቲካ', 'ሀገር አቀፍ ዜና', 'ስፖርት', 'ዓለም አቀፍ ዜና', 'ቢዝነስ', 'መዝናኛ', nan],
      dtype=object)

In [60]:
data['word_len'] = data['article'].str.split().str.len()
data.head()

Unnamed: 0,headline,category,date,views,article,link,word_len
44810,በመዲናዋ የጋራ መኖሪያ ቤቶችን ግንባታ ለማጠናቀቅና ለማስጀመር የ20 ቢሊ...,ፖለቲካ,"September 14, 2017",Unknown,በአዲስ አበባ የተጀመሩ የጋራ መኖሪያ ቤቶችን ግንባታ ለማጠናቀቅና አዳዲስ...,https://waltainfo.com/am/29360/,302
18329,አሸንዳ፣ ሻደይ፣ አሸንድዬ፣ ሶለል፣ ማሪያና ዓይኒ ዋሪ በዓል በዩኔስኮ አ...,ሀገር አቀፍ ዜና,"Jan 8, 2020",387,አዲስ አበባ፣ ታህሳስ 29፣ 2012 (ኤፍ.ቢ.ሲ) የአሸንዳ፣ ሻደይ፣ አሸ...,https://www.fanabc.com/%e1%8b%a8%e1%8a%a0%e1%8...,165
17644,ግምታዊ ዋጋቸው ከ31 ነጥብ 6 ሚሊየን ብር በላይ የሆኑ የኮንትሮባንድ ዕ...,ሀገር አቀፍ ዜና,"Mar 4, 2020",634,አዲስ አበባ ፣ የካቲት 25 ፣ 2012 (ኤፍ ቢ ሲ) ባለፉት አምስት ቀና...,https://www.fanabc.com/%e1%8c%8d%e1%88%9d%e1%8...,102
32773,​ባምላክ ተሰማ የዓለም ዋንጫ ለመምራት እጩ ከሆኑ ዳኞች ውስጥ ተካተተ,ስፖርት,"November 19, 2017",Unknown,ሩሲያ በ2018 ለምታስተናግደው የዓለም ዋንጫ ፊፋ ጨዋታዎችን ሊመሩ የሚች...,https://soccerethiopia.net/football/31318,143
4139,“መንገድ ለሰው” መርሐ ግብር በየሳምንቱ እንደሚካሄድ ተገለፀ,ሀገር አቀፍ ዜና,"August 7, 2019",32,አዲስ አበባ፤- “መንገድ ለሰው” በሚል በአዲስ አበባ ከተማ ሲካሄድ የነ...,https://www.press.et/Ama/?p=15606,193


In [38]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51483 entries, 44810 to 15795
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   headline  51470 non-null  object
 1   category  51482 non-null  object
 2   date      51483 non-null  object
 3   views     51483 non-null  object
 4   article   51483 non-null  object
 5   link      51483 non-null  object
 6   word_len  51483 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 3.1+ MB


In [61]:
# Total number of words

num_words = data['article'].apply(lambda x: len(x.split(' '))).sum()

wrd_mean = data.word_len.mean()

print(f"Total # of words: {num_words}, Word Mean: {wrd_mean}")

Total # of words: 12730586, Word Mean: 248.9586853912942


In [62]:
# Removing null and NaN values from all the rows

print(f"Before\n {data.isna().sum()}")
data.dropna(axis=0, inplace=True)
print(f"After\n {data.isna().sum()}")

Before
 headline    13
category     1
date         0
views        0
article      0
link         0
word_len     0
dtype: int64

After headline    0
category    0
date        0
views       0
article     0
link        0
word_len    0
dtype: int64


# character level normalization

Amharic has characters wich have the same sound that can be interchangably used.

for example letters 'ሃ','ኅ','ኃ','ሐ','ሓ','ኻ','ሀ' have the same sound so we change them to 'ሀ' 

In [64]:
import re
#method to normalize character level missmatch such as ጸሀይ and ፀሐይ
def normalize_char_level_missmatch(input_token):
    rep1=re.sub('[ሃኅኃሐሓኻ]','ሀ',input_token)
    rep2=re.sub('[ሑኁዅ]','ሁ',rep1)
    rep3=re.sub('[ኂሒኺ]','ሂ',rep2)
    rep4=re.sub('[ኌሔዄ]','ሄ',rep3)
    rep5=re.sub('[ሕኅ]','ህ',rep4)
    rep6=re.sub('[ኆሖኾ]','ሆ',rep5)
    rep7=re.sub('[ሠ]','ሰ',rep6)
    rep8=re.sub('[ሡ]','ሱ',rep7)
    rep9=re.sub('[ሢ]','ሲ',rep8)
    rep10=re.sub('[ሣ]','ሳ',rep9)
    rep11=re.sub('[ሤ]','ሴ',rep10)
    rep12=re.sub('[ሥ]','ስ',rep11)
    rep13=re.sub('[ሦ]','ሶ',rep12)
    rep14=re.sub('[ዓኣዐ]','አ',rep13)
    rep15=re.sub('[ዑ]','ኡ',rep14)
    rep16=re.sub('[ዒ]','ኢ',rep15)
    rep17=re.sub('[ዔ]','ኤ',rep16)
    rep18=re.sub('[ዕ]','እ',rep17)
    rep19=re.sub('[ዖ]','ኦ',rep18)
    rep20=re.sub('[ጸ]','ፀ',rep19)
    rep21=re.sub('[ጹ]','ፁ',rep20)
    rep22=re.sub('[ጺ]','ፂ',rep21)
    rep23=re.sub('[ጻ]','ፃ',rep22)
    rep24=re.sub('[ጼ]','ፄ',rep23)
    rep25=re.sub('[ጽ]','ፅ',rep24)
    rep26=re.sub('[ጾ]','ፆ',rep25)
    #Normalizing words with Labialized Amharic characters such as በልቱዋል or  በልቱአል to  በልቷል  
    rep27=re.sub('(ሉ[ዋአ])','ሏ',rep26)
    rep28=re.sub('(ሙ[ዋአ])','ሟ',rep27)
    rep29=re.sub('(ቱ[ዋአ])','ቷ',rep28)
    rep30=re.sub('(ሩ[ዋአ])','ሯ',rep29)
    rep31=re.sub('(ሱ[ዋአ])','ሷ',rep30)
    rep32=re.sub('(ሹ[ዋአ])','ሿ',rep31)
    rep33=re.sub('(ቁ[ዋአ])','ቋ',rep32)
    rep34=re.sub('(ቡ[ዋአ])','ቧ',rep33)
    rep35=re.sub('(ቹ[ዋአ])','ቿ',rep34)
    rep36=re.sub('(ሁ[ዋአ])','ኋ',rep35)
    rep37=re.sub('(ኑ[ዋአ])','ኗ',rep36)
    rep38=re.sub('(ኙ[ዋአ])','ኟ',rep37)
    rep39=re.sub('(ኩ[ዋአ])','ኳ',rep38)
    rep40=re.sub('(ዙ[ዋአ])','ዟ',rep39)
    rep41=re.sub('(ጉ[ዋአ])','ጓ',rep40)
    rep42=re.sub('(ደ[ዋአ])','ዷ',rep41)
    rep43=re.sub('(ጡ[ዋአ])','ጧ',rep42)
    rep44=re.sub('(ጩ[ዋአ])','ጯ',rep43)
    rep45=re.sub('(ጹ[ዋአ])','ጿ',rep44)
    rep46=re.sub('(ፉ[ዋአ])','ፏ',rep45)
    rep47=re.sub('[ቊ]','ቁ',rep46) #ቁ can be written as ቊ
    rep48=re.sub('[ኵ]','ኩ',rep47) #ኩ can be also written as ኵ  
    return rep48


In [65]:
data['article'] = data['article'].str.replace('[^\w\s]','')

In [66]:
data['article'] = data['article'].apply(lambda x: normalize_char_level_missmatch(x))

In [67]:
n_data = data[['article','category']]
n_data.head()

text,label = data['article'].values,data['category'].values

## Model Development

In [46]:
# Train Test Split

from sklearn.model_selection import train_test_split

X = text
y = label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

#### Logistics Regression with TfidfTransformer

In [69]:
from sklearn.linear_model import LogisticRegression

lr = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
lr.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(C=100000.0, n_jobs=1))])

In [71]:
from sklearn.metrics import classification_report, accuracy_score
y_pred = lr.predict(X_test)

print(f'accuracy {accuracy_score(y_pred, y_test)}')
print(classification_report(y_test, y_pred,target_names=data.category.unique()))

accuracy 0.8417524771711676
              precision    recall  f1-score   support

        ፖለቲካ       0.85      0.84      0.84      4090
  ሀገር አቀፍ ዜና       0.82      0.74      0.78       126
        ስፖርት       0.97      0.98      0.98      2102
  ዓለም አቀፍ ዜና       0.65      0.66      0.66       795
        ቢዝነስ       0.87      0.86      0.87      1308
        መዝናኛ       0.74      0.77      0.76      1873

    accuracy                           0.84     10294
   macro avg       0.82      0.81      0.81     10294
weighted avg       0.84      0.84      0.84     10294



#### SGDClassifier with TfidfTransformer

In [72]:
from sklearn.linear_model import SGDClassifier

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf',
                 SGDClassifier(alpha=0.001, max_iter=5, random_state=42,
                               tol=None))])

In [73]:
from sklearn.metrics import classification_report, accuracy_score
y_pred = sgd.predict(X_test)

print(f'accuracy {accuracy_score(y_pred, y_test)}')
print(classification_report(y_test, y_pred,target_names=data.category.unique()))

accuracy 0.7280940353604041
              precision    recall  f1-score   support

        ፖለቲካ       0.62      0.95      0.75      4090
  ሀገር አቀፍ ዜና       1.00      0.02      0.03       126
        ስፖርት       0.96      0.95      0.95      2102
  ዓለም አቀፍ ዜና       0.88      0.23      0.36       795
        ቢዝነስ       0.84      0.62      0.72      1308
        መዝናኛ       0.80      0.33      0.47      1873

    accuracy                           0.73     10294
   macro avg       0.85      0.52      0.55     10294
weighted avg       0.77      0.73      0.70     10294

