# Exploratory Data Analysis

In [2]:
%matplotlib inline
import matplotlib
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
np.random.seed(47)

In [3]:
data = pd.read_csv('../data/Amharic News Dataset.csv')
data = shuffle(data)
data.head(3)

Unnamed: 0,headline,category,date,views,article,link
11501,የሕዝብ ተወካዮች ምክር ቤት የፊታችን ሰኞ ከእረፍት ይመለሳል፡፡,ሀገር አቀፍ ዜና,"October 1, 2019",64,ባሕር ዳር፡ መስከረም 20/2012 ዓ/ም (አብመድ) የሕዝብ ተወካዮች ምክ...,https://www.amharaweb.com/%e1%8b%a8%e1%88%95%e...
279,ባየር ሙኒክ በኢትዮጵያ ሥልጠና ይሰጣል,ስፖርት,"February 17, 2020",19,የጀርመኑ ኃያል ክለብ ባየር ሙኒክ በኢትዮጵያ ለክለቦች የተለያዩ ሥልጠና...,https://www.press.et/Ama/?p=27414
27675,የሴቶች ገፅ | ወርቃማዋ እንስት ሽታዬ ሲሳይ,ስፖርት,"June 18, 2020",Unknown,በቡድን ስኬት እና በግል ክብሮች ባንፀባረቀው የእግር ኳስ ህይወቷ ከትምህ...,https://soccerethiopia.net/football/58486


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51483 entries, 11501 to 38023
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   headline  51470 non-null  object
 1   category  51482 non-null  object
 2   date      51483 non-null  object
 3   views     51483 non-null  object
 4   article   51483 non-null  object
 5   link      51483 non-null  object
dtypes: object(6)
memory usage: 2.7+ MB


In [5]:
data.shape

(51483, 6)

We have 51,483 records for 6 columns which are **headline, category, date, views, article** and **link**

In [6]:
data = data.dropna(subset=['article'])

In [7]:
data['link'].value_counts()

https://amharic.voanews.com//a/covid-main/5443343.html                                                                                                         3
https://amharic.voanews.com//a/community-service-honored-on-adwa-124th-voa-prince-ermias-sahle-selassie-and-nebiyat-aklilu-with-voa-march-2020/5331348.html    3
https://amharic.voanews.com//a/london-marathon/5608398.html                                                                                                    3
https://amharic.voanews.com//a/sochi-russia-africa-ethiopia-10-24-19/5138261.html                                                                              3
https://amharic.voanews.com//a/5461661.html                                                                                                                    3
                                                                                                                                                              ..
https://www.press.et/Ama/?p=27942 

In [8]:
data.category.unique()

array(['ሀገር አቀፍ ዜና', 'ስፖርት', 'ፖለቲካ', 'ቢዝነስ', 'ዓለም አቀፍ ዜና', 'መዝናኛ', nan],
      dtype=object)

Equivalent to [National News, Politics, Sports, International News, Business, Entertainment, nan]

In [9]:
data['word_len'] = data['article'].str.split().str.len()
data.head(3)

Unnamed: 0,headline,category,date,views,article,link,word_len
11501,የሕዝብ ተወካዮች ምክር ቤት የፊታችን ሰኞ ከእረፍት ይመለሳል፡፡,ሀገር አቀፍ ዜና,"October 1, 2019",64,ባሕር ዳር፡ መስከረም 20/2012 ዓ/ም (አብመድ) የሕዝብ ተወካዮች ምክ...,https://www.amharaweb.com/%e1%8b%a8%e1%88%95%e...,214
279,ባየር ሙኒክ በኢትዮጵያ ሥልጠና ይሰጣል,ስፖርት,"February 17, 2020",19,የጀርመኑ ኃያል ክለብ ባየር ሙኒክ በኢትዮጵያ ለክለቦች የተለያዩ ሥልጠና...,https://www.press.et/Ama/?p=27414,206
27675,የሴቶች ገፅ | ወርቃማዋ እንስት ሽታዬ ሲሳይ,ስፖርት,"June 18, 2020",Unknown,በቡድን ስኬት እና በግል ክብሮች ባንፀባረቀው የእግር ኳስ ህይወቷ ከትምህ...,https://soccerethiopia.net/football/58486,1054


In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51483 entries, 11501 to 38023
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   headline  51470 non-null  object
 1   category  51482 non-null  object
 2   date      51483 non-null  object
 3   views     51483 non-null  object
 4   article   51483 non-null  object
 5   link      51483 non-null  object
 6   word_len  51483 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 3.1+ MB


In [11]:
data.word_len.mean()

248.9586853912942

# character level normalization

Amharic has characters wich have the same sound that can be interchangably used.

for example letters 'ሃ','ኅ','ኃ','ሐ','ሓ','ኻ','ሀ' have the same sound so we change them to 'ሀ' 

In [12]:
import re
#method to normalize character level missmatch such as ጸሀይ and ፀሐይ
def normalize_char_level_missmatch(input_token):
    rep1=re.sub('[ሃኅኃሐሓኻ]','ሀ',input_token)
    rep2=re.sub('[ሑኁዅ]','ሁ',rep1)
    rep3=re.sub('[ኂሒኺ]','ሂ',rep2)
    rep4=re.sub('[ኌሔዄ]','ሄ',rep3)
    rep5=re.sub('[ሕኅ]','ህ',rep4)
    rep6=re.sub('[ኆሖኾ]','ሆ',rep5)
    rep7=re.sub('[ሠ]','ሰ',rep6)
    rep8=re.sub('[ሡ]','ሱ',rep7)
    rep9=re.sub('[ሢ]','ሲ',rep8)
    rep10=re.sub('[ሣ]','ሳ',rep9)
    rep11=re.sub('[ሤ]','ሴ',rep10)
    rep12=re.sub('[ሥ]','ስ',rep11)
    rep13=re.sub('[ሦ]','ሶ',rep12)
    rep14=re.sub('[ዓኣዐ]','አ',rep13)
    rep15=re.sub('[ዑ]','ኡ',rep14)
    rep16=re.sub('[ዒ]','ኢ',rep15)
    rep17=re.sub('[ዔ]','ኤ',rep16)
    rep18=re.sub('[ዕ]','እ',rep17)
    rep19=re.sub('[ዖ]','ኦ',rep18)
    rep20=re.sub('[ጸ]','ፀ',rep19)
    rep21=re.sub('[ጹ]','ፁ',rep20)
    rep22=re.sub('[ጺ]','ፂ',rep21)
    rep23=re.sub('[ጻ]','ፃ',rep22)
    rep24=re.sub('[ጼ]','ፄ',rep23)
    rep25=re.sub('[ጽ]','ፅ',rep24)
    rep26=re.sub('[ጾ]','ፆ',rep25)
    #Normalizing words with Labialized Amharic characters such as በልቱዋል or  በልቱአል to  በልቷል  
    rep27=re.sub('(ሉ[ዋአ])','ሏ',rep26)
    rep28=re.sub('(ሙ[ዋአ])','ሟ',rep27)
    rep29=re.sub('(ቱ[ዋአ])','ቷ',rep28)
    rep30=re.sub('(ሩ[ዋአ])','ሯ',rep29)
    rep31=re.sub('(ሱ[ዋአ])','ሷ',rep30)
    rep32=re.sub('(ሹ[ዋአ])','ሿ',rep31)
    rep33=re.sub('(ቁ[ዋአ])','ቋ',rep32)
    rep34=re.sub('(ቡ[ዋአ])','ቧ',rep33)
    rep35=re.sub('(ቹ[ዋአ])','ቿ',rep34)
    rep36=re.sub('(ሁ[ዋአ])','ኋ',rep35)
    rep37=re.sub('(ኑ[ዋአ])','ኗ',rep36)
    rep38=re.sub('(ኙ[ዋአ])','ኟ',rep37)
    rep39=re.sub('(ኩ[ዋአ])','ኳ',rep38)
    rep40=re.sub('(ዙ[ዋአ])','ዟ',rep39)
    rep41=re.sub('(ጉ[ዋአ])','ጓ',rep40)
    rep42=re.sub('(ደ[ዋአ])','ዷ',rep41)
    rep43=re.sub('(ጡ[ዋአ])','ጧ',rep42)
    rep44=re.sub('(ጩ[ዋአ])','ጯ',rep43)
    rep45=re.sub('(ጹ[ዋአ])','ጿ',rep44)
    rep46=re.sub('(ፉ[ዋአ])','ፏ',rep45)
    rep47=re.sub('[ቊ]','ቁ',rep46) #ቁ can be written as ቊ
    rep48=re.sub('[ኵ]','ኩ',rep47) #ኩ can be also written as ኵ  
    return rep48


In [13]:
data['article'] = data['article'].str.replace('[^\w\s]','')

  data['article'] = data['article'].str.replace('[^\w\s]','')


In [14]:
data['article'] = data['article'].apply(lambda x: normalize_char_level_missmatch(x))

In [15]:
n_data = data[['article','category']]
n_data.head()

text,label = data['article'].values,data['category'].values

In [16]:
# n_data.head(5).to_csv('table.csv')

# Naive Bays - CountVectorizer

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
matrix = CountVectorizer(analyzer='word',max_features=1000,ngram_range=(1, 3))
X = matrix.fit_transform(text).toarray()
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [18]:
unique_label = list(set(label))
Y= []
for i in label:
    Y.append(unique_label.index(i))

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y,test_size=0.2)

In [20]:
# Naive Bayes 
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predict Class
y_pred = classifier.predict(X_test)

# Accuracy 
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

accuracy

0.6190152471593668

In [21]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=['ስፖርት', 'መዝናኛ', 'ሀገር አቀፍ ዜና', 'ቢዝነስ', 'ዓለም አቀፍ ዜና', 'ፖለቲካ', 'nan']))

              precision    recall  f1-score   support

        ስፖርት       0.00      0.00      0.00         0
        መዝናኛ       0.96      0.92      0.94      2089
  ሀገር አቀፍ ዜና       0.39      0.72      0.51       805
        ቢዝነስ       0.59      0.57      0.58      1852
  ዓለም አቀፍ ዜና       0.43      0.89      0.58      1264
        ፖለቲካ       0.27      0.82      0.41       115
         nan       0.83      0.38      0.52      4172

    accuracy                           0.62     10297
   macro avg       0.49      0.61      0.50     10297
weighted avg       0.72      0.62      0.62     10297



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Naive Bays - tf -df

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
matrix = TfidfVectorizer(analyzer='word',max_features=1000,ngram_range=(1, 3))
X = matrix.fit_transform(text).toarray()
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y,test_size=0.2)

In [24]:
# Naive Bayes 
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predict Class
y_pred = classifier.predict(X_test)

# Accuracy 
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

accuracy

0.6166844712052054

In [25]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=['ስፖርት', 'መዝናኛ', 'ሀገር አቀፍ ዜና', 'ቢዝነስ', 'ዓለም አቀፍ ዜና', 'ፖለቲካ', 'nan']))

              precision    recall  f1-score   support

        ስፖርት       0.00      0.00      0.00         0
        መዝናኛ       0.98      0.93      0.96      2074
  ሀገር አቀፍ ዜና       0.32      0.83      0.46       813
        ቢዝነስ       0.50      0.67      0.57      1847
  ዓለም አቀፍ ዜና       0.60      0.75      0.67      1265
        ፖለቲካ       0.25      0.75      0.37       144
         nan       0.89      0.35      0.50      4154

    accuracy                           0.62     10297
   macro avg       0.50      0.61      0.50     10297
weighted avg       0.75      0.62      0.62     10297



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
