## 2nd Exercise 

In this exercise, you will perform data preprocessing on the news articles dataset provided in the data folder.

Use the cleaned data you produced in the first exercise.

In [15]:
# Import packages
import numpy as np
import pandas as pd 

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import warnings

# Suppress every warning category for the rest of the session.
warnings.simplefilter("ignore")

# Seed NumPy's global random generator.
np.random.seed(seed=123)

In [16]:
# Read the cleaned news-articles CSV (path is relative to this notebook)
# into a pandas DataFrame.
clean_data_path = '../Data/tz_national_news_articles_clean_data.csv'
data = pd.read_csv(clean_data_path)


In [17]:
# Preview the first five rows of the loaded DataFrame.
data.head(n=5)

Unnamed: 0,content,word_count
0,kuongeza kasi kuchapisha vitambulisho taifa ma...,307
1,kampuni zimejitokeza kununua ufuta wilaya ruan...,253
2,takwimu hospitali taifa muhimbili mnh zinaones...,191
3,idadi vifo waendesha bodaboda nchini vitokanav...,111
4,serikali imeyafuta mashamba sita yenye ukubwa ...,198


In [18]:
# Take the first five rows with pandas' head method and display them.
sample_rows = data.head()
sample_rows


Unnamed: 0,content,word_count
0,kuongeza kasi kuchapisha vitambulisho taifa ma...,307
1,kampuni zimejitokeza kununua ufuta wilaya ruan...,253
2,takwimu hospitali taifa muhimbili mnh zinaones...,191
3,idadi vifo waendesha bodaboda nchini vitokanav...,111
4,serikali imeyafuta mashamba sita yenye ukubwa ...,198


### Apply the CountVectorizer method to the News Articles Dataset

In [26]:
# Instantiate scikit-learn's CountVectorizer with its default settings.
count_vectorizer = CountVectorizer()

# Cast the article text to a NumPy unicode array.
# NOTE(review): astype('U') turns any missing value into the literal text
# 'nan' — confirm the cleaned data has no nulls in 'content'.
count_corpus = data['content'].values.astype('U')

# Learn the vocabulary and encode every article in a single pass.
count_data = count_vectorizer.fit_transform(count_corpus)


In [27]:
# Slice off the first 10 rows, then convert just that slice to a dense array
# for display.
first_ten_counts = count_data[:10]
first_ten_counts.toarray()


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [28]:
# Collect the feature names (vocabulary terms) from the fitted count vectorizer.
cv_feature_names = count_vectorizer.get_feature_names_out()

# Convert the whole count matrix to a dense array and wrap it in a DataFrame
# whose columns are labelled with the feature names.
# NOTE(review): toarray() materialises every cell; with a large vocabulary
# this dense copy can be very big — confirm it fits in memory.
count_data_df = pd.DataFrame(data=count_data.toarray(), columns=cv_feature_names)

# Display the first five rows.
count_data_df.head()


Unnamed: 0,aa,aache,aachie,aadfi,aafp,aagiza,aagize,aagwa,aaisa,aakia,...,zungsong,zungu,zunguka,zungulicha,zuri,zuriamesema,zuzu,zuzuamesema,zwa,ùkuaji
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Check the shape of the count DataFrame as a (rows, columns) tuple.
count_data_df.shape 

(2000, 53491)

### Apply the TfidfVectorizer method to the News Articles Dataset

In [31]:
# Instantiate scikit-learn's TfidfVectorizer with its default settings.
tfidf_vectorizer = TfidfVectorizer()

# Cast the article text to a NumPy unicode array.
# NOTE(review): astype('U') turns any missing value into the literal text
# 'nan' — confirm the cleaned data has no nulls in 'content'.
tfidf_corpus = data['content'].values.astype('U')

# Learn the vocabulary and weights, and encode every article, in a single pass.
tfidf_data = tfidf_vectorizer.fit_transform(tfidf_corpus)


In [32]:
# Slice off the first 10 rows, then convert just that slice to a dense array
# for display.
first_ten_tfidf = tfidf_data[:10]
first_ten_tfidf.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.01798696],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [33]:

# Collect the feature names from the TF-IDF vectorizer itself (not from the
# count vectorizer), so the column labels are guaranteed to line up with the
# columns of tfidf_data even if the two vectorizers are configured differently.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# Convert the whole TF-IDF matrix to a dense array and wrap it in a DataFrame
# whose columns are labelled with the feature names.
# NOTE(review): toarray() materialises every cell; with a large vocabulary
# this dense copy can be very big — confirm it fits in memory.
tfidf_data_df = pd.DataFrame(tfidf_data.toarray(),
                             columns=list(tfidf_feature_names))

# Display the first five rows.
tfidf_data_df.head()



Unnamed: 0,aa,aache,aachie,aadfi,aafp,aagiza,aagize,aagwa,aaisa,aakia,...,zungsong,zungu,zunguka,zungulicha,zuri,zuriamesema,zuzu,zuzuamesema,zwa,ùkuaji
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# Check the shape of the TF-IDF DataFrame as a (rows, columns) tuple.

tfidf_data_df.shape 


(2000, 53491)