### Setup

In [1]:
# Root folder that holds train.csv and test.csv; get_data() builds its file paths from it.
# NOTE(review): looks like a Google Drive folder mounted in Colab — no mount step is
# visible in this notebook, so confirm the drive is mounted before running.
c_root_path = '/content/drive/MyDrive/Projects/NewsClassification'

### Imports

In [2]:
from collections import Counter

import numpy as np, pandas as pd

### EDA

In [3]:
def get_data(root_path=None):
    """
    Read the train and test datasets.

    Parameters
    ----------
    root_path : str or path-like, optional
        Folder containing ``train.csv`` and ``test.csv``. When omitted, the
        notebook-level ``c_root_path`` is used.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df_train, df_test)``, each with the columns ``classid``, ``title``
        and ``desc``.
    """
    # Resolve the default lazily so the global is only needed when no folder is given.
    if root_path is None:
        root_path = c_root_path

    columns = ['classid', 'title', 'desc']
    # header=0 consumes the first line of each file as its header row, and
    # `names` then replaces those labels with the short column names above.
    df_train, df_test = (
        pd.read_csv(f'{root_path}/{split}.csv', names=columns, header=0)
        for split in ('train', 'test')
    )
    return df_train, df_test

# Load both splits once; the cells below inspect these two frames.
df_train, df_test = get_data()

In [4]:
# (rows, columns) of the train and test frames.
df_train.shape, df_test.shape

((120000, 3), (7600, 3))

In [5]:
# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,classid,title,desc
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [6]:
# Preview the first rows of the test split.
df_test.head()

Unnamed: 0,classid,title,desc
0,3,Fears for T N pension after talks,Unions representing workers at Turner Newall...
1,4,The Race is On: Second Private Team Sets Launc...,"SPACE.com - TORONTO, Canada -- A second\team o..."
2,4,Ky. Company Wins Grant to Study Peptides (AP),AP - A company founded by a chemistry research...
3,4,Prediction Unit Helps Forecast Wildfires (AP),AP - It's barely dawn when Mike Fitzpatrick st...
4,4,Calif. Aims to Limit Farm-Related Smog (AP),AP - Southern California's smog-fighting agenc...


In [7]:
# Number of rows per class label in each split (class-balance check).
df_train['classid'].value_counts(), df_test['classid'].value_counts()

(3    30000
 4    30000
 2    30000
 1    30000
 Name: classid, dtype: int64,
 3    1900
 4    1900
 2    1900
 1    1900
 Name: classid, dtype: int64)

In [8]:
# Column dtypes and non-null counts for both splits. info() prints its report and
# returns None, so the cell's echoed value is just (None, None).
df_train.info(), df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   classid  120000 non-null  int64 
 1   title    120000 non-null  object
 2   desc     120000 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.7+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7600 entries, 0 to 7599
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   classid  7600 non-null   int64 
 1   title    7600 non-null   object
 2   desc     7600 non-null   object
dtypes: int64(1), object(2)
memory usage: 178.2+ KB


(None, None)

In [10]:
# Pool every title and description of the training split into one list of strings.
reqdata = df_train['title'].to_list() + df_train['desc'].to_list()

# Plain whitespace tokenisation: case and punctuation are left untouched, so
# tokens such as '-' and '#39;s' are counted as words.
allwords = [word for text in reqdata for word in text.split()]

# Ten most frequent tokens as (token, count) pairs, most frequent first.
counts = Counter(allwords).most_common(10)
print(counts)


[('the', 177525), ('to', 117593), ('a', 98103), ('of', 97390), ('in', 92054), ('and', 68036), ('on', 55149), ('for', 48430), ('-', 39095), ('#39;s', 30928)]


### Observations

1. Size
    * Size of train data : 120,000
    * Size of test data : 7,600

2. Features
    * Columns : Title, Description

3. Target
    * Number of Classes : 4
    * Class Imbalance : None

4. Missing Values
    * None

5. Most Common Words
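    * Reference list of common news words: https://wordsrated.com/most-common-words-used-in-news/
    * Weekday names and the words *today*, *tomorrow*, and *yesterday* appear frequently.
    * Reporting verbs such as *says*, *said*, *tell*, and *told* are also quite frequent.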

6.