# Cleansing

In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime

In [2]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer

#### Import Data 

In [3]:
df = pd.read_csv('data-set/training-set(news).csv')

In [4]:
df       

Unnamed: 0,DATE,TIME,LABEL,NEWS TOPPIC,ARTICLE
0,1-มี.ค.-21,17:05,General,Suspect in schoolgirl's rape-murder arrested,SONGKHLA: Police have arrested the prime suspe...
1,1-มี.ค.-21,17:04,General,500kg ganja seized in Nakhon Phanom,NAKHON PHANOM: A man was arrested with 500 kil...
2,1-มี.ค.-21,12:59,General,Samut Sakhon central shrimp market reopens,SAMUT SAKHON: The Central Shrimp Market of thi...
3,1-มี.ค.-21,11:41,General,Thailand adds 80 Covid cases Monday,Thailand added 80 new coronavirus cases on Mon...
4,1-มี.ค.-21,4:44,General,BMA to give Green Bridge a revamp,Buoyed by the warm public response for Bangkok...
...,...,...,...,...,...
105,11-ก.พ.-21,6:24,Property,SC Asset buying 30 plots for development,Demand for low-rise homes persists SET-listed ...
106,10-ก.พ.-21,6:24,Property,Sansiri uses Bar-B-Q Plaza for campaign,Sansiri is moving into the more affordable mar...
107,9-ก.พ.-21,8:04,Property,Land prices to spike in H2,REIC envisages good prospects for Bangkok land...
108,9-ก.พ.-21,7:33,Property,Home loan help for lower earners,More than half of this year's new housing loan...


#### Evaluate raw data

#### Reset Columns

In [5]:
"""
Reset columns names
Change LABEL to Category
Reset index 
"""
df = df.rename(columns= {'DATE':'Date','TIME':'Time', 'LABEL':'Category', 'NEWS TOPPIC':'Topic', 'ARTICLE':'Article'}, inplace= False)

In [6]:
df

Unnamed: 0,Date,Time,Category,Topic,Article
0,1-มี.ค.-21,17:05,General,Suspect in schoolgirl's rape-murder arrested,SONGKHLA: Police have arrested the prime suspe...
1,1-มี.ค.-21,17:04,General,500kg ganja seized in Nakhon Phanom,NAKHON PHANOM: A man was arrested with 500 kil...
2,1-มี.ค.-21,12:59,General,Samut Sakhon central shrimp market reopens,SAMUT SAKHON: The Central Shrimp Market of thi...
3,1-มี.ค.-21,11:41,General,Thailand adds 80 Covid cases Monday,Thailand added 80 new coronavirus cases on Mon...
4,1-มี.ค.-21,4:44,General,BMA to give Green Bridge a revamp,Buoyed by the warm public response for Bangkok...
...,...,...,...,...,...
105,11-ก.พ.-21,6:24,Property,SC Asset buying 30 plots for development,Demand for low-rise homes persists SET-listed ...
106,10-ก.พ.-21,6:24,Property,Sansiri uses Bar-B-Q Plaza for campaign,Sansiri is moving into the more affordable mar...
107,9-ก.พ.-21,8:04,Property,Land prices to spike in H2,REIC envisages good prospects for Bangkok land...
108,9-ก.พ.-21,7:33,Property,Home loan help for lower earners,More than half of this year's new housing loan...


In [7]:
#Function for finding unique values
def find_unique_values(values):
    result = []
    for i in values:
        if i not in result:
            result.append(i)
    for i in result:
         yield i

In [8]:
a = list(find_unique_values(df['Date']))

#### Reset Date Time

In [9]:
df['Date'] = df['Date'].str.replace('-','/') \
                       .str.replace('21','2021') \
                       .str.replace('มี.ค.','Jan', regex=True) \
                       .str.replace('ก.พ.','Fab', regex=True) \
                       .str.replace('ม.ค.','March', regex=True) 

In [10]:
df

Unnamed: 0,Date,Time,Category,Topic,Article
0,1/Jan/2021,17:05,General,Suspect in schoolgirl's rape-murder arrested,SONGKHLA: Police have arrested the prime suspe...
1,1/Jan/2021,17:04,General,500kg ganja seized in Nakhon Phanom,NAKHON PHANOM: A man was arrested with 500 kil...
2,1/Jan/2021,12:59,General,Samut Sakhon central shrimp market reopens,SAMUT SAKHON: The Central Shrimp Market of thi...
3,1/Jan/2021,11:41,General,Thailand adds 80 Covid cases Monday,Thailand added 80 new coronavirus cases on Mon...
4,1/Jan/2021,4:44,General,BMA to give Green Bridge a revamp,Buoyed by the warm public response for Bangkok...
...,...,...,...,...,...
105,11/Fab/2021,6:24,Property,SC Asset buying 30 plots for development,Demand for low-rise homes persists SET-listed ...
106,10/Fab/2021,6:24,Property,Sansiri uses Bar-B-Q Plaza for campaign,Sansiri is moving into the more affordable mar...
107,9/Fab/2021,8:04,Property,Land prices to spike in H2,REIC envisages good prospects for Bangkok land...
108,9/Fab/2021,7:33,Property,Home loan help for lower earners,More than half of this year's new housing loan...


#### Set Category

In [11]:
df.Category = df.Category.astype('category')

In [12]:
type(df.Category)

pandas.core.series.Series

In [13]:
df

Unnamed: 0,Date,Time,Category,Topic,Article
0,1/Jan/2021,17:05,General,Suspect in schoolgirl's rape-murder arrested,SONGKHLA: Police have arrested the prime suspe...
1,1/Jan/2021,17:04,General,500kg ganja seized in Nakhon Phanom,NAKHON PHANOM: A man was arrested with 500 kil...
2,1/Jan/2021,12:59,General,Samut Sakhon central shrimp market reopens,SAMUT SAKHON: The Central Shrimp Market of thi...
3,1/Jan/2021,11:41,General,Thailand adds 80 Covid cases Monday,Thailand added 80 new coronavirus cases on Mon...
4,1/Jan/2021,4:44,General,BMA to give Green Bridge a revamp,Buoyed by the warm public response for Bangkok...
...,...,...,...,...,...
105,11/Fab/2021,6:24,Property,SC Asset buying 30 plots for development,Demand for low-rise homes persists SET-listed ...
106,10/Fab/2021,6:24,Property,Sansiri uses Bar-B-Q Plaza for campaign,Sansiri is moving into the more affordable mar...
107,9/Fab/2021,8:04,Property,Land prices to spike in H2,REIC envisages good prospects for Bangkok land...
108,9/Fab/2021,7:33,Property,Home loan help for lower earners,More than half of this year's new housing loan...


#### Create category id

In [14]:
df['Category_id'] = df['Category'].factorize(sort=False)[0]

#### Re-columns

In [15]:
cols = ['Date', 'Time','Category_id','Category', 'Topic', 'Article']
cols

['Date', 'Time', 'Category_id', 'Category', 'Topic', 'Article']

In [16]:
df = df[cols]

#### Create new DataFrame for unique Category_id and Category

In [17]:
category_id_df = df[['Category_id','Category']].drop_duplicates().sort_values('Category_id')
category_id_df

Unnamed: 0,Category_id,Category
0,0,General
10,1,Politics
20,2,World
30,3,Business
40,4,Opinion
50,5,Auto
60,6,Life
70,7,Sports
80,8,Travel
90,9,Tech


In [18]:
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['Category_id', 'Category']].values)

In [19]:
df.sample(5, random_state=0)

Unnamed: 0,Date,Time,Category_id,Category,Topic,Article
85,18/Fab/2021,4:00,8,Travel,Finding peace in northern Thailand,Lampang's ancient temples are a good place for...
10,1/Jan/2021,17:41,1,Politics,Protesters face action over damage to Din Daen...,Police are assessing damage caused at Ding Dae...
75,28/Fab/2021,10:45,7,Sports,America's Cup on hold amid New Zealand's Covid...,WELLINGTON - The America's Cup final between T...
2,1/Jan/2021,12:59,0,General,Samut Sakhon central shrimp market reopens,SAMUT SAKHON: The Central Shrimp Market of thi...
24,1/Jan/2021,18:49,2,World,"France, Germany struggle to sell AstraZeneca v...",Already facing a daunting Covid vaccination ch...


-------------------------------------------------------

# Preprocessing

In [20]:
label = df.Category_id

In [21]:
label

0       0
1       0
2       0
3       0
4       0
       ..
105    10
106    10
107    10
108    10
109    10
Name: Category_id, Length: 110, dtype: int64

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')

In [24]:
features = tfidf.fit_transform(df.Article).toarray()

In [25]:
features

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.07491793, 0.11658891, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.05358707, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.04013222, ..., 0.        , 0.        ,
        0.        ],
       [0.10665049, 0.13390502, 0.05210956, ..., 0.        , 0.        ,
        0.        ],
       [0.07759043, 0.        , 0.05421946, ..., 0.        , 0.        ,
        0.        ]])