# Cleansing

In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime

In [2]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer

#### Import Data 

In [3]:
df = pd.read_csv('data-set/training-set(news).csv')

In [4]:
df       

Unnamed: 0,DATE,TIME,LABEL,NEWS TOPPIC,ARTICLE
0,1-มี.ค.-21,17:05,General,Suspect in schoolgirl's rape-murder arrested,SONGKHLA: Police have arrested the prime suspe...
1,1-มี.ค.-21,17:04,General,500kg ganja seized in Nakhon Phanom,NAKHON PHANOM: A man was arrested with 500 kil...
2,1-มี.ค.-21,12:59,General,Samut Sakhon central shrimp market reopens,SAMUT SAKHON: The Central Shrimp Market of thi...
3,1-มี.ค.-21,11:41,General,Thailand adds 80 Covid cases Monday,Thailand added 80 new coronavirus cases on Mon...
4,1-มี.ค.-21,4:44,General,BMA to give Green Bridge a revamp,Buoyed by the warm public response for Bangkok...
...,...,...,...,...,...
105,11-ก.พ.-21,6:24,Property,SC Asset buying 30 plots for development,Demand for low-rise homes persists SET-listed ...
106,10-ก.พ.-21,6:24,Property,Sansiri uses Bar-B-Q Plaza for campaign,Sansiri is moving into the more affordable mar...
107,9-ก.พ.-21,8:04,Property,Land prices to spike in H2,REIC envisages good prospects for Bangkok land...
108,9-ก.พ.-21,7:33,Property,Home loan help for lower earners,More than half of this year's new housing loan...


#### Reset Columns

In [5]:
"""
Reset columns names
Change LABEL to Category
Reset index 
"""
df = df.rename(columns= {'DATE':'Date','TIME':'Time', 'LABEL':'Category', 'NEWS TOPPIC':'Topic', 'ARTICLE':'Article'}, inplace= False)

In [6]:
df

Unnamed: 0,Date,Time,Category,Topic,Article
0,1-มี.ค.-21,17:05,General,Suspect in schoolgirl's rape-murder arrested,SONGKHLA: Police have arrested the prime suspe...
1,1-มี.ค.-21,17:04,General,500kg ganja seized in Nakhon Phanom,NAKHON PHANOM: A man was arrested with 500 kil...
2,1-มี.ค.-21,12:59,General,Samut Sakhon central shrimp market reopens,SAMUT SAKHON: The Central Shrimp Market of thi...
3,1-มี.ค.-21,11:41,General,Thailand adds 80 Covid cases Monday,Thailand added 80 new coronavirus cases on Mon...
4,1-มี.ค.-21,4:44,General,BMA to give Green Bridge a revamp,Buoyed by the warm public response for Bangkok...
...,...,...,...,...,...
105,11-ก.พ.-21,6:24,Property,SC Asset buying 30 plots for development,Demand for low-rise homes persists SET-listed ...
106,10-ก.พ.-21,6:24,Property,Sansiri uses Bar-B-Q Plaza for campaign,Sansiri is moving into the more affordable mar...
107,9-ก.พ.-21,8:04,Property,Land prices to spike in H2,REIC envisages good prospects for Bangkok land...
108,9-ก.พ.-21,7:33,Property,Home loan help for lower earners,More than half of this year's new housing loan...


In [7]:
#Function for finding unique values
def find_unique_values(values):
    result = []
    for i in values:
        if i not in result:
            result.append(i)
    for i in result:
         yield i

In [8]:
a = list(find_unique_values(df['Date']))

#### Reset Date Time

In [32]:
df['Date'] = df['Date'].str.replace('-','/') \
                       .str.replace('21','2021') \
                       .str.replace('มี.ค.','Jan', regex=True) \
                       .str.replace('ก.พ.','Fab', regex=True) \
                       .str.replace('ม.ค.','March', regex=True) 

In [33]:
df

Unnamed: 0,Date,Time,Category,Topic,Article
0,1/Jan/202021,17:05,General,Suspect in schoolgirl's rape-murder arrested,SONGKHLA: Police have arrested the prime suspe...
1,1/Jan/202021,17:04,General,500kg ganja seized in Nakhon Phanom,NAKHON PHANOM: A man was arrested with 500 kil...
2,1/Jan/202021,12:59,General,Samut Sakhon central shrimp market reopens,SAMUT SAKHON: The Central Shrimp Market of thi...
3,1/Jan/202021,11:41,General,Thailand adds 80 Covid cases Monday,Thailand added 80 new coronavirus cases on Mon...
4,1/Jan/202021,4:44,General,BMA to give Green Bridge a revamp,Buoyed by the warm public response for Bangkok...
...,...,...,...,...,...
105,11/Fab/202021,6:24,Property,SC Asset buying 30 plots for development,Demand for low-rise homes persists SET-listed ...
106,10/Fab/202021,6:24,Property,Sansiri uses Bar-B-Q Plaza for campaign,Sansiri is moving into the more affordable mar...
107,9/Fab/202021,8:04,Property,Land prices to spike in H2,REIC envisages good prospects for Bangkok land...
108,9/Fab/202021,7:33,Property,Home loan help for lower earners,More than half of this year's new housing loan...


#### Split Train, Test set

In [11]:
def split_set(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [12]:
train_set , test_set = split_set(df, 0.3)

#### Checking Train Test set size 

In [13]:
len(train_set)

77

In [14]:
len(test_set)

33

#### Reset index

In [15]:
train_set = train_set.reset_index(drop=True)

In [16]:
test_set = test_set.reset_index(drop=True)

#### Checking sum of categories

In [17]:
def count_vals(val1, val2):
    result = []
    i = val1.str.count(val2)
    for x in i:
        if x >= 1:
            result.append(x)
    return sum(result)

In [18]:
def category_count(val):
    result = [
        count_vals(val, 'Auto'),
        count_vals(val, 'Business'), 
        count_vals(val, 'General'), 
        count_vals(val, 'Life'),
        count_vals(val, 'Opinion'),
        count_vals(val, 'Politics'),
        count_vals(val, 'Property'),
        count_vals(val, 'Sports'),
        count_vals(val, 'Tech'),
        count_vals(val, 'Travel'),
        count_vals(val, 'World'),
    ]
    return result

In [19]:
a = category_count(train_set['Category'])

In [20]:
b = category_count(test_set['Category'])

In [21]:
x = list(find_unique_values(train_set['Category']))
y = list(find_unique_values(test_set['Category']))
x = sorted(x)
y = sorted(y)

count_cat = pd.DataFrame(list(zip(x,a, y, b)), columns=['Train','Count','Test','Count'])

In [22]:
count_cat

Unnamed: 0,Train,Count,Test,Count.1
0,Auto,7,Auto,3
1,Business,7,Business,3
2,General,6,General,4
3,Life,8,Life,2
4,Opinion,7,Opinion,3
5,Politics,6,Politics,4
6,Property,6,Property,4
7,Sports,8,Sports,2
8,Tech,8,Tech,2
9,Travel,7,Travel,3


In [23]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Date      77 non-null     object
 1   Time      77 non-null     object
 2   Category  77 non-null     object
 3   Topic     77 non-null     object
 4   Article   77 non-null     object
dtypes: object(5)
memory usage: 3.1+ KB


In [24]:
test_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Date      33 non-null     object
 1   Time      33 non-null     object
 2   Category  33 non-null     object
 3   Topic     33 non-null     object
 4   Article   33 non-null     object
dtypes: object(5)
memory usage: 1.4+ KB


In [25]:
train_set

Unnamed: 0,Date,Time,Category,Topic,Article
0,1/Jan/2021,9:30,Tech,Apple privacy moves to kneecap Facebook revenue,The upcoming iOS 14 update will give users the...
1,26/Fab/2021,4:00,Tech,Get social,Alternative apps to build a community It's 200...
2,1/Jan/2021,4:30,General,Down but not out,"Aung San Suu Kyi, leader of the National Leagu..."
3,19/Fab/2021,4:00,Property,Ananda to launch 5 projects,"After freezing new project launches last year,..."
4,1/Jan/2021,4:30,Life,Art extravaganza to kick off tomorrow,"Almost 9,000 illustration works by illustrator..."
...,...,...,...,...,...
72,28/Fab/2021,10:45,Sports,America's Cup on hold amid New Zealand's Covid...,WELLINGTON - The America's Cup final between T...
73,28/Fab/2021,4:00,Sports,Tiger mishap forces trip down memory lane,Tiger Woods' accident brought back memories of...
74,1/Jan/2021,12:59,Business,Samut Sakhon central shrimp market reopens,SAMUT SAKHON: The Central Shrimp Market of thi...
75,1/Jan/2021,12:59,General,Samut Sakhon central shrimp market reopens,SAMUT SAKHON: The Central Shrimp Market of thi...
