# Learning any tabulated data

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Read the raw table and replace every missing value, in any column, with 0.0.
countries = pd.read_csv("countries.csv").fillna(0.0)

In [20]:
# Peek at five randomly chosen rows.
countries.sample(n=5)

Unnamed: 0,Country,Region,Population,Area (sq. mi.),Pop. Density (per sq. mi.),Coastline (coast/area ratio),Net migration,Infant mortality (per 1000 births),GDP ($ per capita),Literacy (%),Phones (per 1000),Arable (%),Crops (%),Other (%),Climate,Birthrate,Deathrate,Agriculture,Industry,Service
124,Madagascar,SUB-SAHARAN AFRICA,18595469,587040,317,82,0,7683,800.0,689,36,507,103,9391,2,4141,1111,276,165,559
53,Czech Republic,EASTERN EUROPE,10235455,78866,1298,0,97,393,15700.0,999,3143,398,305,5715,3,902,1059,34,393,573
57,Dominican Republic,LATIN AMER. & CARIB,9183984,48730,1885,264,-322,3238,6000.0,847,974,2265,1033,6702,2,2322,573,112,306,582
199,Tajikistan,C.W. OF IND. STATES,7320815,143100,512,0,-286,11076,1000.0,994,335,661,92,9247,2,3265,825,234,286,48
58,East Timor,ASIA (EX. NEAR EAST),1062777,15007,708,470,0,4741,500.0,586,0,471,67,9462,2,2699,624,85,231,684


In [30]:
# Swap the verbose CSV headers for short identifiers (listed in table column order).
# Columns that are not listed here keep their original name.
countries = countries.rename(
    columns={
        "Country": "country",
        "Region": "region",
        "Population": "population",
        "Area (sq. mi.)": "area",
        "Pop. Density (per sq. mi.)": "density",
        "Coastline (coast/area ratio)": "coast",
        "Net migration": "migrate",
        "Infant mortality (per 1000 births)": "inf_live",
        "GDP ($ per capita)": "gdp",
        "Literacy (%)": "literacy",
        "Phones (per 1000)": "phones",
        "Arable (%)": "arable",
        "Crops (%)": "crops",
        "Other (%)": "other",
        "Climate": "climate",
        "Birthrate": "birthrate",
        "Deathrate": "deathrate",
        "Agriculture": "agriculture",
        "Industry": "industry",
        "Service": "service",
    }
)

All the regions:

### DataFrame preprocessing management

In [32]:
import os
class col_core:
    """
    Base class for column preprocessors.

    Holds the column name, the directory in which metadata is stored, and a
    ``meta`` dict that can be written to / restored from
    ``<save_dir>/<col_name>.npy``.

    Subclasses are expected to define ``make_meta_list`` (names of the
    attributes that ``make_meta`` copies into ``meta``).
    """

    def __init__(self,col_name,save_dir = ".matchbox/fields"):
        """
        col_name: name of the column (also used as the metadata file stem)
        save_dir: directory for the metadata file; created if missing
        """
        # An empty string would otherwise turn into "/" below.
        save_dir = save_dir or "."
        # Pure-Python replacement for `mkdir -p`: portable, and safe for
        # paths containing spaces or shell metacharacters.
        os.makedirs(save_dir, exist_ok=True)
        self.col_name = col_name
        self.save_dir = save_dir
        if not self.save_dir.endswith("/"): self.save_dir+="/"

        self.meta = dict()

    def _meta_path(self):
        """Default location of this column's metadata file."""
        return self.save_dir+str(self.col_name)+".npy"

    def save_meta(self):
        """Write ``self.meta`` to the default metadata file with ``np.save``."""
        np.save(self._meta_path(),self.meta)

    def set_meta(self,meta):
        """
        Set every key/value pair of ``meta`` as an attribute on this object.

        ``self.meta`` itself is not modified. If ``meta`` is None, the
        object's own ``self.meta`` is applied instead.
        """
        if meta is None:
            meta = self.meta
        for k,v in meta.items():
            setattr(self,k,v)

    def load_meta(self,path=None):
        """
        Load a metadata dict saved by ``save_meta`` and apply it as attributes.

        path: file to read; defaults to ``<save_dir>/<col_name>.npy``
        """
        if path is None:
            path = self._meta_path()
        # The dict is stored as a pickled 0-d object array, so allow_pickle
        # must be enabled explicitly on current NumPy.
        # NOTE(security): unpickling can execute arbitrary code; only load
        # metadata files from a trusted source.
        self.meta = np.load(path, allow_pickle=True).item()
        self.set_meta(self.meta)

    def make_meta(self):
        """Copy each attribute named in ``self.make_meta_list`` into ``self.meta``."""
        for attr in self.make_meta_list:
            self.meta[attr]  = getattr(self,attr)
    
class categorical(col_core):
    """
    One-hot encoder for a categorical column.

    ``build`` keeps the most frequent categories and adds a catch-all
    ``"_other"`` slot; ``prepro`` turns a series into rows of an identity
    matrix (one row per value).
    """

    def __init__(self,col_name,save_dir = ".matchbox/fields"):
        super(categorical,self).__init__(col_name, save_dir)
        self.coltype = "categorical"
        self.make_meta_list = ["coltype","idx2cate","cate2idx","width","eye","dim_names"]

    def build(self,pandas_s,max_ = 20):
        """
        Build the category <-> index dictionaries from a pandas series.

        pandas_s: series whose distinct values define the categories
        max_: maximum number of slots, including the "_other" slot; only the
              ``max_-1`` most frequent categories get a slot of their own
        """
        assert max_>1, "max should be bigger than 1"

        vcount = pd.DataFrame(pandas_s.value_counts())

        print(vcount)

        # value_counts sorts by frequency, so this keeps the most common ones.
        self.cate_full = list(vcount.index.tolist())
        self.cate_list = self.cate_full[:max_-1]

        # build dictionary; the last index is reserved for unseen/rare values
        self.idx2cate = dict(enumerate(self.cate_list))
        self.idx2cate[len(self.cate_list)] = "_other"

        self.cate2idx = dict((v,k) for k,v in self.idx2cate.items())
        self.width = len(self.cate2idx)
        self.eye = np.eye(self.width)

        # dim_names is listed in make_meta_list, so it has to exist before
        # make_meta() runs. Names are ordered by their one-hot index.
        self.dim_names = list(
            "%s|%s"%(self.col_name,k)
            for k in sorted(self.cate2idx, key=self.cate2idx.get)
        )

        self.make_meta()

    def trans2idx(self,cate):
        """Return the index of ``cate``; unknown (or unhashable) values map to "_other"."""
        try:
            return self.cate2idx[cate]
        except (KeyError, TypeError):
            return self.cate2idx["_other"]

    def prepro_idx(self,pandas_s):
        """Map every value of the series to its category index."""
        return pandas_s.apply(self.trans2idx)

    def prepro(self,pandas_s):
        """Return the one-hot encoding: one row of ``self.eye`` per value."""
        return self.eye[self.prepro_idx(pandas_s).values]
    
class categorical_idx(col_core):
    """
    Index encoder for a categorical column.

    Same dictionaries as a one-hot encoder would use, but ``prepro`` returns
    the integer category index of each value (a single output dimension).
    """

    def __init__(self,col_name,save_dir = ".matchbox/fields"):
        super(categorical_idx,self).__init__(col_name, save_dir)
        self.coltype = "categorical_idx"
        self.dim_names = [self.col_name]
        self.width = 1
        self.make_meta_list = ["coltype","idx2cate","cate2idx","width","dim_names"]

    def build(self,pandas_s,max_ = 20):
        """
        Build the category <-> index dictionaries from a pandas series.

        pandas_s: series whose distinct values define the categories
        max_: maximum number of indices, including the "_other" index; only
              the ``max_-1`` most frequent categories get their own index
        """
        assert max_>1, "max should be bigger than 1"

        vcount = pd.DataFrame(pandas_s.value_counts())

        print(vcount)

        # value_counts sorts by frequency, so this keeps the most common ones.
        self.cate_full = list(vcount.index.tolist())
        self.cate_list = self.cate_full[:max_-1]

        # build dictionary; the last index is reserved for unseen/rare values
        self.idx2cate = dict(enumerate(self.cate_list))
        self.idx2cate[len(self.cate_list)] = "_other"

        self.cate2idx = dict((v,k) for k,v in self.idx2cate.items())

        self.make_meta()

    def trans2idx(self,cate):
        """Return the index of ``cate``; unknown (or unhashable) values map to "_other"."""
        # Only lookup failures are handled; anything else (e.g. a
        # KeyboardInterrupt) is no longer silently swallowed.
        try:
            return self.cate2idx[cate]
        except (KeyError, TypeError):
            return self.cate2idx["_other"]

    def prepro(self,pandas_s):
        """Return a numpy array holding the category index of each value."""
        return pandas_s.apply(self.trans2idx).values
    
class minmax(col_core):
    """
    Min-max scaler for a numeric column: clips to [min_, max_] and scales to 0~1.
    """

    def __init__(self,col_name,fillna=0.0,save_dir = ".matchbox/fields"):
        """
        col_name: name of the column
        fillna: value substituted for missing entries in build() and prepro()
        save_dir: directory for the metadata file
        """
        super(minmax,self).__init__(col_name, save_dir)
        self.coltype = "minmax"
        self.fillna = fillna
        self.dim_names = [self.col_name]
        self.width = 1
        # "coltype" is stored as well, like the categorical encoders do, so a
        # saved meta dict identifies which class it belongs to.
        self.make_meta_list = ["coltype","min_","max_","range","width","dim_names"]

    def build(self,pandas_s=None,min_=None,max_=None):
        """
        Fix the scaling bounds.

        pandas_s: optional series; bounds not given explicitly are taken from
                  its min/max (after filling missing values)
        min_, max_: explicit bounds; both are required when no series is given
        """
        if isinstance(pandas_s, pd.Series):
            filled = pandas_s.fillna(self.fillna)
            self.min_ = filled.min() if min_ is None else min_
            self.max_ = filled.max() if max_ is None else max_
        else:
            assert (min_ is not None) and (max_ is not None), "If no pandas series is set you have to set min_,max_ value"
            self.min_ = min_
            self.max_ = max_

        self.range = self.max_-self.min_
        assert self.range!=0, "the value range is 0"
        print("min_:%.3f \tmax_:%.3f\t range:%.3f"%(self.min_,self.max_,self.range))
        self.make_meta()

    def prepro(self,data):
        """
        Scale a pandas series to 0~1.

        Missing values are replaced with ``self.fillna`` first (the same
        treatment build() applies); values outside [min_, max_] are clipped.
        """
        values = data.fillna(self.fillna).values
        return (np.clip(values,self.min_,self.max_)-self.min_)/self.range
        
class tabulate(col_core):
    """
    Combines several column preprocessors into one table preprocessor.

    ``self.cols`` maps column name -> that column's meta dict. For every
    entry a preprocessor object is created from the meta, and ``prepro``
    concatenates the per-column outputs side by side.
    """

    def __init__(self,table_name,save_dir = ".matchbox/fields"):
        super(tabulate,self).__init__(table_name, save_dir)
        self.coltype = "tabulate"
        self.cols=dict()
        self._col_objs = dict()   # column name -> preprocessor object
        self.dim_names = []
        self.width = 0
        self.make_meta_list = ["col_name","coltype","cols","dim_names"]

    def build_url(self,metalist):
        """
        Build the table from metadata files.

        metalist: iterable of paths to ``.npy`` files written by ``save_meta``
        """
        for url in metalist:
            # NOTE(security): allow_pickle unpickles the file content; only
            # load metadata files from a trusted source.
            meta_dict = np.load(url, allow_pickle=True).item()
            if "col_name" not in meta_dict:
                # save_meta names the file "<col_name>.npy", so the file stem
                # recovers the column name when the meta does not carry it.
                meta_dict["col_name"] = os.path.splitext(os.path.basename(url))[0]
            self.cols[meta_dict["col_name"]] = meta_dict
        self.make_dim()
        self.make_meta()

    def build(self,*args):
        """
        Build the table from already-built column preprocessor objects.
        """
        for obj in args:
            # Copy so the column object's own meta is left untouched.
            meta = dict(obj.meta)
            meta.setdefault("col_name", obj.col_name)
            meta.setdefault("coltype", obj.coltype)
            self.cols[meta["col_name"]] = meta
        self.make_dim()
        self.make_meta()

    def make_col(self,meta):
        """
        Create the sub object described by a column's meta dict.

        Raises ValueError when ``meta["coltype"]` does not name a col_core
        subclass defined in this module.
        """
        col_name = meta["col_name"]
        coltype = meta.get("coltype")

        # Resolve the class by name without eval(): the string comes from a
        # file, so only genuine col_core subclasses are accepted.
        col_cls = globals().get(coltype) if isinstance(coltype, str) else None
        if not (isinstance(col_cls, type) and issubclass(col_cls, col_core)):
            raise ValueError("unknown coltype %r for column %r"%(coltype, col_name))

        col = col_cls(col_name, save_dir=self.save_dir)
        col.meta = dict(meta)
        col.set_meta(col.meta)
        self._col_objs[col_name] = col

        # Also expose the column as an attribute, unless that would shadow
        # something that is not a column object (e.g. a column named "cols").
        if isinstance(col_name, str):
            existing = getattr(self, col_name, None)
            if existing is None or isinstance(existing, col_core):
                setattr(self,col_name,col)

    def make_dim(self):
        """Create all sub objects and collect the output dimension names."""
        self.dim_names = []
        self._col_objs = dict()

        for k,meta in self.cols.items():
            self.make_col(meta)
            # Each column contributes a list of names; prefix with the table name.
            self.dim_names.extend("%s|%s"%(self.col_name, name) for name in meta["dim_names"])

        self.width = len(self.dim_names)

    def prepro(self,data):
        """
        data being a pandas dataframe

        Returns a 2-D array: the per-column outputs concatenated along axis 1,
        in the order of ``self.cols``.
        """
        # Sub objects are missing when cols were restored via load_meta.
        if set(self._col_objs) != set(self.cols):
            self.make_dim()

        data_list = []

        for k in self.cols.keys():
            # preprocess the data for every column
            arr = np.asarray(self._col_objs[k].prepro(data[k]))
            if arr.ndim == 1:
                # single-dimension encoders return a flat vector
                arr = arr.reshape(-1, 1)
            data_list.append(arr)

        if not data_list:
            return np.zeros((len(data), 0))
        return np.concatenate(data_list,axis = 1)

### Categorical data test

In [38]:
# Index encoder for the "region" column (metadata kept in the default directory).
region = categorical_idx(col_name="region")

In [34]:
# Derive the category <-> index dictionaries from the observed regions.
region.build(countries["region"])

                                     region
SUB-SAHARAN AFRICA                       51
LATIN AMER. & CARIB                      45
ASIA (EX. NEAR EAST)                     28
WESTERN EUROPE                           28
OCEANIA                                  21
NEAR EAST                                16
EASTERN EUROPE                           12
C.W. OF IND. STATES                      12
NORTHERN AFRICA                           6
NORTHERN AMERICA                          5
BALTICS                                   3


In [47]:
# Encode the column as category indices and show the first 20.
region.prepro(countries["region"])[:20]

array([2, 6, 8, 4, 3, 0, 1, 1, 1, 7, 1, 4, 3, 7, 1, 5, 2, 1, 7, 3])

In [36]:
# Persist region.meta to ".matchbox/fields/region.npy" (default save_dir + column name).
region.save_meta()

In [39]:
# Read the metadata back from the default path and re-apply it as attributes on `region`.
region.load_meta()

### Minmax test

In [49]:
# Min-max scaler for the "area" column.
area = minmax(col_name="area")

In [50]:
# Take the scaling bounds from the data, then write them to disk.
area.build(pandas_s=countries["area"])
area.save_meta()

min_:2.000 	max_:17075200.000	 range:17075198.000


In [51]:
# Clip to [min_, max_], scale by the stored range, and show the first 20 values.
area.prepro(countries["area"])[:20]

array([3.79203802e-02, 1.68349439e-03, 1.39485235e-01, 1.15372015e-05,
       2.72910452e-05, 7.30122134e-02, 5.85644746e-06, 2.58269333e-05,
       1.62041342e-01, 1.74510421e-03, 1.11858147e-05, 4.50176215e-01,
       4.91168536e-03, 5.07156637e-03, 8.16271647e-04, 3.88282467e-05,
       8.43316722e-03, 2.51241596e-05, 1.21578678e-02, 1.78773915e-03])

In [52]:
# Start from a fresh, un-built scaler and restore its bounds from the saved metadata.
area = minmax(col_name="area")
area.load_meta()

In [53]:
# Same scaling as before, now using the bounds restored by load_meta().
area.prepro(countries["area"])[:20]

array([3.79203802e-02, 1.68349439e-03, 1.39485235e-01, 1.15372015e-05,
       2.72910452e-05, 7.30122134e-02, 5.85644746e-06, 2.58269333e-05,
       1.62041342e-01, 1.74510421e-03, 1.11858147e-05, 4.50176215e-01,
       4.91168536e-03, 5.07156637e-03, 8.16271647e-04, 3.88282467e-05,
       8.43316722e-03, 2.51241596e-05, 1.21578678e-02, 1.78773915e-03])

In [48]:
# Show only the first rows instead of dumping the whole frame into the notebook.
countries.head(10)

Unnamed: 0,country,region,population,area,density,coast,migrate,inf_live,gdp,literacy,phones,arable,crops,other,climate,birthrate,deathrate,agriculture,industry,service
0,Afghanistan,ASIA (EX. NEAR EAST),31056997,647500,480,000,2306,16307,700.0,360,32,1213,022,8765,1,466,2034,038,024,038
1,Albania,EASTERN EUROPE,3581655,28748,1246,126,-493,2152,4500.0,865,712,2109,442,7449,3,1511,522,0232,0188,0579
2,Algeria,NORTHERN AFRICA,32930091,2381740,138,004,-039,31,6000.0,700,781,322,025,9653,1,1714,461,0101,06,0298
3,American Samoa,OCEANIA,57794,199,2904,5829,-2071,927,8000.0,970,2595,10,15,75,2,2246,327,0,0,0
4,Andorra,WESTERN EUROPE,71201,468,1521,000,66,405,19000.0,1000,4972,222,0,9778,3,871,625,0,0,0
5,Angola,SUB-SAHARAN AFRICA,12127071,1246700,97,013,0,19119,1900.0,420,78,241,024,9735,0,4511,242,0096,0658,0246
6,Anguilla,LATIN AMER. & CARIB,13477,102,1321,5980,1076,2103,8600.0,950,4600,0,0,100,2,1417,534,004,018,078
7,Antigua & Barbuda,LATIN AMER. & CARIB,69108,443,1560,3454,-615,1946,11000.0,890,5499,1818,455,7727,2,1693,537,0038,022,0743
8,Argentina,LATIN AMER. & CARIB,39921833,2766890,144,018,061,1518,11200.0,971,2204,1231,048,8721,3,1673,755,0095,0358,0547
9,Armenia,C.W. OF IND. STATES,2976372,29800,999,000,-647,2328,3500.0,986,1957,1755,23,8015,4,1207,823,0239,0343,0418
