In [1]:
import pandas as pd
import numpy as np
import random

from tabulate import tabulate

from sklearn import linear_model, svm, tree
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

## Building some mock data

In [2]:
# Pre-define
device = ['iPhone', 'Android']
resolution = ['1334×750', '1920×1080', '2560×1440', '1280×840', '1184×720', '854×480']
size = [4, 4.7, 5.1, 5.5]

In [3]:
# The mock data may not be perfect but you get the idea...
def gen_data():
    de = []
    sc = []
    sz = []
    w = []
    h = []
    for i in range(2000):
        this_de = random.choice(device)
        this_sz = random.choice(size[:2]) if this_de == 'iPhone' else random.choice(size[2:])
        
        if this_sz == 4.7 and this_de == 'iPhone':
            this_sc = '1920×1080'
            this_w = random.randint(300, 315)
            this_h = random.randint(30, 35)
        elif this_sz == 4 and this_de == 'iPhone':
            this_sc = '1334×750'
            this_w = random.randint(290, 300)
            this_h = random.randint(33, 38)
            
        if this_sz == 5.1 and this_de =='Android':
            this_sc = random.choice(resolution[:3])
            this_w = random.randint(320, 330)
            this_h = random.randint(40, 45)
        elif this_sz == 5.5 and this_de =='Android':
            this_sc = random.choice(resolution[:3])
            this_w = random.randint(330, 350)
            this_h = random.randint(42, 47)
            
        de.append(this_de)
        sc.append(this_sc)
        sz.append(this_sz)
        w.append(this_w)
        h.append(this_h)
    return np.array(de), np.array(sc), np.array(sz), np.array(w), np.array(h)

In [4]:
de, sc, sz, width, height = gen_data()

In [5]:
# Just have a quick look...
de[:20]

array(['Android', 'Android', 'iPhone', 'iPhone', 'iPhone', 'Android',
       'Android', 'Android', 'Android', 'Android', 'iPhone', 'Android',
       'Android', 'Android', 'iPhone', 'iPhone', 'Android', 'Android',
       'iPhone', 'Android'], 
      dtype='<U7')

In [6]:
# How many samples we have now? We can check one column
width.shape 

(2000,)

In [7]:
# What is our data looks like?
df = pd.DataFrame(data={'width': width, 'height': height, 'device': de, 'screen': sc, 'size': sz})
df.head(10)

Unnamed: 0,device,height,screen,size,width
0,Android,45,1920×1080,5.1,329
1,Android,42,2560×1440,5.5,341
2,iPhone,33,1334×750,4.0,297
3,iPhone,34,1334×750,4.0,295
4,iPhone,30,1920×1080,4.7,312
5,Android,45,2560×1440,5.5,346
6,Android,44,1334×750,5.5,347
7,Android,46,1334×750,5.5,345
8,Android,44,2560×1440,5.5,344
9,Android,42,2560×1440,5.5,349


In [8]:
df.columns

Index(['device', 'height', 'screen', 'size', 'width'], dtype='object')

## However, you cannot have string values in a ML algorithm (most cases...)

In [9]:
enc = preprocessing.LabelEncoder()
df.head(8)

Unnamed: 0,device,height,screen,size,width
0,Android,45,1920×1080,5.1,329
1,Android,42,2560×1440,5.5,341
2,iPhone,33,1334×750,4.0,297
3,iPhone,34,1334×750,4.0,295
4,iPhone,30,1920×1080,4.7,312
5,Android,45,2560×1440,5.5,346
6,Android,44,1334×750,5.5,347
7,Android,46,1334×750,5.5,345


In [10]:
df['device'] = enc.fit_transform(df['device'])
df['screen'] = enc.fit_transform(df['device'])
# df['size'] = enc.fit_transform(df['size'])

In [11]:
df.head(8)

Unnamed: 0,device,height,screen,size,width
0,0,45,0,5.1,329
1,0,42,0,5.5,341
2,1,33,1,4.0,297
3,1,34,1,4.0,295
4,1,30,1,4.7,312
5,0,45,0,5.5,346
6,0,44,0,5.5,347
7,0,46,0,5.5,345


In [12]:
def train_test_spliter(df, random_seed=0, test_size=0.15, n_splits=1):
    # df = preprocessing.maxabs_scale(df, axis=0, copy=True)
    col = df.columns
    df1 = pd.DataFrame(data=df, columns=col)
    rs = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_seed)
    for train_index, test_index in rs.split(df):
        pass
    traindf = pd.DataFrame(data=df, index=train_index)
    testdf = pd.DataFrame(data=df, index=test_index)
    return traindf, testdf

traindf, testdf = train_test_spliter(df)

## Also, some preprocessing (normalization) is needed...

In [13]:
X_train = pd.DataFrame(data=traindf, columns=['device', 'screen', 'size'])
X_train = preprocessing.maxabs_scale(X_train, axis=0, copy=True)
X_test = pd.DataFrame(data=testdf, columns=['device', 'screen', 'size'])
X_test = preprocessing.maxabs_scale(X_test, axis=0, copy=True)

y_train = pd.DataFrame(data=traindf, columns=['height', 'width'])
y_test = pd.DataFrame(data=testdf, columns=['height', 'width'])

## Define some helpers

In [14]:
def apply_method(X_train, y_train, X_test, y_test, method, regressor, target):
    reg = regressor()
    reg.fit(X_train, y_train[target])
    pred = reg.predict(X_test)
    result = mean_squared_error(y_test[target], pred)
    if method == 'Regression Tree':
        global prediction, raw_data
        prediction = pred
        raw_data = np.array(y_train[target])
    # print(np.array(y_train[target]), pred)
    return method, target, result

def apply_method_multi(X_train, y_train, X_test, y_test, method, regressor):
    reg = regressor()
    reg.fit(X_train, y_train)
    pred = reg.predict(X_test)
    result = mean_squared_error(y_test, pred)
    return method, result

## Let's see the results from single-target learning algorithms

In [15]:
methods = {'Linear Regression': linear_model.LinearRegression, 
           'Support Vector Regression': svm.SVR, 
           'Regression Tree': tree.DecisionTreeRegressor, 
           'Random Forest': RandomForestRegressor, 
           'Gradient Boosting': GradientBoostingRegressor, 
           'Neural Network Multi-layer Perceptron': MLPRegressor}
targets = ['height', 'width']


result = []
for ct in targets:
    for name in methods:
        try:
            result.append(apply_method(X_train, y_train, X_test, y_test, name, methods[name], ct))
        except:
            pass



In [16]:
print(tabulate(result, headers=['Method', 'Target', 'Error']))

Method                                 Target         Error
-------------------------------------  --------  ----------
Gradient Boosting                      height       3.15684
Support Vector Regression              height       4.6879
Linear Regression                      height       4.746
Random Forest                          height       3.15915
Neural Network Multi-layer Perceptron  height       7.55303
Regression Tree                        height       3.15677
Gradient Boosting                      width       19.4181
Support Vector Regression              width       45.9378
Linear Regression                      width       23.8086
Random Forest                          width       19.3449
Neural Network Multi-layer Perceptron  width     3546.9
Regression Tree                        width       19.4187


## And then, multi-target learning...

In [17]:
result = []

for name in methods:
    try:
        result.append(apply_method_multi(X_train, y_train, X_test, y_test, name, methods[name]))
    except:
        pass



In [18]:
print(tabulate(result, headers=['Method', 'Error']))

Method                                     Error
-------------------------------------  ---------
Linear Regression                        14.2773
Random Forest                            11.2798
Neural Network Multi-layer Perceptron  1432.49
Regression Tree                          11.2877


In [19]:
prediction[:10]

array([ 307.4088785 ,  295.20472441,  307.4088785 ,  307.4088785 ,
        295.20472441,  340.03539823,  295.20472441,  340.03539823,
        324.88154897,  307.4088785 ])

In [20]:
raw_data[:10]

array([322, 300, 304, 338, 329, 299, 293, 332, 328, 321])

## Not good result? Discuss...

#### Possible Issue

- Need more features
- Sample size?
- Adjust params for our model training process...
- Normally distributed data?
- ... A lot to discover!