In [20]:
import pandas as pd

# Load Data

In [21]:
# Read the block-level dataset, using the block identifier as the row index.
df = pd.read_csv(
    'datasets/datos.csv',
    index_col='block_id',
)
# Preview the first ten rows.
df.head(10)

Unnamed: 0_level_0,quality,bits,inter_parts,non_zero_pixels,frame_width,frame_height,movement_level,mean,sub_mean_1,sub_mean_2,sub_mean_3,sub_mean_4,var_sub_blocks,sobel_h,sobel_v,variance,block_movement_h,block_movement_v,cost,relevant
block_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,22,2143,1,404,416,240,42352,6.675781,10.265625,4.335938,5.902344,6.199219,4.796791,17.397461,17.397461,87.584335,2.820312,2.539062,0.0,1.0
1,22,1735,0,381,416,240,42352,5.032227,3.699219,4.316406,6.855469,5.257812,1.416102,14.426758,14.233398,48.02533,1.84375,1.921875,8816.0,1.0
2,22,1971,0,460,416,240,42352,6.368164,4.90625,4.277344,9.050781,7.238281,3.615565,15.203125,13.953125,56.61348,3.0,4.9375,5104.0,1.0
3,22,2473,0,421,416,240,42352,8.407227,6.640625,5.996094,14.675781,6.316406,13.150191,21.725586,29.165039,283.079285,2.078125,2.5,21531.0,1.0
4,22,1093,1,167,416,240,42352,3.575195,7.53125,4.894531,0.191406,1.683594,8.1048,11.929688,14.353516,175.123245,3.070312,4.523438,15073.0,1.0
5,22,1018,2,200,416,240,42352,2.650391,1.773438,3.015625,1.269531,4.542969,1.597767,7.608398,7.557617,17.285976,0.570312,3.34375,15343.0,1.0
6,22,503,1,83,416,240,42352,1.423828,0.0,2.679688,0.28125,2.734375,1.656872,3.498047,5.064453,37.955135,1.6875,2.9375,7217.0,1.0
7,22,1552,2,308,416,240,42352,4.96875,5.871094,5.886719,1.632812,6.484375,3.770622,11.541992,13.59668,78.563477,2.585938,1.789062,19475.0,1.0
8,22,815,1,161,416,240,42352,2.291992,0.695312,2.753906,0.0,5.71875,4.939662,8.022461,7.231445,28.187202,2.046875,1.9375,11196.0,1.0
9,22,369,1,77,416,240,42352,0.907227,0.785156,2.84375,0.0,0.0,1.352786,2.682617,2.831055,6.808776,1.5625,1.8125,,0.0


# Categorical and Numerical Variables

In [22]:
# Names of the columns stored with the generic 'object' dtype
# (used here as the marker for categorical variables).
categoricalCols = df.select_dtypes(include='object').columns
categoricalCols

Index([], dtype='object')

There are no categorical (object-typed) variables; every column is numeric.

In [23]:
# Distinct values taken by the quality column.
df["quality"].unique()

array([22, 27, 32, 37], dtype=int64)

Since the `quality` variable represents video quality, we can treat it as a continuous variable rather than a categorical one, and we will probably obtain better results.

# Null Treatment

In [24]:
# Discard rows with no class label, then move the label column out of the
# feature frame into its own Series.
df = df.dropna(subset=['relevant'])
classCol = df.pop("relevant")

We keep only the rows where `relevant` is not null, and separate that column from the features as the class label.

In [25]:
# Flag each remaining column that still contains at least one missing value.
df.isna().any()

quality             False
bits                False
inter_parts         False
non_zero_pixels     False
frame_width         False
frame_height        False
movement_level      False
mean                False
sub_mean_1          False
sub_mean_2          False
sub_mean_3           True
sub_mean_4          False
var_sub_blocks      False
sobel_h             False
sobel_v             False
variance            False
block_movement_h    False
block_movement_v    False
cost                 True
dtype: bool

In [26]:
# Summary statistics of sub_mean_3, one of the columns with missing values.
df["sub_mean_3"].describe()

count    15984.000000
mean         3.034398
std          4.424289
min          0.000000
25%          0.000000
50%          1.435547
75%          4.421875
max         49.039062
Name: sub_mean_3, dtype: float64

In [27]:
# Summary statistics of cost, the other column with missing values.
df["cost"].describe()

count     15885.000000
mean      37025.493044
std       38720.225671
min           0.000000
25%        9905.000000
50%       23093.000000
75%       52280.000000
max      425705.000000
Name: cost, dtype: float64

In [28]:
from sklearn.impute import SimpleImputer

# Fill the remaining missing values with the mean of their column.
impNum = SimpleImputer(strategy = "mean")
columns, index = df.columns, df.index
impNum.fit(df)
# The imputer returns a bare array, so the labels are re-attached here.
df = pd.DataFrame(impNum.transform(df), columns = columns, index = index)
# Confirm that no column contains nulls any more.
df.isnull().any()

quality             False
bits                False
inter_parts         False
non_zero_pixels     False
frame_width         False
frame_height        False
movement_level      False
mean                False
sub_mean_1          False
sub_mean_2          False
sub_mean_3          False
sub_mean_4          False
var_sub_blocks      False
sobel_h             False
sobel_v             False
variance            False
block_movement_h    False
block_movement_v    False
cost                False
dtype: bool

# Feature Selection

In [36]:
from functools import partial

from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import mutual_info_classif

# mutual_info_classif is a randomized estimator (it accepts a random_state).
# Pinning the seed makes the selected columns reproducible across
# "Restart Kernel -> Run All"; without it the ranking can change between runs.
miScore = partial(mutual_info_classif, random_state = 0)

# Keep the top 40% of the columns, ranked by mutual information with the class.
fsPercMi = SelectPercentile(miScore, percentile = 40)
fsPercMi.fit(df, classCol)
# Mask over df's columns marking the ones that were kept.
colFilter = fsPercMi.get_support()
dfPercMi = df.iloc[:, colFilter]
dfPercMi

Unnamed: 0_level_0,quality,bits,non_zero_pixels,movement_level,mean,sub_mean_4,sobel_h,sobel_v
block_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,22.0,2143.0,404.0,42352.0,6.675781,6.199219,17.397461,17.397461
1,22.0,1735.0,381.0,42352.0,5.032227,5.257812,14.426758,14.233398
2,22.0,1971.0,460.0,42352.0,6.368164,7.238281,15.203125,13.953125
3,22.0,2473.0,421.0,42352.0,8.407227,6.316406,21.725586,29.165039
4,22.0,1093.0,167.0,42352.0,3.575195,1.683594,11.929688,14.353516
...,...,...,...,...,...,...,...,...
15995,37.0,66.0,3.0,11781038.0,0.835938,0.000000,1.432617,2.401367
15996,37.0,100.0,8.0,11781038.0,2.132812,0.000000,2.844727,5.889648
15997,37.0,141.0,15.0,11781038.0,4.184570,3.113281,7.579102,9.038086
15998,37.0,335.0,38.0,11781038.0,10.111328,23.636719,21.457031,34.392578


Keeping 40% of the original variables, ranked by the mutual information criterion.  
The features kept are quality, bits, non_zero_pixels, movement_level, mean, sub_mean_4, sobel_h and sobel_v.

In [37]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Keep the 4 columns with the highest chi-square score against the class.
# NOTE(review): scikit-learn documents chi2 for non-negative features such as
# counts or frequencies; the columns here are unscaled measurements, so
# confirm this score is appropriate for them.
fsKBestChi2 = SelectKBest(chi2, k = 4).fit(df, classCol)
colFilter = fsKBestChi2.get_support()
dfKBestChi2 = df.loc[:, colFilter]

dfKBestChi2

Unnamed: 0_level_0,bits,frame_width,movement_level,cost
block_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2143.0,416.0,42352.0,0.0
1,1735.0,416.0,42352.0,8816.0
2,1971.0,416.0,42352.0,5104.0
3,2473.0,416.0,42352.0,21531.0
4,1093.0,416.0,42352.0,15073.0
...,...,...,...,...
15995,66.0,2560.0,11781038.0,18704.0
15996,100.0,2560.0,11781038.0,41738.0
15997,141.0,2560.0,11781038.0,20642.0
15998,335.0,2560.0,11781038.0,125078.0


Using K-Best (k = 4) with the chi-square criterion.  
The features kept are bits, frame_width, movement_level and cost.

# Standardize Variables

In [41]:
import cufflinks as cf
from plotly.offline import iplot

# Connect plotly and pandas
cf.go_offline()
# NOTE(review): offline=False is passed right after go_offline(); confirm which
# plotting mode is actually intended here.
cf.set_config_file(offline=False, world_readable=True)

In [42]:
# One 50-bin histogram per column selected by the chi-square filter.
dfKBestChi2.iplot(
    kind='histogram',
    subplots=True,
    bins=50,
)

In [44]:
# Skewness of each column selected by the chi-square filter.
dfKBestChi2.skew()

bits              2.199336
frame_width       0.127924
movement_level    0.655840
cost              2.028507
dtype: float64

The cost and bits variables are right-skewed (positively skewed), as we can see from both the histograms and their skewness values (greater than 1).

In [43]:
# One 50-bin histogram per column selected by the mutual-information filter.
dfPercMi.iplot(
    kind='histogram',
    subplots=True,
    bins=50,
)

In [45]:
# Skewness of each column selected by the mutual-information filter.
dfPercMi.skew()

quality            -0.000201
bits                2.199336
non_zero_pixels     2.266299
movement_level      0.655840
mean               63.207199
sub_mean_4          2.382977
sobel_h             1.771370
sobel_v             1.832542
dtype: float64

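The non_zero_pixels, bits, sub_mean_4, sobel_h and sobel_v variables are right-skewed (positively skewed), as we can see from both the histograms and their skewness values (greater than 1). The mean variable has by far the largest skewness (about 63), so it is inspected separately below.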

In [48]:
# Summary statistics of the "mean" column.
# NOTE(review): the output saved under this cell is a raw value array, not a
# describe() table; re-run the cell to refresh it.
df["mean"].describe()

array([6.675781, 5.032227, 6.368164, ..., 2.132812, 4.18457 , 5.542969])

In [49]:
# Number of distinct values in the "mean" column.
df["mean"].nunique()

6598