In [74]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_samples
from sklearn.metrics import silhouette_score
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn import utils
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import r2_score
from sklearn.utils import resample
from sklearn.metrics import accuracy_score

import warnings # supress warnings
warnings.filterwarnings('ignore')

In [75]:
#Firstly, we read the dataset
df = pd.read_csv('data.csv')
np.random.seed(8)

#Then, we plot a correlation matrix to find the correlation between the class and the rest of our variables. As we can see, all values are pretty low (none of them go past ~0.25).
#The ideal would be >0.5.
corrMatrix = df.corr()

#We can plot it into a heatmap, for a more visual take on it.
ax = sns.heatmap(
    corrMatrix, 
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);

corrMatrix.style.background_gradient(cmap='coolwarm')

TypeError: 'float' object cannot be interpreted as an integer

In [76]:
#Printing the values our class takes allows us to realize that it's incredibly imbalanced. We will correct this later.
print(df['relevant'].value_counts())

1.0    13156
0.0     2842
Name: relevant, dtype: int64


In [77]:
#In order to do some feature selection, we use our previously plotted correlation matrix.
cor_target = abs(corrMatrix["relevant"])
#Then we select our "highly correlated" features. Given how low all the values are, our standards have to be just as low in order to pick any variables at all.
relevant_features = cor_target[cor_target>0.12]
print(relevant_features)

quality              0.250797
bits                 0.252393
skip_parts           0.241371
inter_other_parts    0.195646
non_zero_pixels      0.236884
frame_width          0.148213
frame_height         0.137324
sub_mean_1           0.141184
sub_mean_2           0.138525
sub_mean_3           0.151905
sub_mean_4           0.179225
sobel_h              0.194505
sobel_v              0.197248
relevant             1.000000
Name: relevant, dtype: float64


In [78]:
#We get rid of the variables we deem not important enough for our model.
df.pop('block_movement_h')
df.pop('block_movement_v')
df.pop('var_movement_h')
df.pop('var_movement_v')
df.pop('cost_1')
df.pop('cost_2')
df.pop('inter_16x16_parts')
df.pop('movement_level')
df.pop('mean')
df.pop('var_sub_blocks')
df.pop('intra_parts')
df.pop('inter_4x4_parts')
df.pop('variance')

#The following are just comprobations to figure out if we have any null (missing) or non-finite values. 
print(df.isnull().any())
print(np.any(np.isnan(df))) 
print(np.all(np.isfinite(df)))

quality              False
bits                 False
skip_parts           False
inter_other_parts    False
non_zero_pixels      False
frame_width          False
frame_height         False
sub_mean_1           False
sub_mean_2           False
sub_mean_3            True
sub_mean_4           False
sobel_h              False
sobel_v              False
relevant              True
dtype: bool
True
False


In [79]:
cat_mask = (df.dtypes==object)
cat_cols = df.columns[cat_mask].tolist()
df_cat = df[cat_cols]
df_num = df.drop(cat_cols,axis=1)

In [80]:
#To take care of our missing values, we will use SingleImputer, replacing those null values with our mean for that variable.
imp_num = SimpleImputer(strategy='mean')
columns = df_num.columns
index = df_num.index
df_num = pd.DataFrame(imp_num.fit_transform(df_num),columns=columns,index=index)
df_preprocessed=df_num
df_preprocessed=df_preprocessed.astype(int)

#We check whether our missing values are gone (they are).
print(np.any(np.isnan(df_preprocessed))) 
print(np.all(np.isfinite(df_preprocessed)))

False
True


In [81]:
#Some of our values have a very high variance, so we normalize our data to reduce their impact on our conclusions.
df_columns=df_preprocessed.columns
mms = MinMaxScaler()
df_preprocessed = mms.fit_transform(df_preprocessed)
df_preprocessed=pd.DataFrame(df_preprocessed)
df.columns=df_columns

In [82]:
#We rename the columns back to their original names after losing them in the normalization process.
cols_names={0:"quality",
            1:"bits",
            2:"skip_parts",
            3:"inter_other_parts",
            4:"non_zero_pixels",
            5:"frame_width",
            6:"frame_height",
            7:"sub_mean_1",
            8:"sub_mean_2",
            9:"sub_mean_3",
            10:"sub_mean_4",
            11:"sobel_h",
            12:"sobel_v",
            13:"relevant"
}
df_preprocessed=df_preprocessed.rename(columns=cols_names)
print(df_preprocessed)

       quality      bits  skip_parts  inter_other_parts  non_zero_pixels  \
0          0.0  0.449172        0.00           0.777778         0.487923   
1          0.0  0.363655        0.50           0.296296         0.460145   
2          0.0  0.413121        0.00           0.296296         0.555556   
3          0.0  0.518340        0.00           0.740741         0.508454   
4          0.0  0.229092        0.00           0.296296         0.201691   
5          0.0  0.213372        0.00           0.296296         0.241546   
6          0.0  0.105429        0.00           0.222222         0.100242   
7          0.0  0.325299        0.00           0.370370         0.371981   
8          0.0  0.170824        0.25           0.370370         0.194444   
9          0.0  0.077342        0.25           0.148148         0.092995   
10         0.0  0.005450        0.50           0.000000         0.000000   
11         0.0  0.065395        0.00           0.296296         0.065217   
12         0

In [83]:
#As a means to make a better model, we discretize our numerical variables to get ranges that should be more easily classified.
pd.cut(df_preprocessed['quality'],5)
pd.cut(df_preprocessed['bits'],5)
pd.cut(df_preprocessed['skip_parts'],5)
pd.cut(df_preprocessed['inter_other_parts'],5)
pd.cut(df_preprocessed['non_zero_pixels'],5)
pd.cut(df_preprocessed['frame_width'],5)
pd.cut(df_preprocessed['frame_height'],5)
pd.cut(df_preprocessed['sub_mean_1'],5)
pd.cut(df_preprocessed['sub_mean_2'],5)
pd.cut(df_preprocessed['sub_mean_3'],5)
pd.cut(df_preprocessed['sub_mean_4'],5)
pd.cut(df_preprocessed['sobel_h'],5)
pd.cut(df_preprocessed['sobel_v'],5)
pd.cut(df_preprocessed['relevant'],2)

0           (0.5, 1.0]
1           (0.5, 1.0]
2           (0.5, 1.0]
3           (0.5, 1.0]
4           (0.5, 1.0]
5           (0.5, 1.0]
6           (0.5, 1.0]
7           (0.5, 1.0]
8           (0.5, 1.0]
9        (-0.001, 0.5]
10          (0.5, 1.0]
11          (0.5, 1.0]
12          (0.5, 1.0]
13          (0.5, 1.0]
14          (0.5, 1.0]
15          (0.5, 1.0]
16          (0.5, 1.0]
17          (0.5, 1.0]
18          (0.5, 1.0]
19          (0.5, 1.0]
20          (0.5, 1.0]
21          (0.5, 1.0]
22          (0.5, 1.0]
23          (0.5, 1.0]
24          (0.5, 1.0]
25          (0.5, 1.0]
26          (0.5, 1.0]
27          (0.5, 1.0]
28          (0.5, 1.0]
29          (0.5, 1.0]
             ...      
15970       (0.5, 1.0]
15971       (0.5, 1.0]
15972       (0.5, 1.0]
15973       (0.5, 1.0]
15974       (0.5, 1.0]
15975       (0.5, 1.0]
15976       (0.5, 1.0]
15977       (0.5, 1.0]
15978    (-0.001, 0.5]
15979    (-0.001, 0.5]
15980       (0.5, 1.0]
15981    (-0.001, 0.5]
15982    (-

In [84]:
#Earlier on, we mentioned that our class was very imbalanced. We will save its values for now and then remove the variable itself in order to make our samples.
y=df_preprocessed.relevant
X = df_preprocessed.drop('relevant', axis=1)

In [85]:
#Here we split our dataset into 70% training and 30% test.
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size=0.3)

In [86]:
#We do the upsampling process.
X = pd.concat([x_train, y_train], axis=1)
not_relevant = X[X.relevant==0]
relevant = X[X.relevant==1]

relevant_upsampled = resample(relevant,
                          replace=True, #Sample with replacement
                          n_samples=len(not_relevant), #Matching number in majority class
                          random_state=27) #Reproducible results
upsampled = pd.concat([not_relevant, relevant_upsampled])

In [87]:
#Checking the number of values for the relevant class, we see it is now even.
upsampled.relevant.value_counts()

1.0    2009
0.0    2009
Name: relevant, dtype: int64

In [88]:
y_train = upsampled.relevant
x_train = upsampled.drop('relevant', axis=1)

In [89]:
#LinearRegression model
reg = LinearRegression().fit(x_train, y_train)
prediction=reg.predict(x_test)
y_test_array=y_test.to_numpy()
for i in range(len(prediction)):
    prediction[i]=int(prediction[i])
    
#Accuracy score and classification report.
print(accuracy_score(y_test, prediction))
print(classification_report(y_test, prediction))

#Confusion matrix.
conf_lr = pd.DataFrame(
    confusion_matrix(y_test,prediction),
    columns=['Prediction 0', 'Prediction 1'],
    index=['True 0', 'True 1']
)
print(conf_lr)

0.21604166666666666
              precision    recall  f1-score   support

         0.0       0.18      1.00      0.31       835
         1.0       1.00      0.05      0.10      3965

    accuracy                           0.22      4800
   macro avg       0.59      0.53      0.20      4800
weighted avg       0.85      0.22      0.13      4800

        Prediction 0  Prediction 1
True 0           834             1
True 1          3762           203


In [90]:
#RandomForestRegressor model.
rf = RandomForestRegressor(n_estimators=100)
rf.fit(x_train,y_train)
y_pred_rf= rf.predict(x_test)
y_test_array_rf=y_test.to_numpy()
for i in range(len(y_pred_rf)):
    y_pred_rf[i]=int(y_pred_rf[i])

#Accuracy score and classification report.
print(accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test,y_pred_rf))

#Confusion matrix.
conf_rf = pd.DataFrame(
    confusion_matrix(y_test,y_pred_rf),
    columns=['Prediction 0', 'Prediction 1'],
    index=['True 0', 'True 1']
)
print(conf_rf)

0.1825
              precision    recall  f1-score   support

         0.0       0.18      1.00      0.30       835
         1.0       1.00      0.01      0.02      3965

    accuracy                           0.18      4800
   macro avg       0.59      0.51      0.16      4800
weighted avg       0.86      0.18      0.07      4800

        Prediction 0  Prediction 1
True 0           835             0
True 1          3924            41


In [91]:
#NaiveBayes model.
nb=GaussianNB()
nb.fit(x_train, y_train)
y_pred_nb_gaussian=nb.predict(x_test)

#Accuracy score and classification report.
print(accuracy_score(y_test, y_pred_nb_gaussian))
print(classification_report(y_test,y_pred_nb_gaussian))

#Confusion matrix.
conf_nb=pd.DataFrame(
    confusion_matrix(y_test,y_pred_nb_gaussian),
    columns = ['Predicted 0', 'Prediction 1'],
    index = ['True 0', 'True 1']
)
print(conf_nb)

0.5339583333333333
              precision    recall  f1-score   support

         0.0       0.25      0.86      0.39       835
         1.0       0.94      0.47      0.62      3965

    accuracy                           0.53      4800
   macro avg       0.60      0.66      0.51      4800
weighted avg       0.82      0.53      0.58      4800

        Predicted 0  Prediction 1
True 0          714           121
True 1         2116          1849


In [92]:
#SVM model.
svm = SVC(C=10.0)
svm.fit(x_train, y_train)
y_pred_svm=svm.predict(x_test)

#Accuracy score and classification report.
print(accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test,y_pred_svm))

#Confusion matrix.
conf_svm=pd.DataFrame(
    confusion_matrix(y_test,y_pred_svm),
    columns = ['Predicted 0', 'Prediction 1'],
    index = ['True 0', 'True 1']
)
print(conf_svm)

0.653125
              precision    recall  f1-score   support

         0.0       0.30      0.77      0.44       835
         1.0       0.93      0.63      0.75      3965

    accuracy                           0.65      4800
   macro avg       0.62      0.70      0.59      4800
weighted avg       0.82      0.65      0.70      4800

        Predicted 0  Prediction 1
True 0          644           191
True 1         1474          2491


In [93]:
#LogisticRegression model
clf = LogisticRegression(random_state=0).fit(x_train, y_train)
y_pred_clf=clf.predict(x_test)

#Accuracy score and classification report.
print(accuracy_score(y_test, y_pred_clf))
print(classification_report(y_test,y_pred_clf))

#Confusion matrix.
conf_clf=pd.DataFrame(
    confusion_matrix(y_test,y_pred_clf),
    columns = ['Predicted 0', 'Prediction 1'],
    index = ['True 0', 'True 1']
)
print(conf_svm)

0.66875
              precision    recall  f1-score   support

         0.0       0.31      0.74      0.44       835
         1.0       0.92      0.65      0.77      3965

    accuracy                           0.67      4800
   macro avg       0.62      0.70      0.60      4800
weighted avg       0.82      0.67      0.71      4800

        Predicted 0  Prediction 1
True 0          644           191
True 1         1474          2491


In [96]:
#KNN model.
kne = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)
y_pred_kne=kne.predict(x_test)

#Accuracy score and classification report.
print(accuracy_score(y_test, y_pred_kne))
print(classification_report(y_test,y_pred_kne))

#Confusion matrix.
conf_kne=pd.DataFrame(
    confusion_matrix(y_test,y_pred_kne),
    columns = ['Predicted 0', 'Prediction 1'],
    index = ['True 0', 'True 1']
)
print(conf_kne)

0.656875
              precision    recall  f1-score   support

         0.0       0.29      0.68      0.41       835
         1.0       0.91      0.65      0.76      3965

    accuracy                           0.66      4800
   macro avg       0.60      0.67      0.58      4800
weighted avg       0.80      0.66      0.70      4800

        Predicted 0  Prediction 1
True 0          569           266
True 1         1381          2584
