In [1]:
# Dataset source
# https://archive.ics.uci.edu/ml/datasets/BlogFeedback

In [2]:
# Problem statement: Instances in this dataset contain features extracted from blog posts.
# The task associated with the data is to predict how many comments the post will receive.

In [3]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
# The version is compared numerically as (major, minor): a plain string
# comparison is lexicographic, so e.g. "0.9" >= "0.20" would wrongly pass.
import re
import sklearn
_sklearn_version_match = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert (
    _sklearn_version_match is not None
    and tuple(map(int, _sklearn_version_match.groups())) >= (0, 20)
), "Scikit-Learn >= 0.20 is required, found " + sklearn.__version__

# Common imports
import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [4]:
# Load the BlogFeedback training CSV. The file has no header row, so pandas
# labels the columns with integers starting at 0.

import pandas as pd

# Let pandas display up to 1000 columns and 1000 rows before truncating
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)

blfb_df = pd.read_csv('BlogFeedback/blogData_train.csv', header=None, sep=',')
print(blfb_df.shape)
blfb_df.head()

(52397, 281)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280
0,40.30467,53.845657,0.0,401.0,15.0,15.52416,32.44188,0.0,377.0,3.0,14.044226,32.615417,0.0,377.0,2.0,34.567566,48.475178,0.0,378.0,12.0,1.479934,46.18691,-356.0,377.0,0.0,1.076167,1.795416,0.0,11.0,0.0,0.400491,1.078097,0.0,9.0,0.0,0.377559,1.07421,0.0,9.0,0.0,0.972973,1.704671,0.0,10.0,0.0,0.022932,1.521174,-8.0,9.0,0.0,2.0,2.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,40.30467,53.845657,0.0,401.0,15.0,15.52416,32.44188,0.0,377.0,3.0,14.044226,32.615417,0.0,377.0,2.0,34.567566,48.475178,0.0,378.0,12.0,1.479934,46.18691,-356.0,377.0,0.0,1.076167,1.795416,0.0,11.0,0.0,0.400491,1.078097,0.0,9.0,0.0,0.377559,1.07421,0.0,9.0,0.0,0.972973,1.704671,0.0,10.0,0.0,0.022932,1.521174,-8.0,9.0,0.0,6.0,2.0,4.0,5.0,-2.0,0.0,0.0,0.0,0.0,0.0,35.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,40.30467,53.845657,0.0,401.0,15.0,15.52416,32.44188,0.0,377.0,3.0,14.044226,32.615417,0.0,377.0,2.0,34.567566,48.475178,0.0,378.0,12.0,1.479934,46.18691,-356.0,377.0,0.0,1.076167,1.795416,0.0,11.0,0.0,0.400491,1.078097,0.0,9.0,0.0,0.377559,1.07421,0.0,9.0,0.0,0.972973,1.704671,0.0,10.0,0.0,0.022932,1.521174,-8.0,9.0,0.0,6.0,2.0,4.0,5.0,-2.0,0.0,0.0,0.0,0.0,0.0,35.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,40.30467,53.845657,0.0,401.0,15.0,15.52416,32.44188,0.0,377.0,3.0,14.044226,32.615417,0.0,377.0,2.0,34.567566,48.475178,0.0,378.0,12.0,1.479934,46.18691,-356.0,377.0,0.0,1.076167,1.795416,0.0,11.0,0.0,0.400491,1.078097,0.0,9.0,0.0,0.377559,1.07421,0.0,9.0,0.0,0.972973,1.704671,0.0,10.0,0.0,0.022932,1.521174,-8.0,9.0,0.0,2.0,2.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,40.30467,53.845657,0.0,401.0,15.0,15.52416,32.44188,0.0,377.0,3.0,14.044226,32.615417,0.0,377.0,2.0,34.567566,48.475178,0.0,378.0,12.0,1.479934,46.18691,-356.0,377.0,0.0,1.076167,1.795416,0.0,11.0,0.0,0.400491,1.078097,0.0,9.0,0.0,0.377559,1.07421,0.0,9.0,0.0,0.972973,1.704671,0.0,10.0,0.0,0.022932,1.521174,-8.0,9.0,0.0,3.0,1.0,2.0,2.0,-1.0,0.0,0.0,0.0,0.0,0.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.0


In [5]:
# Total number of missing (NaN) entries across the whole dataframe:
# per-column counts first, then summed over all columns

blfb_df.isna().sum().sum()

0

In [6]:
# Print a concise summary of the dataframe: index range, column count,
# column dtypes and memory usage

blfb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52397 entries, 0 to 52396
Columns: 281 entries, 0 to 280
dtypes: float64(281)
memory usage: 112.3 MB


In [7]:
# Descriptive statistics for every column: count, mean, std, min,
# quartiles (25% / 50% / 75%) and max

blfb_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280
count,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52
397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0
mean,39.444167,46.806717,0.358914,339.853102,24.681661,15.214611,27.959159,0.002748,258.66603,5.829151,14.053114,28.114936,0.0,256.865145,3.639674,34.898467,41.610585,0.358914,286.265569,21.931008,1.161498,39.375051,-226.825047,252.386549,-0.467164,0.501345,0.778465,0.002195,5.537531,0.279176,0.191614,0.486134,0.0,4.387637,0.019257,0.179896,0.481069,0.0,4.372483,0.000401,0.457183,0.725934,0.002195,5.033895,0.17125,0.011718,0.685857,-3.926923,4.281505,0.000286,39.444167,15.214612,14.053114,34.898467,1.161498,0.501345,0.191614,0.179896,0.457183,0.011718,34.777888,2850.294864,0.001641,0.175697,7.6e-05,0.000305,0.35378,0.157795,0.241407,0.002424,0.003416,0.035002,7.6e-05,0.000649,0.004809,7.6e-05,0.016299,0.001412,0.223582,7.6e-05,7.6e-05,0.00334,0.001565,0.000191,0.000305,0.02086,0.000191,0.011623,0.063286,0.000134,0.000305,0.000248,0.004771,7.6e-05,0.000191,0.089623,7.6e-05,0.00042,0.000363,0.001107,0.600626,0.076378,0.030193,0.050213,0.003054,0.001279,0.002958,0.016127,0.000134,0.02523,0.000191,0.00042,0.000305,0.149493,0.067179,0.020803,0.00166,0.012272,0.051033,0.162509,0.037922,0.234861,0.000439,0.000134,0.010497,0.000477,0.006203,0.014944,0.000134,7.6e-05,0.004962,7.6e-05,0.000592,0.040403,0.027292,0.047274,0.003951,0.029639,0.252896,0.076894,0.074642,0.002462,0.651545,0.007768,0.003703,0.003073,0.005077,0.000248,7.6e-05,0.002271,0.268183,0.00208,0.049259,0.032387,0.000134,7.6e-05,0.008512,0.33437,0.111629,0.001221,7.6e-05,0.029086,0.027101,0.005611,0.001317,7.6e-05,0.001279,0.000267,7.6e-05,0.163292,0.00084,7.6e-05,0.000305,0.017005,0.002996,0.001221,0.003951,0.00042,7.6e-05,0.094337,0.042388,0.009848,0.076817,0.104281,0.000363,0.015822,0.02191,0.005726,0.014753,7.6e-05,0.108613,0.016337,0.130065,0.150829,0.020478,0.016909,0.077314,7.6e-05,0.000305,7.6e-05,0.003779,0.169723,0.00084,0.001794,0.01044,0.004141,0.051339,0.012043,0.001279,0.26471,0.002596,7.6e-05,0.467279,0.000496,0.004141,0.06384,0.000592,0.002214,0.027521,0.002271,0.042254,0.001431,0.000134,0.00030
5,0.006508,0.358322,0.006317,0.136019,0.001565,0.117373,0.004256,0.39111,0.156688,0.041873,0.001393,7.6e-05,0.005382,0.002424,0.000935,0.001813,0.027273,0.014466,7.6e-05,0.000134,0.020516,0.6383,0.123366,0.216768,0.00771,7.6e-05,0.015115,0.006393,0.000191,0.025956,0.014676,0.000248,0.042044,0.00229,0.003435,0.002615,0.008455,0.000305,0.113919,0.115808,0.1381,0.165296,0.167758,0.16165,0.13747,0.159227,0.167681,0.171327,0.162242,0.154455,0.096151,0.088917,0.119167,0.0,1.242094,0.769505,6.764719
std,79.121821,62.359996,6.840717,441.430109,69.598976,32.251189,38.584013,0.131903,321.348052,23.768317,28.664559,39.619195,0.0,320.364454,14.584109,66.923819,52.831047,6.840717,374.371147,61.383375,3.973923,56.862283,269.06704,319.11355,3.332347,0.960882,0.955727,0.075003,6.989038,0.861055,0.359482,0.635827,0.0,5.266506,0.136906,0.341918,0.640526,0.0,5.276613,0.017741,0.899341,0.911919,0.075003,6.503113,0.685764,0.022452,0.942957,4.533971,5.180441,0.018533,111.085471,57.537818,56.407499,94.875857,69.278848,1.562923,0.877402,0.870984,1.47221,1.166225,20.652368,3814.180711,0.04048,0.380566,0.008737,0.017472,0.478146,0.364552,0.427941,0.049173,0.058349,0.183787,0.008737,0.025465,0.069184,0.008737,0.126623,0.037554,0.416649,0.008737,0.008737,0.057696,0.039529,0.013814,0.017472,0.142917,0.013814,0.107182,0.243479,0.011558,0.017472,0.01575,0.06891,0.008737,0.013814,0.285644,0.008737,0.020487,0.019039,0.033253,0.489774,0.265605,0.171118,0.218386,0.055176,0.035736,0.054309,0.125965,0.011558,0.156826,0.013814,0.020487,0.017472,0.356577,0.250335,0.142725,0.040715,0.110097,0.220068,0.368921,0.19101,0.423916,0.020947,0.011558,0.101916,0.021838,0.078513,0.121328,0.011558,0.008737,0.070268,0.008737,0.024317,0.196905,0.162933,0.212226,0.06273,0.169591,0.434676,0.266425,0.262815,0.049558,0.476486,0.087792,0.060736,0.055347,0.07107,0.01575,0.008737,0.047603,0.443018,0.045563,0.216409,0.177028,0.011558,0.008737,0.091868,0.471774,0.314912,0.034928,0.008737,0.168048,0.162379,0.074697,0.036265,0.008737,0.035736,0.016344,0.008737,0.369635,0.028966,0.008737,0.017472,0.12929,0.054657,0.034928,0.06273,0.020487,0.008737,0.2923,0.201474,0.098748,0.266304,0.305627,0.019039,0.124786,0.14639,0.075451,0.120563,0.008737,0.311156,0.126768,0.336378,0.357886,0.141631,0.128933,0.267091,0.008737,0.017472,0.008737,0.061357,0.375393,0.028966,0.042318,0.10164,0.064221,0.22069,0.109077,0.035736,0.441183,0.050881,0.008737,0.498933,0.02227,0.064221,0.244469,0.024317,0.047,0.163596,0.047603,0.201171,0.037807
,0.011558,0.017472,0.08041,0.479512,0.07923,0.342812,0.039529,0.321867,0.065099,0.488004,0.36351,0.2003,0.0373,0.008737,0.073165,0.049173,0.030567,0.042542,0.162878,0.119405,0.008737,0.011558,0.14176,0.480497,0.32886,0.412047,0.08747,0.008737,0.122013,0.079704,0.013814,0.159005,0.120255,0.01575,0.200692,0.047802,0.058511,0.051067,0.091561,0.017472,0.317716,0.319998,0.345008,0.371451,0.373655,0.368133,0.344346,0.365891,0.373587,0.376798,0.368676,0.361388,0.2948,0.284627,1.438194,0.0,27.497979,20.338052,37.706565
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.666667,0.0,-1256.0,0.0,-138.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.055556,0.0,-20.0,0.0,-0.5,0.0,0.0,0.0,0.0,-1256.0,0.0,0.0,0.0,0.0,-20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.285714,5.214318,0.0,29.0,0.0,0.891566,3.075076,0.0,22.0,0.0,0.775,3.044565,0.0,22.0,0.0,1.824588,4.528968,0.0,21.0,0.0,0.057971,4.087037,-369.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-7.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,14.0,93.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,10.63066,19.35312,0.0,162.0,4.0,4.150685,11.051215,0.0,121.0,1.0,3.817239,11.001102,0.0,121.0,0.0,9.776869,16.073494,0.0,128.0,3.0,0.22381,14.501275,-107.0,116.0,0.0,0.108696,0.394463,0.0,2.0,0.0,0.043165,0.248167,0.0,2.0,0.0,0.037634,0.232074,0.0,2.0,0.0,0.092637,0.371303,0.0,2.0,0.0,0.0,0.327737,-2.0,2.0,0.0,3.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,35.0,1859.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,40.30467,77.44283,0.0,478.0,15.0,15.998589,45.701206,0.0,387.0,2.0,14.640625,45.751896,0.0,387.0,1.0,35.83086,70.30784,0.0,421.0,12.0,0.945946,62.566227,-18.0,387.0,0.0,0.498462,1.121571,0.0,10.0,0.0,0.203822,0.695269,0.0,8.0,0.0,0.193846,0.678433,0.0,8.0,0.0,0.435897,1.032096,0.0,9.0,0.0,0.014286,0.942809,0.0,8.0,0.0,25.0,4.0,3.0,21.0,1.0,0.0,0.0,0.0,0.0,0.0,55.0,3959.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1122.6666,559.4326,726.0,2044.0,1314.0,442.66666,359.53006,14.0,1424.0,588.0,438.0,363.74365,0.0,1424.0,588.0,1102.0,581.2171,726.0,1932.0,1290.0,30.68948,540.22546,0.0,1422.0,191.0,8.952662,5.194229,8.0,30.0,11.0,3.153846,4.76314,0.0,24.0,1.0,3.0,4.786569,0.0,24.0,1.0,8.491124,4.957952,8.0,26.0,11.0,0.666667,7.778174,0.0,23.0,1.0,2044.0,1424.0,1424.0,1932.0,1422.0,30.0,24.0,24.0,26.0,23.0,72.0,57894.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,136.0,0.0,1778.0,1778.0,1424.0


In [8]:
# Seed NumPy's global random number generator so that NumPy-based randomness
# is repeatable from run to run.
# NOTE(review): the train/validation/test splits in this notebook pass their own
# random_state, so they do not depend on this seed.

np.random.seed(2)

In [9]:
# Rank every column by its Pearson correlation with the target
# (column 280, the value to predict), strongest positive correlation first

corr_matrix = blfb_df.corr(method="pearson")
corr_matrix.loc[:, 280].sort_values(ascending=False)

280    1.000000
9      0.506540
20     0.503375
5      0.497631
4      0.491707
10     0.490111
14     0.489674
19     0.486316
0      0.485464
51     0.472061
15     0.471999
34     0.461627
21     0.440003
11     0.439152
6      0.433578
1      0.424616
16     0.384654
3      0.356604
29     0.338961
35     0.337775
30     0.335829
40     0.329670
25     0.328525
44     0.323661
13     0.322775
8      0.322106
23     0.320133
50     0.314446
53     0.314177
18     0.299688
54     0.296273
46     0.292805
36     0.285755
31     0.283884
26     0.266815
41     0.265203
56     0.260903
33     0.251493
38     0.251485
28     0.247457
48     0.245544
45     0.233080
43     0.232089
58     0.198638
55     0.191917
59     0.146145
52     0.117642
153    0.080473
57     0.067141
245    0.064753
66     0.064112
157    0.063923
231    0.063903
100    0.061460
68     0.061238
142    0.060322
209    0.058703
247    0.055606
193    0.055318
138    0.054750
2      0.053221
17     0.053221
225    0

In [10]:
# Separate the target (column 280) from the remaining feature columns,
# keeping both as NumPy arrays

y = blfb_df[280].values
X = blfb_df.drop(columns=[280]).values
print("X shape: ", X.shape, "y shape: ", y.shape)
print("Sample X values: ", X[:5], "\n", "Sample y values: ", y[:5])

X shape:  (52397, 280) y shape:  (52397,)
Sample X values:  [[40.30467  53.845657  0.       ...  0.        0.        0.      ]
 [40.30467  53.845657  0.       ...  0.        0.        0.      ]
 [40.30467  53.845657  0.       ...  0.        0.        0.      ]
 [40.30467  53.845657  0.       ...  0.        0.        0.      ]
 [40.30467  53.845657  0.       ...  0.        0.        0.      ]] 
 Sample y values:  [ 1.  0.  0.  1. 27.]


In [11]:
# Hold out 5% of the data as the test set, then 5% of what remains as the
# validation set; both splits use a fixed random_state so they are repeatable

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.05, random_state=2
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.05, random_state=2
)

# Report the shape of each split, one per line, followed by a blank line
for _split_name, _split_array in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_val", X_val),
    ("y_val", y_val),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(" " + _split_name + " shape: ", _split_array.shape, "")
print()

 X_train shape:  (47288, 280) 
 y_train shape:  (47288,) 
 X_val shape:  (2489, 280) 
 y_val shape:  (2489,) 
 X_test shape:  (2620, 280) 
 y_test shape:  (2620,) 



In [12]:
# Model 1
# Baseline: scikit-learn LinearRegression with default parameters, fitted on
# the training split; .score() reports R^2 on each of the three splits

from sklearn.linear_model import LinearRegression

lr_model_1 = LinearRegression().fit(X_train, y_train)
for _split_label, _split_X, _split_y in (
    ("Train", X_train, y_train),
    ("Validation", X_val, y_val),
    ("Test", X_test, y_test),
):
    print(_split_label + " set score: ", lr_model_1.score(_split_X, _split_y))

Train set score:  0.36185034379194736
Validation set score:  0.33685626509154276
Test set score:  0.4528242261622508


In [13]:
# Mean squared error of Model 1's predictions on the train, validation and
# test splits

from sklearn.metrics import mean_squared_error

for _split_label, _split_X, _split_y in (
    ("Train", X_train, y_train),
    ("Validation", X_val, y_val),
    ("Test", X_test, y_test),
):
    _split_predictions = lr_model_1.predict(_split_X)
    print(_split_label + " set mse: ", mean_squared_error(_split_y, _split_predictions))

Train set mse:  913.7090200347004
Validation set mse:  1024.1519082296165
Test set mse:  614.9178959575355


In [14]:
# The R^2 and MSE values above could be improved by using more complex models. Because the predictions of ordinary least squares are invariant to feature scaling, we do not normalize the data.