# Linear Regression with Train Test Split

We train the model on the training dataset and then check how well it performs on the testing dataset, which it has not seen during training.

Ultimately we are trying to avoid the scenario where the model learns to predict the training data very well but fails miserably when given new samples.

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Build the sample input array: the integers 1 through 100
# (np.arange excludes the stop value, 101).
a = np.arange(1,101)

In [3]:
# Display a: the result is a NumPy array holding the integers 1 through 100.

a

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100])

In [4]:
# Build a second array with the same length as a: the integers 501 through 600.
b = np.arange(501,601)

In [5]:
b

array([501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513,
       514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526,
       527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539,
       540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552,
       553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565,
       566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578,
       579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591,
       592, 593, 594, 595, 596, 597, 598, 599, 600])

In [6]:
# train_test_split takes an array and splits it into two arrays:
# the first is the training array and the second is the testing array.
# The elements are shuffled, so neither array keeps the original order.
# This bare call only displays the result; nothing is assigned.

train_test_split(a)

[array([ 37,  27,  79,  52,  85,  44,  59,  18,  46,  74,  22,  71,  25,
         86,   5,  16,   1,  26,  58,  41,  80,  87,  66,  10,   9,  55,
         53,   2,  35,  84,  91,  97,  34,   3,  73,  93,  11,  81,  98,
         88,  70,  65, 100,  61,  64,  67,  51,  62,  43,  99,  21,  76,
         13,  78,  28,  56,  31,  68,   7,  36,   8,  15,  92,  19,  49,
         39,  50,  82,  48,  69,  40,  14,  57,  33,  72]),
 array([12, 89, 90, 29, 20,  6, 83, 75, 96, 17, 95, 77, 42, 63, 47, 32, 30,
        45,  4, 23, 24, 94, 54, 60, 38])]

In [7]:
# Unpack the returned pair into the training array and the testing array.
a_train, a_test = train_test_split(a)

In [8]:
a_train.shape, a_test.shape

((75,), (25,))

In [9]:
# test_size sets the fraction of elements placed in the testing array,
# which changes the size of both arrays (here 20% testing, 80% training).

a_train, a_test = train_test_split(a, test_size = 0.2)

In [10]:
# The training array now has 80 elements and the testing array has 20.
# Each time the split is rerun, the elements are shuffled differently.

a_train.shape, a_test.shape

((80,), (20,))

In [11]:
a_train

array([ 41,  84,  11,   5,  59,  79,  51,  19,  97,  63,  81,  50,  85,
        66,  16,  61,  65,  46,  78,  92,  68,  49,  70,  24,  83,  13,
        57,  42,  18,   6,   1,  86,  39,  98,  43,  31,  22,  77,  52,
        69, 100,  90,  64,  14,  80,  82,   4,  94,  54,  67,  58,  25,
        33,  95,  56,  30,   3,  40,  62,  44,  28,  36,   9,  53,  47,
        89,  17,  96,  23,  10,  74,  55,  12,  75,  35,  76,  26,  20,
        91,   8])

In [12]:
a_test

array([32, 72,  7, 15, 48, 34, 88, 99, 45, 37, 87, 21, 71, 93, 27, 73, 29,
       60, 38,  2])

In [13]:
# Setting random_state fixes the shuffle, so rerunning the code produces
# the same two arrays every time.
# This makes the split reproducible, which allows an objective assessment:
# results are not affected by a different split on each run.
# The elements are still shuffled; the shuffle just no longer changes between runs.

a_train, a_test = train_test_split(a, test_size = 0.2, random_state = 42)

In [14]:
a_train

array([ 56,  89,  27,  43,  70,  16,  41,  97,  10,  73,  12,  48,  86,
        29,  94,   6,  67,  66,  36,  17,  50,  35,   8,  96,  28,  20,
        82,  26,  63,  14,  25,   4,  18,  39,   9,  79,   7,  65,  37,
        90,  57, 100,  55,  44,  51,  68,  47,  69,  62,  98,  80,  42,
        59,  49,  99,  58,  76,  33,  95,  60,  64,  85,  38,  30,   2,
        53,  22,   3,  24,  88,  92,  75,  87,  83,  21,  61,  72,  15,
        93,  52])

In [15]:
a_test

array([84, 54, 71, 46, 45, 40, 23, 81, 11,  1, 19, 31, 74, 34, 91,  5, 77,
       78, 13, 32])

In [16]:
# train_test_split can split several arrays at the same time.
# When we split a and b together, their elements are shuffled in the same way.
# This is extremely important for regressions, because each observation's
# inputs must still match its target after shuffling.

a_train, a_test, b_train, b_test = train_test_split(a, b, test_size = 0.2, random_state = 42)

In [17]:
a_train

array([ 56,  89,  27,  43,  70,  16,  41,  97,  10,  73,  12,  48,  86,
        29,  94,   6,  67,  66,  36,  17,  50,  35,   8,  96,  28,  20,
        82,  26,  63,  14,  25,   4,  18,  39,   9,  79,   7,  65,  37,
        90,  57, 100,  55,  44,  51,  68,  47,  69,  62,  98,  80,  42,
        59,  49,  99,  58,  76,  33,  95,  60,  64,  85,  38,  30,   2,
        53,  22,   3,  24,  88,  92,  75,  87,  83,  21,  61,  72,  15,
        93,  52])

In [18]:
a_test

array([84, 54, 71, 46, 45, 40, 23, 81, 11,  1, 19, 31, 74, 34, 91,  5, 77,
       78, 13, 32])

In [19]:
b_train

array([556, 589, 527, 543, 570, 516, 541, 597, 510, 573, 512, 548, 586,
       529, 594, 506, 567, 566, 536, 517, 550, 535, 508, 596, 528, 520,
       582, 526, 563, 514, 525, 504, 518, 539, 509, 579, 507, 565, 537,
       590, 557, 600, 555, 544, 551, 568, 547, 569, 562, 598, 580, 542,
       559, 549, 599, 558, 576, 533, 595, 560, 564, 585, 538, 530, 502,
       553, 522, 503, 524, 588, 592, 575, 587, 583, 521, 561, 572, 515,
       593, 552])

In [20]:
b_test

array([584, 554, 571, 546, 545, 540, 523, 581, 511, 501, 519, 531, 574,
       534, 591, 505, 577, 578, 513, 532])