In [1]:
import pandas as pd
import random

In [2]:
memory_list = []
for _ in range(100):
    for chr in ['A', 'B', 'C']:
        for _ in range(1):
            memory_list.append([chr, random.randrange(start=0, stop=365, step=1)])


In [3]:
# Print out the first five elements in the constructed list list[nested lists]
memory_list[0:5]

[['A', 53], ['B', 196], ['C', 50], ['A', 182], ['B', 205]]

In [4]:
df = pd.DataFrame(memory_list, columns=['Category', 'Value'])

In [5]:
df.head()

Unnamed: 0,Category,Value
0,A,53
1,B,196
2,C,50
3,A,182
4,B,205


In [6]:
print(f'Number of rows in the DataFrame: [{df.shape[0]}]. Number of columns: [{df.shape[1]}].')
print(f'DataFrame columns: {[col for col in df.columns]}')

Number of rows in the DataFrame: [300]. Number of columns: [2].
DataFrame columns: ['Category', 'Value']


In [7]:
# The df.sample() method in Pandas is a powerful tool for randomly sampling rows from a DataFrame. 
# It's particularly useful for tasks like: Creating training and testing sets
# Key Parameters:
# n: The number of rows to sample.
# frac: The fraction of rows to sample.
# replace: Whether to sample with replacement (default is False).
# weights: Weights associated with each row, used for weighted sampling.
# random_state: A seed for the random number generator, ensuring reproducibility.
# axis: The axis along which to sample (0 for rows, 1 for columns).


df = df.sample(frac=1, random_state=42, replace=False) # Ex: frac=0.2 Sample 20% of the rows

In [8]:
print(f'Number of rows in the DataFrame: [{df.shape[0]}]. Number of columns: [{df.shape[1]}].')

Number of rows in the DataFrame: [300]. Number of columns: [2].


In [16]:
# The issue is with how df.loc[0:size-1, ['Value']] is being used. 
# Since you’re shuffling the DataFrame with df.sample(frac=1, random_state=42, replace=False), 
# the indices are no longer in sequential order from 0 to df.shape[0] - 1. 
# Instead, the indices are mixed up after shuffling, so df.loc[0:size-1, ['Value']] 
# will not select the first size rows as intended; 
# it’s actually pulling rows based on the existing index values 0 to size-1.

# Solution
# To select the first size rows after shuffling, use .iloc rather than .loc, (as in X_train = df.loc[0:size-1, ['Value']])
# as .iloc allows selection by position regardless of index labels

# Alternatively, if you want to keep using column names, you can still use .loc but reset the index after shuffling to ensure a sequential order:
# df = df.sample(frac=1, random_state=42, replace=False).reset_index(drop=True)

# Use .loc now that indices are in sequential order
# X_train = df.loc[0:size-1, ['Value']]
# X_test = df.loc[size:, ['Value']]
# y_train = df.loc[0:size-1, 'Category']
# y_test = df.loc[size:, 'Category']

size = int(df.shape[0] * 0.7)
print(f'Splitting point: [{size}]')

X_train = df.iloc[0:size-1, 1]
X_test = df.iloc[size:, 1]
y_train = df.iloc[0:size-1, 0]
y_test = df.iloc[size:, 0]

Splitting point: [210]


In [17]:
print('Size of X_train =', X_train.shape[0])
X_train.head()

Size of X_train = 209


203    140
266     20
152    284
9       19
233     48
Name: Value, dtype: int64

In [19]:
print('Size of X_test =', X_test.shape[0])
X_test.head()

Size of X_test = 90


128     66
290     85
8       47
70      17
264    117
Name: Value, dtype: int64

In [20]:
print('Size of y_train =', y_train.shape[0])
y_train.head()

Size of y_train = 209


203    C
266    C
152    C
9      A
233    C
Name: Category, dtype: object

In [21]:
print('Size of y_test =', y_test.shape[0])
y_test.head()

Size of y_test = 90


128    C
290    C
8      C
70     B
264    A
Name: Category, dtype: object