# Why Use .values in train_test_split?

This notebook explains the difference between using a pandas Series directly vs using .values when splitting data.

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

## What are pandas Series and NumPy arrays?

In [2]:
# Build a five-row toy DataFrame pairing each review text with its sentiment label
data = dict(
    review=['great movie', 'terrible film', 'amazing', 'bad', 'excellent'],
    sentiment=['positive', 'negative', 'positive', 'negative', 'positive'],
)
df = pd.DataFrame(data)

# Show the frame itself, then its row labels as a plain Python list
print('DataFrame:')
print(df)
row_labels = df.index.tolist()
print(f'\nDataFrame index: {row_labels}')

DataFrame:
          review sentiment
0    great movie  positive
1  terrible film  negative
2        amazing  positive
3            bad  negative
4      excellent  positive

DataFrame index: [0, 1, 2, 3, 4]


In [3]:
# Selecting a single column from the DataFrame gives back a pandas Series
review_column = df['review']
print("Type of df['review']: ", type(review_column))
print("\ndf['review']:")
print(review_column)

Type of df['review']:  <class 'pandas.core.series.Series'>

df['review']:
0      great movie
1    terrible film
2          amazing
3              bad
4        excellent
Name: review, dtype: object


In [4]:
# The .values attribute of that same column exposes the data as a NumPy array
review_values = df['review'].values
print("Type of df['review'].values: ", type(review_values))
print("\ndf['review'].values:")
print(review_values)

Type of df['review'].values:  <class 'numpy.ndarray'>

df['review'].values:
['great movie' 'terrible film' 'amazing' 'bad' 'excellent']


## Key Difference: INDEX

In [5]:
# A Series carries an index alongside its data
series = df['review']
print("=== PANDAS SERIES (has INDEX) ===")
print("Value:", series.values)
print("Index:", series.index.tolist())

# The array taken from the same column is the data only
array = df['review'].values
print("\n=== NUMPY ARRAY (no INDEX) ===")
print("Values:", array)
print("Has index? No, it's just data")

=== PANDAS SERIES (has INDEX) ===
Value: ['great movie' 'terrible film' 'amazing' 'bad' 'excellent']
Index: [0, 1, 2, 3, 4]

=== NUMPY ARRAY (no INDEX) ===
Values: ['great movie' 'terrible film' 'amazing' 'bad' 'excellent']
Has index? No, it's just data


## The Problem: When you use pandas Series directly

In [6]:
# Split the pandas Series directly (WITHOUT .values): the returned pieces are
# still Series, so each row keeps the label it had in the original DataFrame.
X_train_series, X_test_series, y_train_series, y_test_series = train_test_split(
    df['review'],           # PANDAS SERIES (with index)
    df['sentiment'],        # PANDAS SERIES (with index)
    test_size=0.4,
    random_state=42         # fixed seed so the split is reproducible
)

print('When using pandas Series:')
print('\nX_train_series type:', type(X_train_series))
print('X_train_series index:', X_train_series.index.tolist())
# Two separate statements on purpose: joining the prints with a comma builds a
# tuple expression, which the notebook echoes as `(None, None)` below the cell.
print('X_train_series values:')
print(X_train_series.values)

When using pandas Series:

X_train_series type: <class 'pandas.core.series.Series'>
X_train_series index: [2, 0, 3]
X_train_series values:
['amazing' 'great movie' 'bad']


(None, None)

In [7]:
# Split the underlying NumPy arrays (WITH .values): the returned pieces are
# plain arrays that carry no index labels.
X_train_array, X_test_array, y_train_array, y_test_array = train_test_split(
    df['review'].values,           # NUMPY ARRAY (no index)
    df['sentiment'].values,        # NUMPY ARRAY (no index)
    test_size=0.4,
    random_state=42                # same seed as the Series split, so the same rows are selected
)

print('When using .values:')
print('\nX_train_array type:', type(X_train_array))
print('X_train_array has index? No, it\'s just an array')
# Two separate statements on purpose: joining the prints with a comma builds a
# tuple expression, which the notebook echoes as `(None, None)` below the cell.
print('X_train_array values:')
print(X_train_array)

When using .values:

X_train_array type: <class 'numpy.ndarray'>
X_train_array has index? No, it's just an array
X_train_array values:
['amazing' 'great movie' 'bad']


(None, None)

## Why This Matters: The Index Problem

In [8]:
# PROBLEM: when you split a Series, every row keeps its original DataFrame
# label, so the label no longer matches the row's position inside the split.
print('X_train_series (using Series):')
# Series.items() yields (label, value) pairs; enumerate adds the 0-based position.
for position, (label, review) in enumerate(X_train_series.items()):
    print(f'  Position {position}: index={label}, value={review}')

X_train_series (using Series):
  Position 0: index=2, value=amazing
  Position 1: index=0, value=great movie
  Position 2: index=3, value=bad


In [9]:
# CLEANER: a NumPy array has no index labels at all, so an element's position
# is the only way to address it.
print('X_train_array (using .values):')
for position, review in enumerate(X_train_array):
    print(f'  Position {position}: value={review}')

X_train_array (using .values):
  Position 0: value=amazing
  Position 1: value=great movie
  Position 2: value=bad


## Real-World Example: TfidfVectorizer

In [10]:
# Create sample IMDB-like data
data_large = {
    'review': [
        'this movie is great and amazing',
        'terrible waste of time',
        'absolutely fantastic film',
        'horrible acting bad plot'
    ],
    'sentiment': ['positive', 'negative', 'positive', 'negative']
}
df_large = pd.DataFrame(data_large)

# Encode the labels as integers (1 for 'positive', 0 otherwise) in a NumPy array
y = (df_large['sentiment'] == 'positive').astype(int).values

# NOTE: X_train_series / X_train_array are rebound here and replace the
# variables of the same name produced by the earlier split cells.

# Split 1: Using Series (WITHOUT .values)
X_train_series, X_test_series, y_train_s, y_test_s = train_test_split(
    df_large['review'],      # Series with index
    y,
    test_size=0.5,
    random_state=42
)

# Split 2: Using .values (RECOMMENDED)
X_train_array, X_test_array, y_train_a, y_test_a = train_test_split(
    df_large['review'].values,  # Array without index
    y,
    test_size=0.5,
    random_state=42
)

In [11]:
# Vectorizer shared by the Series and array demonstrations
vectorizer = TfidfVectorizer(max_features=10)

print("=== Using Series (with index) ===")
try:
    # Fit on the Series-based training split, then convert the result with .toarray()
    tfidf_from_series = vectorizer.fit_transform(X_train_series)
    X_train_vec_series = tfidf_from_series.toarray()
    print("Shape:", X_train_vec_series.shape)
    print("Vectorization works, but indices are preserved in the Series")
except Exception as err:
    # Report the failure instead of stopping the notebook
    print(f"Error: {err}")

=== Using Series (with index) ===
Shape: (2, 9)
Vectorization works, but indices are preserved in the Series


In [12]:
print("\n=== Using .values (no index) ===")
try:
    # Refit the same vectorizer on the array-based training split, then convert with .toarray()
    tfidf_from_array = vectorizer.fit_transform(X_train_array)
    X_train_vec_array = tfidf_from_array.toarray()
    print("Shape:", X_train_vec_array.shape)
    print("Vectorization works smoothly and cleanly")
except Exception as err:
    # Report the failure instead of stopping the notebook
    print(f"Error: {err}")


=== Using .values (no index) ===
Shape: (2, 9)
Vectorization works smoothly and cleanly


## Summary: Three Reasons to Use .values

### 1. **Clean Indices**
- With `df['review']`: Train split gets indices like [0, 2] and test gets [1, 3]
- With `df['review'].values`: Train and test are plain NumPy arrays with no index labels at all; elements are addressed only by position, starting from 0 in each split

### 2. **Consistency**
```python
# Good practice: Both X and y are NumPy arrays
X_train, X_test, y_train, y_test = train_test_split(
    df['review'].values,    # NumPy array
    y,                       # NumPy array from .values or direct conversion
    test_size=0.2
)
```

### 3. **Compatibility**
- Scikit-learn estimators and transformers (TfidfVectorizer, LogisticRegression, etc.) accept array-like inputs and work with NumPy arrays (or SciPy sparse matrices) internally
- While they can handle pandas Series, NumPy arrays are the standard expected input
- Using `.values` ensures you're passing the right data type

In [13]:
# BEST PRACTICE for IMDB dataset: print the recommended snippet as text
recommended_lines = [
    'RECOMMENDED CODE FOR IMDB:',
    '',
    '# Load data',
    "df = pd.read_csv('IMDB Dataset.csv')",
    '',
    '# Encode sentiment',
    "y = (df['sentiment'] == 'positive').astype(int).values",
    '',
    '# Split with .values',
    'X_train, X_test, y_train, y_test = train_test_split(',
    "    df['review'].values,  # <- Use .values!",
    '    y,',
    '    test_size=0.2,',
    '    random_state=42,',
    '    stratify=y',
    ')',
]
# One joined print emits the same lines, in the same order, as printing each separately
print('\n'.join(recommended_lines))

RECOMMENDED CODE FOR IMDB:

# Load data
df = pd.read_csv('IMDB Dataset.csv')

# Encode sentiment
y = (df['sentiment'] == 'positive').astype(int).values

# Split with .values
X_train, X_test, y_train, y_test = train_test_split(
    df['review'].values,  # <- Use .values!
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)
