In [3]:
import torch
try:
    import torch_xla as xla
    # Check if TPU is available
    if xla.device_count() > 0:
        print("TPU is available")
        device = xla.devices()
        print(f"TPU device: {device}")
    else:
        print("TPU is not available")
except ImportError:
    print("torch_xla is not installed. TPU support is not available.")

TPU is available
TPU device: [device(type='xla', index=0)]


In [6]:
# !pip install torch_xla torch

In [7]:
import torch
import time

try:
    import torch_xla.core.xla_model as xm
    
    # Get TPU device
    device = xm.xla_device()
    print(f"Using device: {device}")
    
    # Create large matrices on TPU
    size = 5000
    a = torch.randn(size, size, device=device)
    b = torch.randn(size, size, device=device)
    
    # Perform matrix multiplication on TPU
    start_time = time.time()
    c = torch.matmul(a, b)
    xm.mark_step()  # Synchronize TPU operations
    tpu_time = time.time() - start_time
    
    print(f"Matrix multiplication ({size}x{size}) on TPU took: {tpu_time:.4f} seconds")
    print(f"Result shape: {c.shape}")
    print(f"Result sample values: {c[0, :5]}")
    
    # Simple neural network operation example
    input_tensor = torch.randn(1000, 512, device=device)
    weights = torch.randn(512, 256, device=device)
    bias = torch.randn(256, device=device)
    
    # Forward pass: linear layer + ReLU
    start_time = time.time()
    output = torch.matmul(input_tensor, weights) + bias
    output = torch.relu(output)
    xm.mark_step()
    nn_time = time.time() - start_time
    
    print(f"\nNeural network operation on TPU took: {nn_time:.4f} seconds")
    print(f"Output shape: {output.shape}")
    
except ImportError:
    print("torch_xla not available. Running on CPU instead.")
    
    # Fallback to CPU
    device = torch.device('cpu')
    print(f"Using device: {device}")
    
    size = 5000
    a = torch.randn(size, size, device=device)
    b = torch.randn(size, size, device=device)
    
    start_time = time.time()
    c = torch.matmul(a, b)
    cpu_time = time.time() - start_time
    
    print(f"Matrix multiplication ({size}x{size}) on CPU took: {cpu_time:.4f} seconds")
    print(f"Result shape: {c.shape}")
    print(f"Result sample values: {c[0, :5]}")


  device = xm.xla_device()
  xm.mark_step()  # Synchronize TPU operations


Using device: xla:0
Matrix multiplication (5000x5000) on TPU took: 1.4102 seconds
Result shape: torch.Size([5000, 5000])
Result sample values: tensor([-56.1368,  37.4464, -49.9332, -18.6929, 127.0401], device='xla:0')


  xm.mark_step()



Neural network operation on TPU took: 0.4315 seconds
Output shape: torch.Size([1000, 256])


In [None]:
import torch
import time

# Check for available GPU backends
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
    device = torch.device('cuda')
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    print("MPS (Apple Silicon) available")
    device = torch.device('mps')
else:
    print("No GPU acceleration available, using CPU")
    device = torch.device('cpu')

print(f"\nUsing device: {device}")

# Warm up GPU
if device.type == 'cuda':
    torch.cuda.synchronize()
    dummy = torch.randn(100, 100, device=device)
    _ = torch.matmul(dummy, dummy)
    torch.cuda.synchronize()

# Test 1: Large matrix multiplication
print("\n=== Test 1: Matrix Multiplication ===")
size = 5000
a = torch.randn(size, size, device=device)
b = torch.randn(size, size, device=device)

if device.type == 'cuda':
    torch.cuda.synchronize()
    
start_time = time.time()
c = torch.matmul(a, b)

if device.type == 'cuda':
    torch.cuda.synchronize()
    
gpu_time = time.time() - start_time

print(f"Matrix multiplication ({size}x{size}) on {device} took: {gpu_time:.4f} seconds")
print(f"Result shape: {c.shape}")
print(f"Result sample values: {c[0, :5].cpu()}")

# Test 2: Neural network operations
print("\n=== Test 2: Neural Network Operations ===")
batch_size = 1000
input_tensor = torch.randn(batch_size, 512, device=device)
weights = torch.randn(512, 256, device=device)
bias = torch.randn(256, device=device)

if device.type == 'cuda':
    torch.cuda.synchronize()
    
start_time = time.time()
output = torch.matmul(input_tensor, weights) + bias
output = torch.relu(output)

if device.type == 'cuda':
    torch.cuda.synchronize()
    
nn_time = time.time() - start_time

print(f"Neural network operation on {device} took: {nn_time:.4f} seconds")
print(f"Output shape: {output.shape}")

# Test 3: Convolution operation (common in image processing)
print("\n=== Test 3: 2D Convolution ===")
conv_layer = torch.nn.Conv2d(3, 64, kernel_size=3, padding=1).to(device)
images = torch.randn(32, 3, 224, 224, device=device)  # Batch of 32 RGB images

if device.type == 'cuda':
    torch.cuda.synchronize()
    
start_time = time.time()
conv_output = conv_layer(images)

if device.type == 'cuda':
    torch.cuda.synchronize()
    
conv_time = time.time() - start_time

print(f"Convolution operation on {device} took: {conv_time:.4f} seconds")
print(f"Input shape: {images.shape}, Output shape: {conv_output.shape}")

# Memory usage info
if device.type == 'cuda':
    print(f"\n=== GPU Memory Usage ===")
    print(f"Allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
    print(f"Cached: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")
