Skip to content

Commit

Permalink
fix: remove race condition bug in refresh logic (#1390)
Browse files Browse the repository at this point in the history
Update the logic in forceRefresh() to reduce the churn on the thread pool when the certificate refresh API calls are failing.

New forceRefresh() logic ensures that:

- Only one refresh cycle may run at a time.
- If a refresh cycle is in progress, it will not be canceled until it succeeds.
Add new test cases to validate race conditions, deadlocks, and concurrency.

Add additional logging to help diagnose production problems with certificate refresh.

Related to #1314

Fixes #1209
Fixes #1159
  • Loading branch information
hessjcg committed Jul 20, 2023
1 parent 75fef46 commit c0a5d58
Show file tree
Hide file tree
Showing 7 changed files with 410 additions and 47 deletions.
78 changes: 54 additions & 24 deletions core/src/main/java/com/google/cloud/sql/core/CloudSqlInstance.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
import dev.failsafe.RateLimiter;
import java.io.IOException;
import java.security.KeyPair;
import java.time.Duration;
import java.time.Instant;
import java.time.temporal.ChronoUnit;
import java.util.List;
Expand Down Expand Up @@ -57,8 +56,7 @@ class CloudSqlInstance {
private final ListenableFuture<KeyPair> keyPair;
private final Object instanceDataGuard = new Object();
// Throttle refresh attempts (the default limiter permits bursts of up to 2 every 30 seconds).
private final RateLimiter<Object> forcedRenewRateLimiter =
RateLimiter.burstyBuilder(2, Duration.ofSeconds(30)).build();
private final RateLimiter<Object> forcedRenewRateLimiter;

private final RefreshCalculator refreshCalculator = new RefreshCalculator();

Expand All @@ -68,6 +66,9 @@ class CloudSqlInstance {
@GuardedBy("instanceDataGuard")
private ListenableFuture<InstanceData> nextInstanceData;

@GuardedBy("instanceDataGuard")
private boolean forceRefreshRunning;

/**
* Initializes a new Cloud SQL instance based on the given connection name.
*
Expand All @@ -82,12 +83,14 @@ class CloudSqlInstance {
AuthType authType,
CredentialFactory tokenSourceFactory,
ListeningScheduledExecutorService executor,
ListenableFuture<KeyPair> keyPair) {
ListenableFuture<KeyPair> keyPair,
RateLimiter<Object> forcedRenewRateLimiter) {
this.instanceName = new CloudSqlInstanceName(connectionName);
this.instanceDataSupplier = instanceDataSupplier;
this.authType = authType;
this.executor = executor;
this.keyPair = keyPair;
this.forcedRenewRateLimiter = forcedRenewRateLimiter;

if (authType == AuthType.IAM) {
HttpRequestInitializer source = tokenSourceFactory.create();
Expand Down Expand Up @@ -159,20 +162,21 @@ String getPreferredIp(List<String> preferredTypes) {
*/
void forceRefresh() {
synchronized (instanceDataGuard) {
nextInstanceData.cancel(false);
if (nextInstanceData.isCancelled()) {
logger.fine(
"Force Refresh: the next refresh operation was cancelled."
+ " Scheduling new refresh operation immediately.");
currentInstanceData = executor.submit(this::performRefresh);
nextInstanceData = currentInstanceData;
} else {
logger.fine(
"Force Refresh: the next refresh operation is already running."
+ " Marking it as the current operation.");
// Otherwise it's already running, so just move next to current.
currentInstanceData = nextInstanceData;
// Don't force a refresh until the current forceRefresh operation
// has produced a successful refresh.
if (forceRefreshRunning) {
return;
}

forceRefreshRunning = true;
nextInstanceData.cancel(false);
logger.fine(
String.format(
"[%s] Force Refresh: the next refresh operation was cancelled."
+ " Scheduling new refresh operation immediately.",
instanceName));
currentInstanceData = executor.submit(this::performRefresh);
nextInstanceData = currentInstanceData;
}
}

Expand All @@ -182,10 +186,14 @@ void forceRefresh() {
* would expire.
*/
private InstanceData performRefresh() throws InterruptedException, ExecutionException {
logger.fine("Refresh Operation: Acquiring rate limiter permit.");
logger.fine(
String.format("[%s] Refresh Operation: Acquiring rate limiter permit.", instanceName));
// To avoid unreasonable SQL Admin API usage, use a rate limit to throttle our usage.
forcedRenewRateLimiter.acquirePermit();
logger.fine("Refresh Operation: Acquired rate limiter permit. Starting refresh...");
logger.fine(
String.format(
"[%s] Refresh Operation: Acquired rate limiter permit. Starting refresh...",
instanceName));

try {
InstanceData data =
Expand All @@ -194,15 +202,16 @@ private InstanceData performRefresh() throws InterruptedException, ExecutionExce

logger.fine(
String.format(
"Refresh Operation: Completed refresh with new certificate expiration at %s.",
data.getExpiration().toInstant().toString()));
"[%s] Refresh Operation: Completed refresh with new certificate expiration at %s.",
instanceName, data.getExpiration().toInstant().toString()));
long secondsToRefresh =
refreshCalculator.calculateSecondsUntilNextRefresh(
Instant.now(), data.getExpiration().toInstant());

logger.fine(
String.format(
"Refresh Operation: Next operation scheduled at %s.",
"[%s] Refresh Operation: Next operation scheduled at %s.",
instanceName,
Instant.now()
.plus(secondsToRefresh, ChronoUnit.SECONDS)
.truncatedTo(ChronoUnit.SECONDS)
Expand All @@ -212,12 +221,17 @@ private InstanceData performRefresh() throws InterruptedException, ExecutionExce
currentInstanceData = Futures.immediateFuture(data);
nextInstanceData =
executor.schedule(this::performRefresh, secondsToRefresh, TimeUnit.SECONDS);
// Refresh completed successfully, reset forceRefreshRunning.
forceRefreshRunning = false;
}

return data;
} catch (ExecutionException | InterruptedException e) {
logger.log(
Level.FINE, "Refresh Operation: Failed! Starting next refresh operation immediately.", e);
Level.FINE,
String.format(
"[%s] Refresh Operation: Failed! Starting next refresh operation immediately.",
instanceName),
e);
synchronized (instanceDataGuard) {
nextInstanceData = executor.submit(this::performRefresh);
}
Expand All @@ -228,4 +242,20 @@ private InstanceData performRefresh() throws InterruptedException, ExecutionExce
/**
 * Returns the {@code SslData} held by the instance data obtained from {@code getInstanceData()}.
 */
SslData getSslData() {
return getInstanceData().getSslData();
}

/**
 * Returns the future currently stored in {@code nextInstanceData}.
 *
 * <p>The field is read while holding {@code instanceDataGuard}, so the caller gets a consistent
 * snapshot of the reference; the returned future may be replaced by a later refresh.
 */
ListenableFuture<InstanceData> getNext() {
  final ListenableFuture<InstanceData> snapshot;
  synchronized (instanceDataGuard) {
    snapshot = nextInstanceData;
  }
  return snapshot;
}

/**
 * Returns the future currently stored in {@code currentInstanceData}.
 *
 * <p>The field is read while holding {@code instanceDataGuard}, so the caller gets a consistent
 * snapshot of the reference; the returned future may be replaced by a later refresh.
 */
ListenableFuture<InstanceData> getCurrent() {
  final ListenableFuture<InstanceData> snapshot;
  synchronized (instanceDataGuard) {
    snapshot = currentInstanceData;
  }
  return snapshot;
}

/**
 * Returns the {@code CloudSqlInstanceName} that was built from the connection name passed to the
 * constructor.
 */
public CloudSqlInstanceName getInstanceName() {
return instanceName;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -70,4 +70,9 @@ String getRegionId() {
String getInstanceId() {
return instanceId;
}

/** Returns the value of {@code connectionName} as this object's string representation. */
@Override
public String toString() {
return connectionName;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,15 @@
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.ListeningScheduledExecutorService;
import com.google.common.util.concurrent.MoreExecutors;
import dev.failsafe.RateLimiter;
import java.io.File;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.security.KeyPair;
import java.security.KeyPairGenerator;
import java.security.NoSuchAlgorithmException;
import java.time.Duration;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
Expand Down Expand Up @@ -125,6 +127,7 @@ static ListeningScheduledExecutorService getDefaultExecutor() {
// there should be enough free threads so that there will not be a deadlock. Most users
// configure 3 or fewer instances, requiring 6 threads during refresh. By setting
// this to 8, it's enough threads for most users, plus a safety factor of 2.

ScheduledThreadPoolExecutor executor =
(ScheduledThreadPoolExecutor) Executors.newScheduledThreadPool(8);

Expand Down Expand Up @@ -350,6 +353,12 @@ CloudSqlInstance getCloudSqlInstance(String instanceName, AuthType authType) {
instanceName,
k ->
new CloudSqlInstance(
k, adminApiService, authType, credentialFactory, executor, localKeyPair));
k,
adminApiService,
authType,
credentialFactory,
executor,
localKeyPair,
RateLimiter.burstyBuilder(2, Duration.ofSeconds(30)).build()));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,7 @@ public InstanceData getInstanceData(
ListenableFuture<KeyPair> keyPair)
throws ExecutionException, InterruptedException {

ListenableFuture<Optional<AccessToken>> token =
executor.submit(() -> accessTokenSupplier.get());
ListenableFuture<Optional<AccessToken>> token = executor.submit(accessTokenSupplier::get);

// Fetch the metadata
ListenableFuture<Metadata> metadataFuture =
Expand Down Expand Up @@ -152,14 +151,18 @@ public InstanceData getInstanceData(
.orElse(x509Certificate.getNotAfter());
}

logger.fine(String.format("[%s] INSTANCE DATA DONE", instanceName));

return new InstanceData(
Futures.getDone(metadataFuture),
Futures.getDone(sslContextFuture),
expiration);
},
executor);

return done.get();
InstanceData instanceData = done.get();
logger.fine(String.format("[%s] ALL FUTURES DONE", instanceName));
return instanceData;
}

String getApplicationName() {
Expand Down Expand Up @@ -219,6 +222,9 @@ private Metadata fetchMetadata(CloudSqlInstanceName instanceName, AuthType authT
try {
Certificate instanceCaCertificate =
createCertificate(instanceMetadata.getServerCaCert().getCert());

logger.fine(String.format("[%s] METADATA DONE", instanceName));

return new Metadata(ipAddrs, instanceCaCertificate);
} catch (CertificateException ex) {
throw new RuntimeException(
Expand Down Expand Up @@ -288,6 +294,8 @@ private Certificate fetchEphemeralCertificate(
ex);
}

logger.fine(String.format("[%s %d] CERT DONE", instanceName, Thread.currentThread().getId()));

return ephemeralCertificate;
}

Expand Down Expand Up @@ -339,6 +347,9 @@ private SslData createSslData(

sslContext.init(kmf.getKeyManagers(), tmf.getTrustManagers(), new SecureRandom());

logger.fine(
String.format("[%s %d] SSL CONTEXT", instanceName, Thread.currentThread().getId()));

return new SslData(sslContext, kmf, tmf);
} catch (GeneralSecurityException | IOException ex) {
throw new RuntimeException(
Expand Down
Loading

0 comments on commit c0a5d58

Please sign in to comment.