In [None]:
import pandas as pd

# Read the Twitter dataset from the CSV file in the working directory.
twitter = pd.read_csv(filepath_or_buffer='twitter.csv')

# Print the first five rows as a quick check of the parsed columns.
print(twitter.head(n=5))

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the timestamp column with its own fitted scaler.
scaler_timestamp = MinMaxScaler()
timestamp_input = twitter[['timestamp']]
twitter['timestamp_scaled'] = scaler_timestamp.fit_transform(timestamp_input)

# Min-max scale latitude and longitude together with a second scaler
# (MinMaxScaler rescales every input column independently).
scaler_latlong = MinMaxScaler()
geo_input = twitter[['latitude', 'longitude']]
twitter[['lat_scaled', 'long_scaled']] = scaler_latlong.fit_transform(geo_input)

# Report the observed range of each scaled feature as a sanity check.
timestamp_scaled = twitter['timestamp_scaled']
print("Scaled Features Summary:")
print(f"  Timestamp range: [{timestamp_scaled.min()}, {timestamp_scaled.max()}]")
for axis_label, scaled_column in (("Latitude", 'lat_scaled'), ("Longitude", 'long_scaled')):
    lowest = twitter[scaled_column].min()
    highest = twitter[scaled_column].max()
    print(f"  {axis_label} range: [{lowest:.4f}, {highest:.4f}]")

## Twitter Dataset - DBSCAN with L2 (Euclidean) Distance


In [None]:
# Prepare features for clustering (scaled timestamp, lat, long - no response variable)
twitter_features_scaled = twitter[['timestamp_scaled', 'lat_scaled', 'long_scaled']]

print(f"Twitter dataset size: {len(twitter_features_scaled)} samples")
print("Note: DBSCAN on large datasets is extremely memory-intensive")

# Use sampling for computational feasibility:
# DBSCAN can need O(n²) work/memory in the worst case, so the full dataset may not fit.
sample_size = 100000  # Upper bound on sampled rows (adjust based on your memory)
n_sampled = min(sample_size, len(twitter))  # never request more rows than exist
print(f"\n⚠️  Using sample of {n_sampled} samples for DBSCAN")
print("(Running on full dataset would likely crash due to memory constraints)")

# Fixed random_state keeps the drawn sample reproducible between runs.
twitter_sample = twitter.sample(n=n_sampled, random_state=42)
twitter_sample_idx = twitter_sample.index
twitter_features_sample = twitter_features_scaled.loc[twitter_sample_idx]

# Apply DBSCAN clustering
# Using L2 (Euclidean) distance on scaled features
twitter_dbscan = DBSCAN(
    eps=0.15,             # Maximum distance between samples
    min_samples=20,       # Minimum samples in a neighborhood
    metric='euclidean',   # Use L2 (Euclidean) distance
    n_jobs=-1             # Use all CPU cores
)

print(f"\nRunning DBSCAN on {len(twitter_features_sample)} samples...")
print(f"Parameters: eps={twitter_dbscan.eps}, min_samples={twitter_dbscan.min_samples}")

# Fit and predict clusters on sample; labels come back in the row order of
# twitter_features_sample, i.e. the order of twitter_sample_idx.
twitter_clusters_sample = twitter_dbscan.fit_predict(twitter_features_sample)

# Initialize full dataset clusters as noise (-1)
twitter_clusters = np.full(len(twitter), -1)
# twitter_sample_idx holds index *labels*; translate them to row *positions*
# before indexing the NumPy array so this stays correct even when the
# DataFrame index is not the default 0..n-1 range.
sample_positions = twitter.index.get_indexer(twitter_sample_idx)
twitter_clusters[sample_positions] = twitter_clusters_sample

# Add cluster assignments to the dataframe
twitter['dbscan_cluster'] = twitter_clusters

# Summarise the sample labels once and reuse the counts for every statistic below.
unique, counts = np.unique(twitter_clusters_sample, return_counts=True)
n_clusters_found = int((unique != -1).sum())        # the noise label is not a cluster
noise_in_sample = int(counts[unique == -1].sum())   # 0 when no point is labelled noise
noise_share = noise_in_sample / len(twitter_clusters_sample) * 100

# Display results
print("\n✅ DBSCAN completed!")
print("\nTwitter DBSCAN Results (L2/Euclidean Distance):")
print(f"  Sampled: {len(twitter_features_sample)} samples")
print(f"  Clusters found: {n_clusters_found}")
print(f"  Noise points in sample: {noise_in_sample} ({noise_share:.2f}%)")
print(f"  Unsampled points (marked as noise): {len(twitter) - len(twitter_features_sample)}")
print("\n  Cluster distribution (sample only - top 10 largest clusters):")
sorted_indices = np.argsort(counts)[::-1][:10]  # positions of the 10 largest label groups
for idx in sorted_indices:
    cluster, count = unique[idx], counts[idx]
    if cluster == -1:
        print(f"    Noise (-1): {count} samples")
    else:
        print(f"    Cluster {cluster}: {count} samples")

## Twitter Dataset - Compare with Ground Truth Labels

In [None]:
# Score the DBSCAN labels against the ground-truth timezone column.
# Only the sampled rows were clustered, so only those rows are evaluated.
actual_timezones_sample = twitter.loc[twitter_sample_idx, 'timezone']

# External clustering metrics, all computed on the sample only.
ari_twitter = adjusted_rand_score(actual_timezones_sample, twitter_clusters_sample)
nmi_twitter = normalized_mutual_info_score(actual_timezones_sample, twitter_clusters_sample)
homogeneity_tw, completeness_tw, v_measure_tw = homogeneity_completeness_v_measure(
    actual_timezones_sample, twitter_clusters_sample
)

print("Twitter Dataset - Clustering Evaluation (on sampled data):")
print(f"  Sample size: {len(twitter_clusters_sample)} points")

# (heading, value, interpretation hint); a leading "\n" in the heading
# reproduces the blank line that separates the metric groups.
metric_report = [
    ("\n  Adjusted Rand Index (ARI)", ari_twitter,
     "(1.0 = perfect match, 0.0 = random, negative = worse than random)"),
    ("\n  Normalized Mutual Information (NMI)", nmi_twitter,
     "(1.0 = perfect match, 0.0 = no mutual information)"),
    ("\n  Homogeneity", homogeneity_tw,
     "(1.0 = each cluster contains only members of a single class)"),
    ("  Completeness", completeness_tw,
     "(1.0 = all members of a given class are in the same cluster)"),
    ("  V-Measure", v_measure_tw,
     "(harmonic mean of homogeneity and completeness)"),
]
for heading, score, hint in metric_report:
    print(f"{heading}: {score:.4f}")
    print(f"    {hint}")

# Compare the number of ground-truth classes with the number of clusters found;
# the noise label (-1) does not count as a cluster.
distinct_labels = set(twitter_clusters_sample)
n_dbscan_clusters = len(distinct_labels) - (1 if -1 in distinct_labels else 0)
print(f"\n  Number of actual timezone classes in sample: {actual_timezones_sample.nunique()}")
print(f"  Number of DBSCAN clusters found: {n_dbscan_clusters}")

# Show the first 20 sampled rows side by side as example mappings.
print("\n  Sample cluster assignments vs actual timezones:")
mapping_columns = ['timezone', 'dbscan_cluster']
sample_df = twitter.loc[twitter_sample_idx, mapping_columns].head(20)
print(sample_df.to_string())

In [None]:
# NOTE(review): this cell repeats the evaluation cell directly above it
# statement for statement; consider deleting one of the two.

# Score the DBSCAN labels against the ground-truth timezone column.
# Only the sampled rows were clustered, so only those rows are evaluated.
actual_timezones_sample = twitter.loc[twitter_sample_idx, 'timezone']

# External clustering metrics, all computed on the sample only.
ari_twitter = adjusted_rand_score(actual_timezones_sample, twitter_clusters_sample)
nmi_twitter = normalized_mutual_info_score(actual_timezones_sample, twitter_clusters_sample)
homogeneity_tw, completeness_tw, v_measure_tw = homogeneity_completeness_v_measure(
    actual_timezones_sample, twitter_clusters_sample
)

print("Twitter Dataset - Clustering Evaluation (on sampled data):")
print(f"  Sample size: {len(twitter_clusters_sample)} points")

# (heading, value, interpretation hint); a leading "\n" in the heading
# reproduces the blank line that separates the metric groups.
metric_report = [
    ("\n  Adjusted Rand Index (ARI)", ari_twitter,
     "(1.0 = perfect match, 0.0 = random, negative = worse than random)"),
    ("\n  Normalized Mutual Information (NMI)", nmi_twitter,
     "(1.0 = perfect match, 0.0 = no mutual information)"),
    ("\n  Homogeneity", homogeneity_tw,
     "(1.0 = each cluster contains only members of a single class)"),
    ("  Completeness", completeness_tw,
     "(1.0 = all members of a given class are in the same cluster)"),
    ("  V-Measure", v_measure_tw,
     "(harmonic mean of homogeneity and completeness)"),
]
for heading, score, hint in metric_report:
    print(f"{heading}: {score:.4f}")
    print(f"    {hint}")

# Compare the number of ground-truth classes with the number of clusters found;
# the noise label (-1) does not count as a cluster.
distinct_labels = set(twitter_clusters_sample)
n_dbscan_clusters = len(distinct_labels) - (1 if -1 in distinct_labels else 0)
print(f"\n  Number of actual timezone classes in sample: {actual_timezones_sample.nunique()}")
print(f"  Number of DBSCAN clusters found: {n_dbscan_clusters}")

# Show the first 20 sampled rows side by side as example mappings.
print("\n  Sample cluster assignments vs actual timezones:")
mapping_columns = ['timezone', 'dbscan_cluster']
sample_df = twitter.loc[twitter_sample_idx, mapping_columns].head(20)
print(sample_df.to_string())

In [None]:
# Analyze cluster mapping for Twitter (on sampled data only)
twitter_sample_data = twitter.loc[twitter_sample_idx]

print("Cluster Mapping: DBSCAN Cluster → Ground Truth Timezones (Sampled Data)")
print("=" * 80)

# Pick the 10 largest real clusters (noise label -1 excluded).
# value_counts() orders labels by descending size, so head(10) really is the
# "top 10"; slicing a set, as before, returned 10 clusters in arbitrary order.
sampled_labels = twitter_sample_data['dbscan_cluster']
cluster_sizes = sampled_labels[sampled_labels != -1].value_counts()
unique_twitter_clusters = cluster_sizes.head(10).index.tolist()

for dbscan_cluster in unique_twitter_clusters:
    # Get all samples in this DBSCAN cluster
    cluster_data = twitter_sample_data[sampled_labels == dbscan_cluster]

    # Count ground truth timezones in this cluster (most frequent first)
    timezone_counts = cluster_data['timezone'].value_counts()
    total_in_cluster = len(cluster_data)

    print(f"\nDBSCAN Cluster {dbscan_cluster} ({total_in_cluster} samples):")
    print("  Top ground truth timezones:")
    for i, (timezone, count) in enumerate(timezone_counts.head(5).items(), 1):
        percentage = (count / total_in_cluster) * 100
        print(f"    {i}. '{timezone}': {count} ({percentage:.1f}%)")

    # Purity: percentage of most common timezone.
    # Guarded because value_counts() is empty when the cluster has no
    # non-missing timezone values.
    if len(timezone_counts) > 0:
        purity = (timezone_counts.iloc[0] / total_in_cluster) * 100
        print(f"  Purity: {purity:.1f}% (dominated by '{timezone_counts.index[0]}')")

# Overall cluster purity (calculate_purity is defined elsewhere in the notebook)
purity_twitter = calculate_purity(actual_timezones_sample, twitter_clusters_sample)
print(f"\n{'=' * 80}")
print(f"Overall Cluster Purity: {purity_twitter:.4f}")
print("  (Higher is better, 1.0 means each cluster is pure)")

# Show noise cluster statistics
noise_count_twitter = (twitter_clusters_sample == -1).sum()
if noise_count_twitter > 0:
    print(f"\nNoise Points (Cluster -1): {noise_count_twitter} samples")
    noise_timezones = twitter_sample_data[sampled_labels == -1]['timezone'].value_counts()
    print("  Top timezones in noise:")
    for timezone, count in noise_timezones.head(5).items():
        print(f"    '{timezone}': {count}")