In [62]:
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import pickle
import os
import time
import pandas as pd
from osfclient.api import OSF

# Set your batch range here manually
start_idx = 280
end_idx = 300  # Example: process URL 0 to 9

# Define dependency files to search for
dependency_files = [
    'renv.lock', 'sessionInfo.txt', 'sessionInfo.RData', '.Rprofile', 'DESCRIPTION',
    'NAMESPACE', 'requirements.txt', 'environment.yml', 'Dockerfile', 'README.md',
    'README.txt', 'Makefile', 'metadata.yml', 'metadata.json', 'dependencies.R', 'dependency.R'
]
dependency_files_set = set(dependency_files)

# Load your OSF metadata
osf_metadata = pd.read_csv("StatCodeSearch-Code-Comment.csv")
source_url_list = list(osf_metadata.Source.dropna().unique())
batch_urls = source_url_list[start_idx:end_idx]

# Initialize OSF client and cache
osf = OSF()
file_cache = {}

# Search function
def search_dependency_files(storage, dependency_files_set):
    unique_files = []
    try:
        for file in storage.files:
            if file.name in dependency_files_set:
                unique_files.append(file.name)
                if len(unique_files) == len(dependency_files_set):
                    return unique_files
    except Exception as e:
        print(f"Error accessing storage: {e}")
    return unique_files

# Fetch files for one project
def fetch_unique_files(url):
    project_id = url.strip('/').split('/')[-1]
    retries = 5
    delay = 3

    for attempt in range(retries):
        try:
            if project_id in file_cache:
                return url, project_id, file_cache[project_id]
            
            project = osf.project(project_id)
            storage = project.storage('osfstorage')
            unique_files = search_dependency_files(storage, dependency_files_set)

            file_cache[project_id] = unique_files
            return url, project_id, unique_files

        except Exception as e:
            if "429" in str(e) and attempt < retries - 1:
                print(f"⚠️ Rate limited on project {project_id}, retrying in {delay} seconds...")
                time.sleep(delay)
                delay *= 2
            else:
                print(f"❌ Error processing project {project_id}: {e}")
                return url, project_id, []

# Run the batch
all_unique_files = {}
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = {executor.submit(fetch_unique_files, url): url for url in batch_urls}
    for future in tqdm(as_completed(futures), total=len(futures), mininterval=0.5):
        try:
            url, project_id, unique_files = future.result()
            all_unique_files[url] = {
                "project_id": project_id,
                "files": unique_files
            }
        except Exception as e:
            print(f"Error retrieving result: {e}")

# Print result
print("\n🗂 Unique dependency files found in current batch:")
for url, details in all_unique_files.items():
    print(f"\n🔗 Project URL: {url}")
    print(f"📁 Project ID: {details['project_id']}")
    print("📄 Files Found:", details["files"])


100%|███████████████████████████████████████████| 16/16 [00:23<00:00,  1.48s/it]


🗂 Unique dependency files found in current batch:

🔗 Project URL: https://osf.io/zhf98/
📁 Project ID: zhf98
📄 Files Found: []

🔗 Project URL: https://osf.io/dkq3f/
📁 Project ID: dkq3f
📄 Files Found: ['README.txt']

🔗 Project URL: https://osf.io/w97h4/
📁 Project ID: w97h4
📄 Files Found: []

🔗 Project URL: https://osf.io/7z3mk/
📁 Project ID: 7z3mk
📄 Files Found: []

🔗 Project URL: https://osf.io/ajf3h/
📁 Project ID: ajf3h
📄 Files Found: []

🔗 Project URL: https://osf.io/9mc84/
📁 Project ID: 9mc84
📄 Files Found: []

🔗 Project URL: https://osf.io/5xdbu/
📁 Project ID: 5xdbu
📄 Files Found: []

🔗 Project URL: https://osf.io/p2xgq/
📁 Project ID: p2xgq
📄 Files Found: []

🔗 Project URL: https://osf.io/n7sep/
📁 Project ID: n7sep
📄 Files Found: []

🔗 Project URL: https://osf.io/kgtx6/
📁 Project ID: kgtx6
📄 Files Found: []

🔗 Project URL: https://osf.io/drv3a/
📁 Project ID: drv3a
📄 Files Found: []

🔗 Project URL: https://osf.io/b4gc7/
📁 Project ID: b4gc7
📄 Files Found: []

🔗 Project URL: https://o




In [63]:
import csv

output_file = "osf_dependency_result.csv"

# Check if the file exists to avoid writing the header again
write_header = not os.path.exists(output_file)

with open(output_file, mode='a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    
    if write_header:
        writer.writerow(["Project URL", "Project ID", "Files Found"])

    for url, details in all_unique_files.items():
        project_id = details.get("project_id", "N/A")
        files = ", ".join(details.get("files", []))
        writer.writerow([url, project_id, files])


In [66]:
import pandas as pd
from collections import Counter

# List of dependencies
dependency_files = [
    'renv.lock', 'sessionInfo.txt', 'sessionInfo.RData', '.Rprofile', 'DESCRIPTION',
    'NAMESPACE', 'requirements.txt', 'environment.yml', 'Dockerfile', 'README.md',
    'README.txt', 'Makefile', 'metadata.yml', 'metadata.json', 'dependencies.R', 'dependency.R'
]

# Load CSV file
df = pd.read_csv("osf_dependency_result.csv")  # Replace with your actual file path

# Extract and split all found files
all_found_files = []

for entry in df["Files Found"].dropna():
    files = [f.strip() for f in entry.split(",")]
    all_found_files.extend(files)

# Count only those in dependency_files
filtered_counts = Counter(f for f in all_found_files if f in dependency_files)

# Display results
print("📊 Unique Dependency File Counts:")
for dep_file, count in filtered_counts.items():
    print(f"{dep_file}: {count}")


📊 Unique Dependency File Counts:
README.txt: 17
README.md: 13
DESCRIPTION: 2
NAMESPACE: 2
Makefile: 2
Dockerfile: 1
sessionInfo.txt: 1


In [65]:
import pandas as pd
from collections import defaultdict

# List of known dependency files
dependency_files = [
    'renv.lock', 'sessionInfo.txt', 'sessionInfo.RData', '.Rprofile', 'DESCRIPTION',
    'NAMESPACE', 'requirements.txt', 'environment.yml', 'Dockerfile', 'README.md',
    'README.txt', 'Makefile', 'metadata.yml', 'metadata.json', 'dependencies.R', 'dependency.R'
]

# Load your CSV
df = pd.read_csv("osf_dependency_result.csv")  # Replace with your actual file name

# Prepare storage for file-to-project mapping
file_to_projects = defaultdict(list)

# Process each row
for _, row in df.iterrows():
    project_id = row["Project ID"]
    files = str(row["Files Found"]).split(",") if pd.notna(row["Files Found"]) else []

    for f in files:
        f = f.strip()
        if f in dependency_files:
            file_to_projects[f].append(project_id)

# Print results
print("📦 Dependency File Usage Across Projects:\n")
for f in sorted(file_to_projects.keys()):
    projects = file_to_projects[f]
    print(f"{f} ({len(projects)} projects): {', '.join(projects)}")


📦 Dependency File Usage Across Projects:

DESCRIPTION (2 projects): w7pjy, vguey
Dockerfile (1 projects): 3uyjt
Makefile (2 projects): fb5tw, 3uyjt
NAMESPACE (2 projects): w7pjy, vguey
README.md (13 projects): 7h94n, emwgp, 5y27d, nd9yr, 2j47e, fb5tw, 7mey8, uygpq, 3fnjq, zh3f4, f6qsk, vguey, csy8q
README.txt (17 projects): 6ukwg, wbyj7, dqjyh, zcv4m, k853j, 9vr6q, rmcuy, 3wy58, 67ncp, 67ncp, cxv5k, 9jxzs, ex9fj, zh3f4, c8vfj, dez9b, dkq3f
sessionInfo.txt (1 projects): cqsr8


In [15]:
import pandas as pd

# Load your metadata file with source URLs
osf_metadata = pd.read_csv("StatCodeSearch-Code-Comment.csv")
source_url_list = list(osf_metadata["Source"].dropna().unique())

# Load the results file
results_df = pd.read_csv("osf_dependency_results.csv")  # Replace with your actual file path

# Filter the results based on matching URLs from source_url_list
filtered_results = results_df[results_df["Project URL"].isin(source_url_list)].copy()

# Save the filtered results to a new CSV
filtered_results.to_csv("filtered_dependency_results.csv", index=False)

print("✅ Filtered results saved to 'filtered_dependency_results.csv'")


✅ Filtered results saved to 'filtered_dependency_results.csv'


In [16]:
import pandas as pd

# Load main metadata file
osf_metadata = pd.read_csv("StatCodeSearch-Code-Comment.csv")
source_url_list = set(osf_metadata["Source"].dropna().unique())

# Load results file
results_df = pd.read_csv("osf_dependency_results.csv")  
result_urls = set(results_df["Project URL"].dropna().unique())

# Find URLs present in main file but missing in result file
missing_urls = list(source_url_list - result_urls)

# Show 5 examples
print("🔍 Sample of 5 missing Project URLs:")
print(missing_urls[:5])


🔍 Sample of 5 missing Project URLs:
['https://osf.io/qxf5t/', 'https://osf.io/m6pb2/', 'https://osf.io/wxgzu/', 'https://osf.io/u3wby/', 'https://osf.io/3w8eg/']
