**Install Dependencies**

In [None]:
# Install required packages
!pip install PyWavelets huggingface_hub scikit-image pandas tenacity numpy scipy



**Import and Setup**

In [None]:
#CORE IMPORTS
import os
import tarfile
import io
import pandas as pd
import numpy as np
import shutil
import logging
from skimage import io as skio
from skimage import color, measure, feature
from skimage.util import random_noise, img_as_float
from scipy import ndimage as ndi
from skimage.restoration import estimate_sigma
from huggingface_hub import HfApi, hf_hub_download
from tenacity import retry, stop_after_attempt, wait_exponential

#GOOGLE DRIVE SETUP
# Mounts Google Drive at /content/drive; the output folders configured below
# live under that mount point.
from google.colab import drive
drive.mount('/content/drive')  # Connect to Google Drive

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Configuration**

In [None]:
#PATHS & SETTINGS
OUTPUT_DIR = "/content/drive/MyDrive/Deepfake"  # Where to save results
os.makedirs(OUTPUT_DIR, exist_ok=True)  # Create folder if missing
IMAGE_SAVE_DIR = os.path.join(OUTPUT_DIR, "organized_images")  # Extracted images, one subfolder per category
CATEGORIES = ['fake_train', 'fake_test', 'real_train', 'real_test']

#DATASET CONFIG
# Read the Hugging Face token from the HF_TOKEN environment variable so a real
# token never has to be written into the notebook. The placeholder is kept as
# the fallback value for backward compatibility.
HF_TOKEN = os.environ.get("HF_TOKEN", "replace with your token here")  # Hugging Face access
DATASET_REPO = "xingjunm/WildDeepfake"             # Dataset name

#LOGGING SETUP
# force=True replaces any handlers already attached to the root logger;
# without it basicConfig is a no-op when handlers exist (as is usual inside
# notebook kernels) and INFO messages are never shown.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True
)

In [None]:
# Report where results are going to be written
print(f"Main output will go to: {OUTPUT_DIR}")
print(f"Images will save to: {IMAGE_SAVE_DIR}")

# Ensure each category has its own subfolder and report its path
category_paths = [os.path.join(IMAGE_SAVE_DIR, category) for category in CATEGORIES]
for category_path in category_paths:
    os.makedirs(category_path, exist_ok=True)
    print(f"Created: {category_path}")

Main output will go to: /content/drive/MyDrive/Deepfake
Images will save to: /content/drive/MyDrive/Deepfake/organized_images
Created: /content/drive/MyDrive/Deepfake/organized_images/fake_train
Created: /content/drive/MyDrive/Deepfake/organized_images/fake_test
Created: /content/drive/MyDrive/Deepfake/organized_images/real_train
Created: /content/drive/MyDrive/Deepfake/organized_images/real_test


**Feature Extraction**

In [None]:
def extract_features(img):
    """
    Compute six scalar descriptors for one image.

    Parameters
    ----------
    img : array-like
        Image as returned by ``skimage.io.imread``. RGB input is used as is;
        grayscale (2-D), gray+alpha and RGBA inputs are converted to three
        channels first so that they are not rejected by ``rgb2gray``.

    Returns
    -------
    dict or None
        Dictionary with the keys below, or ``None`` if any step raised
        (the error is logged):

        - ``entropy``   : Shannon entropy of the pixel values.
        - ``wrapped``   : max - min of the phase angle of ``exp(1j * img)``,
          i.e. the spread of the pixel values after wrapping into (-pi, pi].
        - ``noise``     : mean per-channel sigma estimated on a crop of the
          image after synthetic noise (variance 0.08**2) has been added.
        - ``blur``      : mean of the image after box filtering with sizes
          2, 4, ..., 30.
        - ``keypoints`` : number of CENSURE keypoints on the grayscale image.
        - ``blobs``     : number of difference-of-Gaussian blobs
          (``max_sigma=1``, ``threshold=0.1``) on the grayscale image.
    """
    try:
        img = np.asarray(img)

        # rgb2gray only accepts 3-channel input; normalise other PNG layouts
        # instead of failing and silently dropping the image.
        if img.ndim == 2:
            img = color.gray2rgb(img)
        elif img.ndim == 3 and img.shape[-1] == 2:
            img = color.gray2rgb(img[..., 0])  # gray + alpha: drop alpha
        elif img.ndim == 3 and img.shape[-1] == 4:
            img = img[..., :3]                 # RGBA: drop alpha

        gray_img = color.rgb2gray(img)

        # Entropy
        entropy = measure.shannon_entropy(img)

        # Wrapped
        image_wrapped = np.angle(np.exp(1j * img))
        wrapped_range = np.max(image_wrapped) - np.min(image_wrapped)

        # Noise
        float_img = img_as_float(img)
        noise_region = float_img[30:180, 150:300]
        if noise_region.size == 0:
            # The fixed window lies entirely outside small images; use the
            # whole image rather than an empty array.
            noise_region = float_img
        sigma = 0.08
        # NOTE(review): random_noise is unseeded, so this feature is not
        # reproducible between runs, and the estimate includes the noise
        # injected here - confirm this is the intended measurement.
        noisy = random_noise(noise_region, var=sigma**2)
        try:
            sigmas = estimate_sigma(noisy, channel_axis=-1)
        except TypeError:
            # Older scikit-image releases use `multichannel` instead.
            sigmas = estimate_sigma(noisy, multichannel=True)
        sigma_est = np.mean(sigmas)

        # Blur
        blurred_images = [ndi.uniform_filter(img, size=k) for k in range(2, 32, 2)]
        img_stack = np.stack(blurred_images)

        # Keypoints
        detector = feature.CENSURE()
        detector.detect(gray_img)

        # Blobs
        blobs_dog = feature.blob_dog(gray_img, max_sigma=1, threshold=0.1)

        return {
            'entropy': entropy,
            'wrapped': wrapped_range,
            'noise': sigma_est,
            'blur': np.mean(img_stack),
            'keypoints': len(detector.keypoints),
            'blobs': len(blobs_dog)
        }
    except Exception as e:
        logging.error(f"Feature extraction failed: {str(e)}")
        return None

**File Processing**

In [None]:
# Configuration used by the archive-processing code below
IMAGE_SAVE_DIR = os.path.join(OUTPUT_DIR, "organized_images")
CATEGORIES = ['fake_train', 'fake_test', 'real_train', 'real_test']  # All categories

# One subfolder per category (no-op for folders that already exist)
for category_dir in [os.path.join(IMAGE_SAVE_DIR, name) for name in CATEGORIES]:
    os.makedirs(category_dir, exist_ok=True)

def process_tar_file(repo_id, file_path, label):
    """
    Draft version: download one archive and save its PNG members by category.

    This variant only organises images on disk; it never appends to
    ``entries``, so it always returns an empty list. A later cell in this
    notebook defines ``process_tar_file`` again (with feature extraction) and
    replaces this definition once it is run.

    Parameters
    ----------
    repo_id : str
        Hugging Face dataset repository to download from.
    file_path : str
        Path of the archive inside the repository; it must contain one of the
        names in ``CATEGORIES``.
    label : str
        Accepted for signature parity; not used by this draft.

    Returns
    -------
    list
        Always empty in this draft.
    """
    entries = []
    try:
        # The category depends only on the archive path, so resolve it once
        # and fail loudly instead of silently filing unknown paths under
        # 'real_test'.
        category = next((cat for cat in CATEGORIES if cat in file_path), None)
        if category is None:
            raise ValueError(f"Could not determine category for {file_path}")

        local_path = hf_hub_download(
            repo_id=repo_id,
            filename=file_path,
            token=HF_TOKEN,
            repo_type="dataset",
            cache_dir="tmp_downloads"
        )

        try:
            with tarfile.open(local_path, "r:*") as tar:
                for member in tar.getmembers():
                    if not (member.isfile() and member.name.lower().endswith('.png')):
                        continue
                    try:
                        # Extract image
                        f = tar.extractfile(member)
                        if f is None:
                            continue
                        img_data = f.read()

                        # Organized saving
                        img_name = os.path.basename(member.name)
                        img_save_path = os.path.join(IMAGE_SAVE_DIR, category, img_name)

                        with open(img_save_path, 'wb') as img_file:
                            img_file.write(img_data)

                        # Rest of processing...

                    except Exception as e:
                        logging.error(f"Error processing {member.name}: {str(e)}")
        finally:
            # Remove the downloaded archive even when reading it failed.
            os.remove(local_path)
    except Exception as e:
        logging.error(f"Failed to process {file_path}: {str(e)}")

    return entries

In [None]:
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10), reraise=True)
def _download_archive(repo_id, file_path):
    """Download one archive from the dataset repo, retrying up to three times."""
    return hf_hub_download(
        repo_id=repo_id,
        filename=file_path,
        token=HF_TOKEN,
        repo_type="dataset",
        cache_dir="tmp_downloads"
    )


def _remove_download(local_path):
    """Delete a downloaded archive, including the file a cache symlink points to."""
    # The path returned by the hub cache can be a symlink to the stored blob;
    # removing only the link would leave the archive data on disk.
    for path in {local_path, os.path.realpath(local_path)}:
        try:
            if os.path.lexists(path):
                os.remove(path)
        except OSError as e:
            logging.warning(f"Could not remove temp file {path}: {str(e)}")


def _unique_image_name(file_path, member_name):
    """Build a flat file name that stays unique across archives and subfolders."""
    archive_stem = os.path.basename(file_path)
    for suffix in ('.tar.gz', '.tgz', '.tar'):
        if archive_stem.endswith(suffix):
            archive_stem = archive_stem[:-len(suffix)]
            break
    # Flatten the member's path instead of discarding it, so equally named
    # frames from different folders or archives do not overwrite each other.
    flat_member = member_name.replace('\\', '/').lstrip('./').replace('/', '_')
    return f"{archive_stem}_{flat_member}"


def process_tar_file(repo_id, file_path, label):
    """
    Download one archive, save its PNG images and extract their features.

    Steps:
    - resolve the category from ``file_path`` (one of ``CATEGORIES``)
    - download the archive (the download itself is retried on failure)
    - for every ``.png`` member: decode it, save the raw bytes under
      ``IMAGE_SAVE_DIR/<category>/`` and run ``extract_features``
    - always remove the downloaded archive afterwards

    Errors are logged rather than raised: a failing image is skipped, and a
    failing archive yields whatever entries were collected before the error.

    Parameters
    ----------
    repo_id : str
        Hugging Face dataset repository to download from.
    file_path : str
        Path of the archive inside the repository.
    label : str
        Value stored in the ``Final_Label`` field of every entry.

    Returns
    -------
    list of dict
        One dictionary per successfully processed image with the keys
        ``Final_Entropy``, ``Final_Wrapped``, ``Final_Noise``, ``Final_Blur``,
        ``Final_Keypoints``, ``Final_Blobs``, ``Final_Label`` and
        ``Image_Path``.
    """
    entries = []
    local_path = None
    try:
        # Determine category (fake_train/fake_test/real_train/real_test)
        # before downloading, so an unrecognised path costs no bandwidth.
        category = next((cat for cat in CATEGORIES if cat in file_path), None)
        if not category:
            raise ValueError(f"Could not determine category for {file_path}")

        # Download file
        local_path = _download_archive(repo_id, file_path)

        save_dir = os.path.join(IMAGE_SAVE_DIR, category)
        os.makedirs(save_dir, exist_ok=True)

        # Process archive
        with tarfile.open(local_path, "r:*") as tar:
            for member in tar.getmembers():
                if not (member.isfile() and member.name.lower().endswith('.png')):
                    continue
                try:
                    # Extract image
                    f = tar.extractfile(member)
                    if f is None:
                        continue
                    img_data = f.read()
                    img = skio.imread(io.BytesIO(img_data))

                    # Save the original bytes to Google Drive
                    save_path = os.path.join(
                        save_dir, _unique_image_name(file_path, member.name)
                    )
                    with open(save_path, 'wb') as img_file:
                        img_file.write(img_data)

                    # Extract features
                    features = extract_features(img)

                    if features:
                        entries.append({
                            'Final_Entropy': features['entropy'],
                            'Final_Wrapped': features['wrapped'],
                            'Final_Noise': features['noise'],
                            'Final_Blur': features['blur'],
                            'Final_Keypoints': features['keypoints'],
                            'Final_Blobs': features['blobs'],
                            'Final_Label': label,
                            'Image_Path': save_path  # Google Drive path
                        })

                except Exception as e:
                    logging.error(f"Error processing {member.name}: {str(e)}")

        logging.info(f"Processed {len(entries)} images (saved to {category}/)")

    except Exception as e:
        logging.error(f"Failed to process {file_path}: {str(e)}")
    finally:
        # Cleanup runs on success, on failure and on interruption.
        if local_path is not None:
            _remove_download(local_path)

    return entries

**Dataset Structure**

In [None]:
def get_dataset_structure():
    """
    List the repository's .tar.gz archives grouped by category.

    Every name in CATEGORIES becomes a key of the result. An archive is
    assigned to the first category whose 'deepfake_in_the_wild/<category>/'
    folder appears in its path; archives matching no category are ignored.

    Returns: dict mapping category -> list of (file path, label) tuples,
    where label is 'fake' if the category name contains 'fake', else 'real'.
    """
    hub = HfApi(token=HF_TOKEN)
    repo_files = hub.list_repo_files(repo_id=DATASET_REPO, repo_type="dataset")

    file_map = {}
    for name in CATEGORIES:
        file_map[name] = []

    archives = (path for path in repo_files if path.endswith('.tar.gz'))
    for path in archives:
        matched = next(
            (name for name in CATEGORIES if f'deepfake_in_the_wild/{name}/' in path),
            None,
        )
        if matched is None:
            continue
        label = 'real' if 'fake' not in matched else 'fake'
        file_map[matched].append((path, label))

    return file_map

**Main Execution**

In [9]:
def main(checkpoint_every=10):
    """
    Orchestrates the entire workflow:
    1. Cleans temp files
    2. Gets file list
    3. Processes archives one by one
    4. Saves progress checkpoints
    5. Outputs final CSV

    Parameters
    ----------
    checkpoint_every : int, optional
        Write the checkpoint CSV after this many processed archives
        (values below 1 are treated as 1). Default: 10.

    Files written (both inside OUTPUT_DIR):
    - processing_checkpoint.csv : all rows collected so far; also written when
      the run is interrupted or fails, so partial results are not lost.
    - final_processed_dataset.csv : written once every archive was processed.
    """
    if os.path.exists("tmp_downloads"):
        shutil.rmtree("tmp_downloads")

    # Get dataset structure
    file_map = get_dataset_structure()

    final_path = os.path.join(OUTPUT_DIR, "final_processed_dataset.csv")
    checkpoint_path = os.path.join(OUTPUT_DIR, "processing_checkpoint.csv")
    checkpoint_every = max(1, int(checkpoint_every))

    # Rows are collected in a plain list and turned into a DataFrame only when
    # saving; growing a DataFrame with concat inside the loop is quadratic.
    records = []
    archives_done = 0
    unsaved = False  # True while records holds rows missing from the checkpoint

    try:
        # Process each category
        for category, files in file_map.items():
            logging.info(f"Processing {len(files)} files in {category}")

            for idx, (file_path, label) in enumerate(files):
                logging.info(f"Processing file {idx+1}/{len(files)}: {file_path}")
                entries = process_tar_file(DATASET_REPO, file_path, label)
                archives_done += 1

                if entries:
                    records.extend(entries)
                    unsaved = True

                # Save incremental progress. The trigger counts archives: a
                # test on the row count being an exact multiple of 1000 would
                # almost never fire, since each archive adds an arbitrary
                # number of rows.
                if unsaved and archives_done % checkpoint_every == 0:
                    pd.DataFrame(records).to_csv(checkpoint_path, index=False)
                    unsaved = False
    finally:
        # Also reached on KeyboardInterrupt or an unexpected error.
        if unsaved:
            pd.DataFrame(records).to_csv(checkpoint_path, index=False)
            logging.info(f"Checkpoint with {len(records)} rows saved to {checkpoint_path}")

    # Final save
    full_df = pd.DataFrame(records)
    full_df.to_csv(final_path, index=False)

# Start the pipeline only when this code runs as the top-level program
# (which includes executing the cell in a notebook), not when imported.
if __name__ == "__main__":
    main()

1.tar.gz:   0%|          | 0.00/112M [00:00<?, ?B/s]

10.tar.gz:   0%|          | 0.00/8.95M [00:00<?, ?B/s]

100.tar.gz:   0%|          | 0.00/23.6M [00:00<?, ?B/s]

101.tar.gz:   0%|          | 0.00/11.8M [00:00<?, ?B/s]

102.tar.gz:   0%|          | 0.00/95.7M [00:00<?, ?B/s]

103.tar.gz:   0%|          | 0.00/51.4M [00:00<?, ?B/s]

104.tar.gz:   0%|          | 0.00/31.7M [00:00<?, ?B/s]

105.tar.gz:   0%|          | 0.00/91.6M [00:00<?, ?B/s]

106.tar.gz:   0%|          | 0.00/22.0M [00:00<?, ?B/s]

107.tar.gz:   0%|          | 0.00/130M [00:00<?, ?B/s]

108.tar.gz:   0%|          | 0.00/48.4M [00:00<?, ?B/s]

109.tar.gz:   0%|          | 0.00/91.1M [00:00<?, ?B/s]

11.tar.gz:   0%|          | 0.00/18.7M [00:00<?, ?B/s]

110.tar.gz:   0%|          | 0.00/24.9M [00:00<?, ?B/s]

111.tar.gz:   0%|          | 0.00/42.1M [00:00<?, ?B/s]

112.tar.gz:   0%|          | 0.00/56.9M [00:00<?, ?B/s]

113.tar.gz:   0%|          | 0.00/39.1M [00:00<?, ?B/s]

114.tar.gz:   0%|          | 0.00/38.9M [00:00<?, ?B/s]

115.tar.gz:   0%|          | 0.00/228M [00:00<?, ?B/s]

116.tar.gz:   0%|          | 0.00/43.1M [00:00<?, ?B/s]

117.tar.gz:   0%|          | 0.00/60.5M [00:00<?, ?B/s]

118.tar.gz:   0%|          | 0.00/6.84M [00:00<?, ?B/s]

119.tar.gz:   0%|          | 0.00/106M [00:00<?, ?B/s]

12.tar.gz:   0%|          | 0.00/56.2M [00:00<?, ?B/s]

120.tar.gz:   0%|          | 0.00/62.2M [00:00<?, ?B/s]

121.tar.gz:   0%|          | 0.00/24.7M [00:00<?, ?B/s]

122.tar.gz:   0%|          | 0.00/45.7M [00:00<?, ?B/s]

123.tar.gz:   0%|          | 0.00/57.1M [00:00<?, ?B/s]

124.tar.gz:   0%|          | 0.00/3.67M [00:00<?, ?B/s]

125.tar.gz:   0%|          | 0.00/79.1M [00:00<?, ?B/s]

126.tar.gz:   0%|          | 0.00/25.0M [00:00<?, ?B/s]

127.tar.gz:   0%|          | 0.00/67.2M [00:00<?, ?B/s]

128.tar.gz:   0%|          | 0.00/21.8M [00:00<?, ?B/s]

129.tar.gz:   0%|          | 0.00/26.8M [00:00<?, ?B/s]

13.tar.gz:   0%|          | 0.00/31.2M [00:00<?, ?B/s]

130.tar.gz:   0%|          | 0.00/33.1M [00:00<?, ?B/s]

131.tar.gz:   0%|          | 0.00/160M [00:00<?, ?B/s]

132.tar.gz:   0%|          | 0.00/12.7M [00:00<?, ?B/s]

133.tar.gz:   0%|          | 0.00/18.8M [00:00<?, ?B/s]

134.tar.gz:   0%|          | 0.00/52.2M [00:00<?, ?B/s]

135.tar.gz:   0%|          | 0.00/112M [00:00<?, ?B/s]

136.tar.gz:   0%|          | 0.00/28.4M [00:00<?, ?B/s]

137.tar.gz:   0%|          | 0.00/3.87M [00:00<?, ?B/s]

138.tar.gz:   0%|          | 0.00/97.0M [00:00<?, ?B/s]

139.tar.gz:   0%|          | 0.00/29.5M [00:00<?, ?B/s]

14.tar.gz:   0%|          | 0.00/74.8M [00:00<?, ?B/s]

140.tar.gz:   0%|          | 0.00/97.4M [00:00<?, ?B/s]

141.tar.gz:   0%|          | 0.00/46.7M [00:00<?, ?B/s]

142.tar.gz:   0%|          | 0.00/299M [00:00<?, ?B/s]

143.tar.gz:   0%|          | 0.00/14.9M [00:00<?, ?B/s]

144.tar.gz:   0%|          | 0.00/77.3M [00:00<?, ?B/s]

145.tar.gz:   0%|          | 0.00/12.7M [00:00<?, ?B/s]

146.tar.gz:   0%|          | 0.00/53.2M [00:00<?, ?B/s]

147.tar.gz:   0%|          | 0.00/67.9M [00:00<?, ?B/s]

148.tar.gz:   0%|          | 0.00/31.4M [00:00<?, ?B/s]

149.tar.gz:   0%|          | 0.00/16.5M [00:00<?, ?B/s]

15.tar.gz:   0%|          | 0.00/302M [00:00<?, ?B/s]

150.tar.gz:   0%|          | 0.00/31.6M [00:00<?, ?B/s]

151.tar.gz:   0%|          | 0.00/91.1M [00:00<?, ?B/s]

152.tar.gz:   0%|          | 0.00/62.2M [00:00<?, ?B/s]

153.tar.gz:   0%|          | 0.00/15.9M [00:00<?, ?B/s]

154.tar.gz:   0%|          | 0.00/104M [00:00<?, ?B/s]

155.tar.gz:   0%|          | 0.00/24.3M [00:00<?, ?B/s]

156.tar.gz:   0%|          | 0.00/20.4M [00:00<?, ?B/s]

157.tar.gz:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

158.tar.gz:   0%|          | 0.00/79.3M [00:00<?, ?B/s]

159.tar.gz:   0%|          | 0.00/87.4M [00:00<?, ?B/s]

16.tar.gz:   0%|          | 0.00/105M [00:00<?, ?B/s]

160.tar.gz:   0%|          | 0.00/59.0M [00:00<?, ?B/s]

161.tar.gz:   0%|          | 0.00/19.7M [00:00<?, ?B/s]

162.tar.gz:   0%|          | 0.00/37.3M [00:00<?, ?B/s]

163.tar.gz:   0%|          | 0.00/81.6M [00:00<?, ?B/s]

164.tar.gz:   0%|          | 0.00/54.6M [00:00<?, ?B/s]

165.tar.gz:   0%|          | 0.00/87.2M [00:00<?, ?B/s]

166.tar.gz:   0%|          | 0.00/31.9M [00:00<?, ?B/s]

167.tar.gz:   0%|          | 0.00/21.4M [00:00<?, ?B/s]

168.tar.gz:   0%|          | 0.00/32.3M [00:00<?, ?B/s]

169.tar.gz:   0%|          | 0.00/281M [00:00<?, ?B/s]

17.tar.gz:   0%|          | 0.00/37.4M [00:00<?, ?B/s]

170.tar.gz:   0%|          | 0.00/9.13M [00:00<?, ?B/s]

171.tar.gz:   0%|          | 0.00/22.2M [00:00<?, ?B/s]

172.tar.gz:   0%|          | 0.00/32.5M [00:00<?, ?B/s]

173.tar.gz:   0%|          | 0.00/117M [00:00<?, ?B/s]

174.tar.gz:   0%|          | 0.00/34.0M [00:00<?, ?B/s]

175.tar.gz:   0%|          | 0.00/59.6M [00:00<?, ?B/s]

176.tar.gz:   0%|          | 0.00/108M [00:00<?, ?B/s]

177.tar.gz:   0%|          | 0.00/54.9M [00:00<?, ?B/s]

178.tar.gz:   0%|          | 0.00/61.5M [00:00<?, ?B/s]

179.tar.gz:   0%|          | 0.00/231M [00:00<?, ?B/s]

18.tar.gz:   0%|          | 0.00/33.0M [00:00<?, ?B/s]

180.tar.gz:   0%|          | 0.00/23.3M [00:00<?, ?B/s]

181.tar.gz:   0%|          | 0.00/120M [00:00<?, ?B/s]

182.tar.gz:   0%|          | 0.00/78.6M [00:00<?, ?B/s]

183.tar.gz:   0%|          | 0.00/58.8M [00:00<?, ?B/s]

184.tar.gz:   0%|          | 0.00/31.8M [00:00<?, ?B/s]

185.tar.gz:   0%|          | 0.00/272M [00:00<?, ?B/s]

186.tar.gz:   0%|          | 0.00/39.2M [00:00<?, ?B/s]

187.tar.gz:   0%|          | 0.00/16.2M [00:00<?, ?B/s]

188.tar.gz:   0%|          | 0.00/46.6M [00:00<?, ?B/s]

189.tar.gz:   0%|          | 0.00/54.7M [00:00<?, ?B/s]

19.tar.gz:   0%|          | 0.00/25.6M [00:00<?, ?B/s]

190.tar.gz:   0%|          | 0.00/522M [00:00<?, ?B/s]

191.tar.gz:   0%|          | 0.00/12.0M [00:00<?, ?B/s]

192.tar.gz:   0%|          | 0.00/274M [00:00<?, ?B/s]

193.tar.gz:   0%|          | 0.00/176M [00:00<?, ?B/s]

194.tar.gz:   0%|          | 0.00/94.9M [00:00<?, ?B/s]

195.tar.gz:   0%|          | 0.00/10.2M [00:00<?, ?B/s]

196.tar.gz:   0%|          | 0.00/66.0M [00:00<?, ?B/s]

197.tar.gz:   0%|          | 0.00/10.6M [00:00<?, ?B/s]

198.tar.gz:   0%|          | 0.00/14.2M [00:00<?, ?B/s]

199.tar.gz:   0%|          | 0.00/35.6M [00:00<?, ?B/s]

2.tar.gz:   0%|          | 0.00/49.6M [00:00<?, ?B/s]

20.tar.gz:   0%|          | 0.00/55.4M [00:00<?, ?B/s]

200.tar.gz:   0%|          | 0.00/73.8M [00:00<?, ?B/s]

201.tar.gz:   0%|          | 0.00/132M [00:00<?, ?B/s]

202.tar.gz:   0%|          | 0.00/15.7M [00:00<?, ?B/s]

203.tar.gz:   0%|          | 0.00/133M [00:00<?, ?B/s]

204.tar.gz:   0%|          | 0.00/108M [00:00<?, ?B/s]

205.tar.gz:   0%|          | 0.00/27.2M [00:00<?, ?B/s]

206.tar.gz:   0%|          | 0.00/16.7M [00:00<?, ?B/s]

207.tar.gz:   0%|          | 0.00/282M [00:00<?, ?B/s]

208.tar.gz:   0%|          | 0.00/25.7M [00:00<?, ?B/s]

209.tar.gz:   0%|          | 0.00/118M [00:00<?, ?B/s]

21.tar.gz:   0%|          | 0.00/29.4M [00:00<?, ?B/s]

210.tar.gz:   0%|          | 0.00/80.4M [00:00<?, ?B/s]

211.tar.gz:   0%|          | 0.00/65.5M [00:00<?, ?B/s]

212.tar.gz:   0%|          | 0.00/93.4M [00:00<?, ?B/s]

213.tar.gz:   0%|          | 0.00/98.7M [00:00<?, ?B/s]

214.tar.gz:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

215.tar.gz:   0%|          | 0.00/41.5M [00:00<?, ?B/s]

216.tar.gz:   0%|          | 0.00/44.1M [00:00<?, ?B/s]

217.tar.gz:   0%|          | 0.00/47.6M [00:00<?, ?B/s]

218.tar.gz:   0%|          | 0.00/45.5M [00:00<?, ?B/s]

219.tar.gz:   0%|          | 0.00/97.4M [00:00<?, ?B/s]

22.tar.gz:   0%|          | 0.00/55.0M [00:00<?, ?B/s]

220.tar.gz:   0%|          | 0.00/56.1M [00:00<?, ?B/s]

221.tar.gz:   0%|          | 0.00/55.6M [00:00<?, ?B/s]

222.tar.gz:   0%|          | 0.00/54.1M [00:00<?, ?B/s]

223.tar.gz:   0%|          | 0.00/185M [00:00<?, ?B/s]

224.tar.gz:   0%|          | 0.00/28.2M [00:00<?, ?B/s]

225.tar.gz:   0%|          | 0.00/59.3M [00:00<?, ?B/s]

226.tar.gz:   0%|          | 0.00/74.6M [00:00<?, ?B/s]

227.tar.gz:   0%|          | 0.00/43.9M [00:00<?, ?B/s]

228.tar.gz:   0%|          | 0.00/53.0M [00:00<?, ?B/s]

229.tar.gz:   0%|          | 0.00/115M [00:00<?, ?B/s]

23.tar.gz:   0%|          | 0.00/31.4M [00:00<?, ?B/s]

230.tar.gz:   0%|          | 0.00/89.0M [00:00<?, ?B/s]

231.tar.gz:   0%|          | 0.00/41.8M [00:00<?, ?B/s]

232.tar.gz:   0%|          | 0.00/56.7M [00:00<?, ?B/s]

233.tar.gz:   0%|          | 0.00/25.2M [00:00<?, ?B/s]

234.tar.gz:   0%|          | 0.00/20.6M [00:00<?, ?B/s]

235.tar.gz:   0%|          | 0.00/26.7M [00:00<?, ?B/s]

236.tar.gz:   0%|          | 0.00/39.0M [00:00<?, ?B/s]

237.tar.gz:   0%|          | 0.00/64.1M [00:00<?, ?B/s]

238.tar.gz:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

239.tar.gz:   0%|          | 0.00/19.8M [00:00<?, ?B/s]

24.tar.gz:   0%|          | 0.00/23.6M [00:00<?, ?B/s]

240.tar.gz:   0%|          | 0.00/14.9M [00:00<?, ?B/s]

241.tar.gz:   0%|          | 0.00/41.8M [00:00<?, ?B/s]

242.tar.gz:   0%|          | 0.00/54.0M [00:00<?, ?B/s]

243.tar.gz:   0%|          | 0.00/45.4M [00:00<?, ?B/s]

244.tar.gz:   0%|          | 0.00/68.6M [00:00<?, ?B/s]

245.tar.gz:   0%|          | 0.00/28.9M [00:00<?, ?B/s]

246.tar.gz:   0%|          | 0.00/25.0M [00:00<?, ?B/s]

247.tar.gz:   0%|          | 0.00/71.1M [00:00<?, ?B/s]

KeyboardInterrupt: 