In [None]:
# Install necessary libraries
!pip install pandas requests google-cloud-storage scikit-learn



In [None]:
# Import required libraries
import pandas as pd
import requests
import os
from google.cloud import storage
from sklearn.model_selection import train_test_split

In [None]:
# Authenticate your Google Cloud account
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime -- confirm before running this notebook elsewhere.
from google.colab import auth
auth.authenticate_user()

# Initialize Google Cloud Storage client
# Client() is constructed with no arguments, so no project or credentials are
# passed explicitly here; presumably it picks up the credentials established by
# authenticate_user() above -- TODO confirm.
client = storage.Client()

In [None]:
# Define the bucket name and set up access
bucket_name = 'gif-bucket-1000'  # Your bucket name
# Look the bucket up through the storage client; `bucket` is what the upload
# cell later uses to create blobs.
bucket = client.get_bucket(bucket_name)

# Set dataset path and local folder for downloading
dataset_path = '/content/tgif-v2.1.tsv'  # Assuming the dataset is already in Colab
local_folder = '/content/gifs/'
# exist_ok=True means re-running this cell does not fail when the folder
# already exists.
os.makedirs(local_folder, exist_ok=True)


In [None]:
# Read the tab-separated dataset into a DataFrame.
df = pd.read_csv(dataset_path, sep='\t')

# Shuffle the dataset and select up to 1000 random GIFs.
# DataFrame.sample raises ValueError when n is larger than the number of rows
# (sampling is without replacement), so n is capped at len(df) to keep the
# cell working on smaller datasets. random_state is fixed so the same rows are
# drawn on every run; reset_index gives the sample a clean 0..n-1 index.
df_sampled = df.sample(n=min(1000, len(df)), random_state=42).reset_index(drop=True)


In [None]:
# Download the sampled GIFs and save each one under its original unique ID.
# A name is recorded in gif_names only after its file has been fetched and
# written successfully, so the train/val/test lists built from gif_names never
# reference a GIF that failed to download.
gif_names = []  # Names of the successfully downloaded GIFs (based on original ID)

for _, row in df_sampled.iterrows():
    gif_url = row['links']  # Assuming 'links' column contains the GIF URLs
    gif_id = row['ID']  # Assuming the unique GIF ID is in a column called 'ID'
    gif_name = f'{gif_id}.gif'  # Use the original ID as the file name
    gif_path = os.path.join(local_folder, gif_name)

    try:
        # The timeout (seconds) keeps one unresponsive host from stalling the
        # whole loop; without it requests waits indefinitely.
        response = requests.get(gif_url, timeout=30)
        # Treat HTTP 4xx/5xx as a failure instead of saving the error page
        # body to disk under a .gif name.
        response.raise_for_status()
        with open(gif_path, 'wb') as f:
            f.write(response.content)
    except Exception as e:
        # Best effort: report the failure and move on to the next GIF.
        print(f'Error downloading {gif_url}: {e}')
    else:
        gif_names.append(gif_name)
        print(f'Downloaded {gif_name}')

Downloaded 7531.gif
Downloaded 8782.gif
Downloaded 37256.gif
Downloaded 8547.gif
Downloaded 28280.gif
Downloaded 22035.gif
Downloaded 48901.gif
Downloaded 43924.gif
Downloaded 15460.gif
Downloaded 2970.gif
Downloaded 15976.gif
Downloaded 19618.gif
Downloaded 13063.gif
Downloaded 47729.gif
Downloaded 1902.gif
Downloaded 46529.gif
Downloaded 47526.gif
Downloaded 40455.gif
Downloaded 48261.gif
Downloaded 39719.gif
Downloaded 12400.gif
Downloaded 20169.gif
Downloaded 53076.gif
Downloaded 15558.gif
Downloaded 31538.gif
Downloaded 30868.gif
Downloaded 28868.gif
Downloaded 4131.gif
Downloaded 20868.gif
Downloaded 45963.gif
Downloaded 30056.gif
Downloaded 51093.gif
Downloaded 22810.gif
Downloaded 21383.gif
Downloaded 46061.gif
Downloaded 843.gif
Downloaded 2009.gif
Downloaded 14866.gif
Downloaded 42261.gif
Downloaded 48986.gif
Downloaded 31786.gif
Downloaded 43615.gif
Downloaded 1222.gif
Downloaded 21411.gif
Downloaded 47104.gif
Downloaded 1633.gif
Downloaded 15950.gif
Downloaded 35769.gif
Dow

In [None]:
# Partition the GIF names 80/10/10: hold out 20% first, then cut that hold-out
# in half to obtain the validation and test sets. Both calls use the same
# fixed seed so the partition is reproducible.
train_names, holdout_names = train_test_split(gif_names, test_size=0.2, random_state=42)
val_names, test_names = train_test_split(holdout_names, test_size=0.5, random_state=42)

# Destination files for the three name lists
train_file = '/content/train.txt'
val_file = '/content/val.txt'
test_file = '/content/test.txt'

# Write each list to its file, one GIF name per line (no trailing newline)
for split_path, split_names in (
    (train_file, train_names),
    (val_file, val_names),
    (test_file, test_names),
):
    with open(split_path, 'w') as out:
        out.write('\n'.join(split_names))

print('Saved train.txt, val.txt, and test.txt locally.')

Saved train.txt, val.txt, and test.txt locally.


In [None]:
# Push every GIF found in the local download folder to the bucket, placing
# each object under the 'gifs/' prefix.
for gif_file in os.listdir(local_folder):
    if not gif_file.endswith('.gif'):
        continue  # Anything that is not a GIF is left out of the upload
    gif_blob = bucket.blob(f'gifs/{gif_file}')
    gif_blob.upload_from_filename(os.path.join(local_folder, gif_file))
    print(f'Uploaded {gif_file} to {bucket_name}/gifs/')

# Push the three split lists (train.txt, val.txt, test.txt) to the bucket root,
# keeping only the file name as the object name.
for txt_file in (train_file, val_file, test_file):
    txt_name = os.path.basename(txt_file)
    bucket.blob(txt_name).upload_from_filename(txt_file)
    print(f'Uploaded {txt_name} to {bucket_name}')

print('All GIFs and split text files uploaded successfully to GCP.')

Uploaded 38152.gif to gif-bucket-1000/gifs/
Uploaded 14096.gif to gif-bucket-1000/gifs/
Uploaded 1403.gif to gif-bucket-1000/gifs/
Uploaded 35716.gif to gif-bucket-1000/gifs/
Uploaded 13632.gif to gif-bucket-1000/gifs/
Uploaded 48951.gif to gif-bucket-1000/gifs/
Uploaded 47993.gif to gif-bucket-1000/gifs/
Uploaded 48261.gif to gif-bucket-1000/gifs/
Uploaded 53539.gif to gif-bucket-1000/gifs/
Uploaded 4507.gif to gif-bucket-1000/gifs/
Uploaded 40314.gif to gif-bucket-1000/gifs/
Uploaded 6574.gif to gif-bucket-1000/gifs/
Uploaded 37256.gif to gif-bucket-1000/gifs/
Uploaded 18680.gif to gif-bucket-1000/gifs/
Uploaded 29457.gif to gif-bucket-1000/gifs/
Uploaded 45758.gif to gif-bucket-1000/gifs/
Uploaded 32707.gif to gif-bucket-1000/gifs/
Uploaded 40856.gif to gif-bucket-1000/gifs/
Uploaded 31408.gif to gif-bucket-1000/gifs/
Uploaded 24468.gif to gif-bucket-1000/gifs/
Uploaded 13237.gif to gif-bucket-1000/gifs/
Uploaded 38133.gif to gif-bucket-1000/gifs/
Uploaded 20758.gif to gif-bucket-10