Kheshini Budhna (C0909662)

In [41]:
from google.colab import drive
import pandas as pd
import os

# Mount Google Drive
drive.mount('/content/drive')

# Folder that holds the per-batch TSV files
folder_path = '/content/drive/My Drive/Colab_Notebooks/frame_tsv'  # Adjust this path accordingly

# os.listdir() returns names in arbitrary order, so sort them: the row order of
# the combined frame (and anything later that depends on "first occurrence")
# is then the same on every run. The sort is plain lexicographic.
tsv_files = sorted(
    file
    for file in os.listdir(folder_path)
    if file.startswith('updated_csv_') and file.endswith('.tsv')
)

# Fail with a clear message instead of letting pd.concat() choke on an empty list
if not tsv_files:
    raise FileNotFoundError(
        f"No 'updated_csv_*.tsv' files found in: {folder_path}"
    )

# Read every matching TSV file into its own DataFrame
dataframes = [
    pd.read_csv(os.path.join(folder_path, file), sep='\t')
    for file in tsv_files
]

# Combine all DataFrames into one, renumbering the index from 0
combined_df = pd.concat(dataframes, ignore_index=True)

# Save the combined DataFrame outside folder_path so it is never picked up as an input
combined_file_path = '/content/drive/My Drive/Colab_Notebooks/count_frames_total.tsv'  # Specify the new filename
combined_df.to_csv(combined_file_path, sep='\t', index=False)

print(f"Combined file saved as: {combined_file_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Combined file saved as: /content/drive/My Drive/Colab_Notebooks/count_frames_total.tsv


In [42]:
# Path to the combined TSV file
combined_file_path = '/content/drive/My Drive/Colab_Notebooks/count_frames_total.tsv'

# Open the combined TSV file
df = pd.read_csv(combined_file_path, sep='\t')

# Show the first few rows of the DataFrame
print("Initial DataFrame:")
print(df.head())

# Sort the DataFrame by the 'ID' column.
# The combined file contains the same ID more than once, and a later cell keeps
# only the first row per ID. The default quicksort is not stable, so which
# duplicate ends up "first" would be arbitrary; mergesort keeps rows with equal
# IDs in their original file order.
sorted_df = df.sort_values(by='ID', kind='mergesort')

# Show the sorted DataFrame
print("\nSorted DataFrame:")
print(sorted_df.head())

# Continue with an independent copy of the sorted frame
df = sorted_df.copy()


Initial DataFrame:
   ID                                              links  \
0   1  https://38.media.tumblr.com/9f6c25cc350f12aa74...   
1   2  https://38.media.tumblr.com/9ead028ef62004ef6a...   
2   3  https://38.media.tumblr.com/9f43dc410be85b1159...   
3   4  https://38.media.tumblr.com/9f659499c8754e40cf...   
4   5  https://38.media.tumblr.com/9ed1c99afa7d714118...   

                                         description Frame Count  
0  a man is glaring, and someone with sunglasses ...        18.0  
1           a cat tries to catch a mouse on a tablet        53.0  
2                   a man dressed in red is dancing.        22.0  
3     an animal comes close to another in the jungle       100.0  
4  a man in a hat adjusts his tie and makes a wei...        16.0  

Sorted DataFrame:
   ID                                              links  \
0   1  https://38.media.tumblr.com/9f6c25cc350f12aa74...   
1   2  https://38.media.tumblr.com/9ead028ef62004ef6a...   
2   3  https://38.m

In [43]:
# Boolean mask of the rows whose 'Frame Count' holds the 'Unreachable' marker
is_unreachable = df['Frame Count'].eq('Unreachable')
unreachable_df = df.loc[is_unreachable]

# Show the rows that still need a frame count
print("\nRows with frame count 'Unreachable':")
print(unreachable_df)

# Work on an independent copy so df itself is not modified by the retry step
new_df = unreachable_df.copy()


Rows with frame count 'Unreachable':
           ID                                              links  \
108766  29032  https://33.media.tumblr.com/cf3d907da94bc417fc...   
119557  39823  https://33.media.tumblr.com/34f3484910558595cd...   
140188  60454  https://38.media.tumblr.com/b169f4e3b96603031d...   
145623  65889  https://33.media.tumblr.com/4add60f0e4d5123b5b...   
165748  86014  https://38.media.tumblr.com/b82ad15ed8404591ae...   

                                           description  Frame Count  
108766  two soccer teams are playing a game of soccer.  Unreachable  
119557          two hands are reaching for each other.  Unreachable  
140188           a policeman put his cap on and leaved  Unreachable  
145623               two men are fighting in the cage.  Unreachable  
165748           the woman in black is blowing a kiss.  Unreachable  


In [44]:
import pandas as pd
import imageio

def get_frame_count(gif_link):
    """Return the number of frames in the GIF at ``gif_link``.

    The reader's ``count_frames()`` method is used when it exists. When the
    reader has no such method (or an ``AttributeError`` is raised while
    opening or counting), the count is obtained from
    ``count_frames_manually`` instead.

    Args:
        gif_link: URL or path handed to ``imageio.get_reader``.

    Returns:
        The frame count, or ``None`` if the GIF could not be read.
    """
    try:
        gif = imageio.get_reader(gif_link)
        try:
            if hasattr(gif, 'count_frames'):
                return gif.count_frames()
        finally:
            # Always release the reader; the fallback opens its own.
            gif.close()

    except AttributeError:
        # Treated the same as a reader without count_frames(): fall through
        # to the manual count below.
        pass

    except Exception as e:
        print(f"Error reading {gif_link}: {e}")
        return None

    print(f"LegacyReader issue detected for {gif_link}. Falling back to manual frame counting.")
    return count_frames_manually(gif_link)

def count_frames_manually(gif_link):
    """Count the frames of a GIF by iterating over its reader.

    Args:
        gif_link: URL or path handed to ``imageio.get_reader``.

    Returns:
        The number of frames yielded by the reader, or ``None`` if opening or
        reading the GIF raised an exception.
    """
    try:
        gif = imageio.get_reader(gif_link)
        try:
            return sum(1 for _ in gif)
        finally:
            # Release the reader even when iteration fails part-way through.
            gif.close()
    except Exception as e:
        print(f"Manual frame count failed for {gif_link}: {e}")
        return None

def update_frame_counts(original_df):
    """Retry the frame count for every row marked 'Unreachable'.

    The DataFrame is modified in place: for each row whose 'Frame Count' is
    the string 'Unreachable', the GIF in its 'links' column is counted again
    with ``get_frame_count`` and the result is written back to that row.
    Rows whose retry fails (``None``) keep the 'Unreachable' marker so they
    can still be found afterwards. No other column is touched.

    Args:
        original_df: DataFrame with 'links' and 'Frame Count' columns.
    """
    # Select only the rows that need work instead of walking the whole frame
    is_unreachable = original_df['Frame Count'] == 'Unreachable'
    pending_links = original_df.loc[is_unreachable, 'links']

    for index, gif_link in pending_links.items():
        frame_count = get_frame_count(gif_link)  # Get frame count
        if frame_count is None:
            # Retry failed: leave the marker instead of blanking the cell
            continue
        original_df.at[index, 'Frame Count'] = frame_count  # Update the count in DataFrame

# Retry the unreachable GIFs; new_df is modified in place and its IDs are untouched
update_frame_counts(new_df)

# Preview the first rows to confirm the IDs are unchanged
preview = new_df.head()
print(preview)


LegacyReader issue detected for https://33.media.tumblr.com/cf3d907da94bc417fc1234d71d3c9260/tumblr_np5v6oemH61sbwmk4o1_400.gif. Falling back to manual frame counting.
LegacyReader issue detected for https://33.media.tumblr.com/34f3484910558595cd64b072fc1ae128/tumblr_nex5arpyUG1u2473po1_250.gif. Falling back to manual frame counting.
LegacyReader issue detected for https://38.media.tumblr.com/b169f4e3b96603031d808beda4be86a6/tumblr_nkuohvy4ff1qadfw9o1_400.gif. Falling back to manual frame counting.
LegacyReader issue detected for https://33.media.tumblr.com/4add60f0e4d5123b5b490f61592ea27a/tumblr_ndwjaaRuOU1s8r7lxo1_400.gif. Falling back to manual frame counting.
LegacyReader issue detected for https://38.media.tumblr.com/b82ad15ed8404591aecd16d2ff0c51cf/tumblr_nbkigaAxRs1qej93ko1_500.gif. Falling back to manual frame counting.
           ID                                              links  \
108766  29032  https://33.media.tumblr.com/cf3d907da94bc417fc...   
119557  39823  https://3

In [45]:
# Update the 'Frame Count' in df using the values from new_df based on matching IDs.
# Only retries that produced a numeric count are applied: a failed retry
# (missing value or a leftover 'Unreachable') must not overwrite a valid count
# on another row that shares the same ID.
resolved = new_df[pd.to_numeric(new_df['Frame Count'], errors='coerce').notna()]

# One lookup table ID -> frame count (last entry wins if an ID repeats in new_df)
id_to_count = (
    resolved
    .drop_duplicates(subset='ID', keep='last')
    .set_index('ID')['Frame Count']
)

# Single vectorized pass over df instead of one full-frame scan per retried row
matches = df['ID'].isin(id_to_count.index)
df.loc[matches, 'Frame Count'] = df.loc[matches, 'ID'].map(id_to_count)

# View the updated df
print(df.head())

   ID                                              links  \
0   1  https://38.media.tumblr.com/9f6c25cc350f12aa74...   
1   2  https://38.media.tumblr.com/9ead028ef62004ef6a...   
2   3  https://38.media.tumblr.com/9f43dc410be85b1159...   
3   4  https://38.media.tumblr.com/9f659499c8754e40cf...   
4   5  https://38.media.tumblr.com/9ed1c99afa7d714118...   

                                         description Frame Count  
0  a man is glaring, and someone with sunglasses ...        18.0  
1           a cat tries to catch a mouse on a tablet        53.0  
2                   a man dressed in red is dancing.        22.0  
3     an animal comes close to another in the jungle       100.0  
4  a man in a hat adjusts his tie and makes a wei...        16.0  


In [49]:
# Keep a single row per ID (the first occurrence in the current row order)
df = df.drop_duplicates(subset=['ID'], keep='first')

# IDs to check: the rows that were originally 'Unreachable'. Taken from new_df
# rather than typed in by hand, so the check follows the data if it changes.
ids_to_check = new_df['ID'].tolist()

# Filter the DataFrame for the specified IDs
filtered_df = df[df['ID'].isin(ids_to_check)]

# Display the filtered DataFrame with relevant information
print(filtered_df[['ID', 'Frame Count']])

           ID Frame Count
29031   29032         100
119557  39823         112
60453   60454          35
145623  65889          48
165748  86014          50


In [50]:
# Number of rows and the largest ID currently in the DataFrame
total_items = len(df)
max_id = df['ID'].max()

# Report whether the row count equals the largest ID
message = (
    f"The DataFrame contains {total_items} items, which matches the maximum ID of {max_id}."
    if total_items == max_id
    else f"The DataFrame contains {total_items} items, which does not match the maximum ID of {max_id}."
)
print(message)


The DataFrame contains 89735 items, which matches the maximum ID of 89735.


In [51]:
# Destination of the final TSV file
output_file_path = '/content/drive/My Drive/Colab_Notebooks/all_frame_counts.tsv'  # Change the name as needed

# Write df as tab-separated text, without the index column
df.to_csv(path_or_buf=output_file_path, sep='\t', index=False)

print(f"DataFrame has been saved to: {output_file_path}")

DataFrame has been saved to: /content/drive/My Drive/Colab_Notebooks/all_frame_counts.tsv
