# Flickr data Mining script

Zhongrui Ning

## 1. import your flickr api

In [2]:
# Import Flickr API
import os
import json
import requests
import pandas as pd
from datetime import datetime, timedelta
from time import sleep
from concurrent.futures import ThreadPoolExecutor, as_completed
api_key ='Insert your key'
api_secret = 'Insert your secret'

## 2. Get the user data in the U.S. from 2015 to 2023

In [3]:
import os
import json
import requests
import shutil
import pandas as pd
from datetime import datetime, timedelta
from time import sleep
from concurrent.futures import ThreadPoolExecutor, as_completed

# Define the parameters
EXTRAS = "url_o,license,geo,lat,lon,tags,description,date_taken"
PER_PAGE = 250
BASE_URL = "https://www.flickr.com/services/rest/"
TEMP_FILE = 'flickr_fetch_temp.json'
FINAL_FILE = 'flickr_images_sorted.csv'

def daterange(start_date, end_date, delta_days=10):
    while start_date < end_date:
        yield start_date
        start_date += timedelta(days=delta_days)

def fetch_photos(min_date, max_date, retries=5):
    params = {
        "method": "flickr.photos.search",
        "api_key": api_key,
        "min_taken_date": min_date,
        "max_taken_date": max_date,
        "extras": EXTRAS,
        "format": "json",
        "nojsoncallback": "1",
        "per_page": PER_PAGE,
        "page": 1,  # Start at page 1
        "bbox": "-125.001650,24.9493,-66.9326,49.5904"  # Coordinates for the US
    }
    all_photos = []
    while True:
        for attempt in range(retries):
            try:
                response = requests.get(BASE_URL, params=params)
                
                if response.status_code == 200:
                    data = response.json()
                    photos = data['photos']['photo']
                    all_photos.extend(photos)
                    
                    # Get the total number of pages and stop if we've reached the last page
                    total_pages = data['photos']['pages']
                    current_page = params['page']
                    
                    if current_page >= total_pages:
                        return all_photos
                    
                    params['page'] += 1  # Move to the next page
                    break
                else:
                    print(f"Error: Received status code {response.status_code}. Retrying...")
                    sleep(2 ** attempt)
            except requests.exceptions.RequestException as e:
                print(f"Network error: {e}. Retrying...")
                sleep(2 ** attempt)
        else:
            break
    return all_photos

def load_checkpoint():
    if os.path.exists(TEMP_FILE):
        try:
            with open(TEMP_FILE, 'r') as f:
                data = json.load(f)
                return data['date_ranges'], data['photos']
        except json.JSONDecodeError as e:
            print(f"Warning: Temp file {TEMP_FILE} is corrupted. Starting fresh. Error: {e}")
            # 如果文件损坏，删除它并返回空结果
            os.remove(TEMP_FILE)
            return [], []
    return [], []

def save_checkpoint(date_ranges, photos, all_photos):
    # Combine existing photos with new photos and remove duplicates based on 'id'
    combined_photos = all_photos + photos
    unique_photos = {photo['id']: photo for photo in combined_photos}.values()

    temp_file = TEMP_FILE + '.tmp'
    # Save unique photos to a temporary file
    with open(temp_file, 'w') as f:
        json.dump({'date_ranges': date_ranges, 'photos': list(unique_photos)}, f)
    
    # After successful write, replace the original file
    shutil.move(temp_file, TEMP_FILE)

def save_to_csv(photos):
    if os.path.exists(FINAL_FILE):
        existing_df = pd.read_csv(FINAL_FILE)
    else:
        existing_df = pd.DataFrame()

    new_df = pd.DataFrame(photos)
    combined_df = pd.concat([existing_df, new_df]).drop_duplicates(subset='id')

    combined_df.to_csv(FINAL_FILE, index=False)

def process_date_range(min_date_str, max_date_str):
    photos = fetch_photos(min_date_str, max_date_str)
    return photos

def main():
    start_date = datetime.strptime("2015-01-01", "%Y-%m-%d")
    end_date = datetime.strptime("2023-12-31", "%Y-%m-%d")

# load checkpoint
    completed_date_ranges, all_photos = load_checkpoint()

    date_ranges = []
    for single_date in daterange(start_date, end_date, delta_days=10):
        min_date_str = single_date.strftime("%Y-%m-%d")
        max_date_str = (single_date + timedelta(days=10)).strftime("%Y-%m-%d")
        if [min_date_str, max_date_str] not in completed_date_ranges:
            date_ranges.append((min_date_str, max_date_str))

    if not date_ranges:
        print("All date ranges have been processed.")
        return

    # using multiple threads to process the date ranges
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = {executor.submit(process_date_range, dr[0], dr[1]): dr for dr in date_ranges}
        for future in as_completed(futures):
            try:
                photos = future.result()
                if photos:
                    all_photos.extend(photos)
                    completed_date_ranges.append(futures[future])
                    print(f"Total photos fetched so far: {len(all_photos)}")

                    save_checkpoint(completed_date_ranges, photos, all_photos)
                    save_to_csv(photos)
            except Exception as e:
                print(f"Error occurred: {e}")
            sleep(1)

if __name__ == "__main__":
    main()

Error occurred: 'photos'
Error occurred: 'photos'
Error occurred: 'photos'
Error occurred: 'photos'
Error occurred: 'photos'
Error occurred: 'photos'
Error occurred: 'photos'
Error occurred: 'photos'


KeyboardInterrupt: 

## New Attempt

In [None]:
import os
import json
import requests
import pandas as pd
from datetime import datetime, timedelta
from time import sleep
from sklearn.cluster import DBSCAN
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt

# Flickr API key
API_KEY = api_key
EXTRAS = "url_o,license,geo,lat,lon,tags,description,date_taken"
BASE_URL = 'https://www.flickr.com/services/rest/'
PROGRESS_FILE = 'flickr_progress.json'
OUTPUT_FILE = 'flickr_data_daily.csv'
MAX_PAGES = 100 
MAX_RETRIES = 5 

# Fetch photos within a date range
def fetch_photos(start_date, end_date, bbox="-125.001650,24.9493,-66.9326,49.5904"):
    all_photos = []
    page = 1
    retries = 0

    while page <= MAX_PAGES:
        params = {
            "method": "flickr.photos.search",
            "api_key": API_KEY,
            "min_taken_date": start_date,
            "max_taken_date": end_date,
            "bbox": bbox,
            "extras": EXTRAS,
            "format": "json",
            "nojsoncallback": "1",
            "per_page": 250,
            "page": page
        }
        response = requests.get(BASE_URL, params=params)
        
        if response.status_code != 200:
            retries += 1
            if retries >= MAX_RETRIES:
                print(f"Failed to retrieve data after {MAX_RETRIES} retries for {start_date} to {end_date}.")
                break
            print(f"Retrying ({retries}/{MAX_RETRIES}) due to network issues...")
            sleep(2)
            continue

        data = response.json()
        
        if 'photos' not in data or 'photo' not in data['photos']:
            print(f"No data returned for {start_date} to {end_date}. Skipping this range.")
            break

        photos = data['photos']['photo']
        all_photos.extend(photos)
        print(f"Downloaded {len(photos)} photos on page {page} for date range {start_date} to {end_date}")

        if len(photos) < 250 or page >= data['photos']['pages']:
            break
        
        page += 1
    return all_photos

# Daily date range generator for finer-grained data fetching
def generate_date_ranges(start, end, delta_days=1):
    current = start
    while current < end:
        yield current, current + timedelta(days=delta_days)
        current += timedelta(days=delta_days)

# Load previous progress if it exists
def load_progress():
    if os.path.exists(PROGRESS_FILE):
        with open(PROGRESS_FILE, 'r') as f:
            return json.load(f)
    return {"completed_ranges": []}

# Save progress to a JSON file
def save_progress(completed_ranges):
    with open(PROGRESS_FILE, 'w') as f:
        json.dump({"completed_ranges": completed_ranges}, f)

# Download and save data for each daily date range with progress tracking
def collect_flickr_data(start_year, end_year):
    progress = load_progress()
    completed_ranges = progress["completed_ranges"]

    # Generate daily ranges and filter out already completed ranges
    date_ranges = [(start_date, end_date) for start_date, end_date in generate_date_ranges(datetime(start_year, 1, 1), datetime(end_year, 12, 31))]
    date_ranges = [dr for dr in date_ranges if dr[0].strftime("%Y-%m-%d") not in completed_ranges]
    
    for start_date, end_date in date_ranges:
        photos = fetch_photos(start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d'))
        
        if photos:
            pd.DataFrame(photos).to_csv(OUTPUT_FILE, mode='a', header=not os.path.exists(OUTPUT_FILE), index=False)

        # Update progress tracking
        completed_ranges.append(start_date.strftime("%Y-%m-%d"))
        save_progress(completed_ranges)
        
        print(f"Progress: {len(completed_ranges)}/{len(date_ranges)} days completed. Total photos collected so far: {len(photos)}")
        sleep(1)

    print("Data collection complete.")

# Data Cleaning
def clean_data(df):
    df['latitude'] = pd.to_numeric(df['latitude'], errors='coerce')
    df['longitude'] = pd.to_numeric(df['longitude'], errors='coerce')
    df.dropna(subset=['latitude', 'longitude'], inplace=True)
    df.drop_duplicates(subset='id', inplace=True)
    return df

# Temporal Analysis
def temporal_analysis(df):
    df['datetaken'] = pd.to_datetime(df['datetaken'], errors='coerce')
    df['month'] = df['datetaken'].dt.to_period('M')
    df.groupby('month').size().plot(kind='line', title='Temporal Distribution')
    plt.show()

# Spatial Clustering
def spatial_clustering(df):
    coords = df[['latitude', 'longitude']].values
    clustering = DBSCAN(eps=0.05, min_samples=10, metric='haversine').fit(coords)
    df['cluster'] = clustering.labels_
    gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.longitude, df.latitude))
    gdf.plot(column='cluster', cmap='viridis', legend=True)
    plt.show()

# Main
if __name__ == "__main__":
    collect_flickr_data(2016, 2023)
    
    if os.path.exists(OUTPUT_FILE):
        df = pd.read_csv(OUTPUT_FILE)
        df = clean_data(df)
        temporal_analysis(df)
        spatial_clustering(df)