In [1]:
import pandas as pd
import random



In [2]:

# Lookup table from show title to its genre.
# It is built from (title, genre) pairs; the order of the pairs fixes the key
# order of the resulting dict, which matters wherever the keys are listed.
show_genre_mapping = dict([
    ("Stranger Things", "Sci-Fi"),
    ("Breaking Bad", "Crime"),
    ("The Crown", "Drama"),
    ("Money Heist", "Action"),
    ("Dark", "Sci-Fi"),
    ("The Witcher", "Fantasy"),
    ("Friends", "Comedy"),
    ("The Office", "Comedy"),
    ("BoJack Horseman", "Animation"),
    ("Narcos", "Crime"),
    ("Black Mirror", "Sci-Fi"),
    ("You", "Thriller"),
    ("Sex Education", "Comedy"),
    ("Bridgerton", "Romance"),
    ("Lucifer", "Drama"),
    ("The Queen's Gambit", "Drama"),
    ("Emily in Paris", "Romance"),
    ("Peaky Blinders", "Crime"),
    ("The Umbrella Academy", "Action"),
    ("Ozark", "Crime"),
    ("Squid Game", "Thriller"),
    ("The Sandman", "Fantasy"),
    ("Shadow and Bone", "Fantasy"),
    ("Outer Banks", "Action"),
    ("The Recruit", "Thriller"),
    ("Ginny & Georgia", "Drama"),
    ("Locke & Key", "Fantasy"),
    ("Never Have I Ever", "Comedy"),
    ("The Lincoln Lawyer", "Crime"),
    ("Manifest", "Mystery"),
])

In [3]:

# Pool of 200 synthetic user identifiers: "user_1" through "user_200".
user_ids = ["user_" + str(number) for number in range(1, 201)]



In [4]:

# Generate dataset with 1000 entries
# Seed the global RNG so a fresh "Restart Kernel -> Run All" produces the same
# rows (and therefore the same CSV) on every run.
SEED = 42
random.seed(SEED)

# Loop-invariant values, computed once instead of on every iteration.
show_titles = list(show_genre_mapping)
first_day = pd.Timestamp("2025-01-01")

data = []
for _ in range(1000):
    user = random.choice(user_ids)
    show = random.choice(show_titles)
    genre = show_genre_mapping[show]
    watch_count = random.randint(1, 12)
    # Total hours scale with the watch count: 0.5 to 3.0 hours per watch.
    watch_hours = round(watch_count * random.uniform(0.5, 3.0), 2)
    # A day offset of 0..180 (both inclusive) from 2025-01-01.
    last_watched = first_day + pd.to_timedelta(random.randint(0, 180), unit="D")
    rating = random.choice([1, 2, 3, 4, 5])
    binge_score = round(random.uniform(0.1, 1.0), 2)
    liked = random.choice(["Yes", "No"])
    device = random.choice(["Mobile", "TV", "Laptop", "Tablet"])
    age_group = random.choice(["Teen", "Young Adult", "Adult", "Senior"])
    # 3 to 10 distinct titles, joined into one comma-separated string.
    viewing_history = ", ".join(random.sample(show_titles, k=random.randint(3, 10)))

    # Field order here must match the column order used to build the DataFrame.
    data.append([
        user, show, genre, watch_count, watch_hours, last_watched.date(), rating,
        binge_score, liked, device, age_group, viewing_history
    ])


In [5]:

# Turn the generated rows into a DataFrame.
# Labels are listed in the same order as the values inside each row of `data`.
columns = [
    "user_id",
    "show_title",
    "genre",
    "watch_count",
    "watch_hours",
    "last_watched_date",
    "rating",
    "binge_score",
    "liked",
    "device",
    "age_group",
    "viewing_history",
]
df = pd.DataFrame(data=data, columns=columns)

In [6]:

# Write the table to disk (without the index column) and confirm on stdout.
output_csv = "netflix_user_behavior_final_1000.csv"
df.to_csv(output_csv, index=False)
print(f"✅ CSV file '{output_csv}' has been created.")


✅ CSV file 'netflix_user_behavior_final_1000.csv' has been created.


In [3]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import json

# Movie catalogue used for sampling. It currently holds 10 titles.
# TODO: the catalogue is meant to grow to 100+ titles; append more
# (title, genre, runtime) tuples below.
# Each record stores the title, a genre label and the runtime in minutes.
movies = [
    {"title": title, "genre": genre, "runtime": runtime}
    for title, genre, runtime in (
        # Drama / crime
        ("The Shawshank Redemption", "Drama", 142),
        ("Forrest Gump", "Drama", 142),
        ("The Godfather", "Crime", 175),
        # Science fiction
        ("The Matrix", "Sci-Fi", 136),
        ("Inception", "Sci-Fi", 148),
        ("Interstellar", "Sci-Fi", 169),
        # Action
        ("The Dark Knight", "Action", 152),
        ("Mad Max: Fury Road", "Action", 120),
        # Thriller
        ("The Silence of the Lambs", "Thriller", 118),
        ("Se7en", "Thriller", 127),
    )
]

# Device labels a generated record can be assigned.
devices = [
    "smart-tv",
    "laptop",
    "tablet",
    "mobile",
    "gaming-console",
]

# Age-bracket labels a generated record can be assigned.
age_groups = [
    "18-25",
    "26-35",
    "36-45",
    "46-55",
    "56+",
]

def generate_viewing_history(movie_runtime, last_watched, max_sessions=5,
                             max_days_back=60, min_duration=20):
    """Create a random list of earlier viewing sessions for one movie.

    Args:
        movie_runtime: Runtime of the movie in minutes; also the upper bound
            for the duration of any single session.
        last_watched: ``datetime`` of the most recent viewing. Every session
            is dated between 1 and ``max_days_back`` days before it.
        max_sessions: Upper bound (inclusive) on the number of sessions;
            at least one session is always produced.
        max_days_back: Largest day offset (inclusive) before ``last_watched``.
        min_duration: Preferred minimum session length in minutes. It is
            clamped to ``movie_runtime`` for movies shorter than this value.

    Returns:
        A list of dicts, each with a ``"timestamp"`` string formatted as
        ``"%Y-%m-%d %H:%M"`` and an integer ``"duration_min"``. Sessions are
        returned in generation order, not sorted by date.
    """
    # Without this clamp, random.randint(20, movie_runtime) raises ValueError
    # for any runtime below 20 minutes.
    shortest_session = min(min_duration, movie_runtime)

    history = []
    for _ in range(random.randint(1, max_sessions)):
        watch_date = last_watched - timedelta(days=random.randint(1, max_days_back))
        history.append({
            "timestamp": watch_date.strftime("%Y-%m-%d %H:%M"),
            # randint's upper bound already caps the duration at the runtime.
            "duration_min": random.randint(shortest_session, movie_runtime),
        })
    return history

# Seed the global RNG so the sampled values (movie, counts, ratings, sessions)
# are the same on every "Restart Kernel -> Run All".
SEED = 42
random.seed(SEED)

# Read the clock once so every row is measured from the same instant.
# NOTE: timestamps are still relative to the time of the run, so the dates in
# the output shift from one day to the next even with a fixed seed.
reference_time = datetime.now()

# One record per synthetic user, U1 through U500.
data = []
for user_id in range(1, 501):
    movie = random.choice(movies)
    watch_count = random.randint(1, 20)
    # Hours watched if every viewing covered the full runtime (minutes / 60).
    watch_hours = round(watch_count * movie["runtime"] / 60, 1)
    last_watched = reference_time - timedelta(days=random.randint(1, 90))
    # np.nan is one of the six equally likely choices and marks a missing rating.
    rating = random.choice([1, 2, 3, 4, 5, np.nan])
    liked = random.choice([True, False])
    device = random.choice(devices)
    age_group = random.choice(age_groups)
    viewing_history = generate_viewing_history(movie["runtime"], last_watched)

    # Field order here must match the column order used to build the DataFrame.
    data.append([
        f"U{user_id}",
        movie["title"],
        movie["genre"],
        watch_count,
        watch_hours,
        last_watched.strftime("%Y-%m-%d %H:%M:%S"),
        rating,
        liked,
        device,
        age_group,
        # Stored as a JSON string so the list of sessions fits in one CSV cell.
        json.dumps(viewing_history)
    ])

# Build the movie-viewing table and write it to CSV without the index column.
# Labels are listed in the same order as the values inside each row of `data`.
movie_columns = [
    "user_id",
    "title",
    "genre",
    "watch_count",
    "watch_hours",
    "last_watched",
    "rating",
    "liked",
    "device",
    "age_group",
    "viewing_history",
]
df = pd.DataFrame(data=data, columns=movie_columns)
df.to_csv("movie_viewing_dataset.csv", index=False)