In [5]:
!pip install SQLAlchemy psycopg2-binary pandas python-dateutil



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [1]:
from sqlalchemy import create_engine, text

import os

from sqlalchemy.engine import URL

# Connection settings. Each value can be overridden through an environment
# variable of the same name so real credentials never need to be written into
# the notebook; the fallbacks are the original local-development values.
PG_USER = os.environ.get("PG_USER", "postgres")
PG_PWD  = os.environ.get("PG_PWD", "postgres")
PG_HOST = os.environ.get("PG_HOST", "127.0.0.1")
PG_PORT = int(os.environ.get("PG_PORT", "5432"))
DB_NAME = os.environ.get("DB_NAME", "4260354_gb_youtube_trends")

# URL.create escapes special characters (e.g. "@", ":" or "/" in a password)
# that would corrupt a hand-formatted connection string.
engine = create_engine(
    URL.create(
        "postgresql+psycopg2",
        username=PG_USER,
        password=PG_PWD,
        host=PG_HOST,
        port=PG_PORT,
        database=DB_NAME,
    )
)

# Smoke test: open a connection and print the server version so a bad
# host/credential combination fails here rather than in a later cell.
with engine.connect() as conn:
    print(conn.execute(text("select version()")).scalar())


PostgreSQL 16.10 (Ubuntu 16.10-0ubuntu0.24.04.1) on x86_64-pc-linux-gnu, compiled by gcc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0, 64-bit


In [3]:
from sqlalchemy import text
# Remove any previous copy of the table (and, via CASCADE, objects that depend
# on it) before it is recreated further down.
drop_stmt = text("DROP TABLE IF EXISTS gb_videos CASCADE;")
with engine.begin() as connection:
    connection.execute(drop_stmt)


In [4]:
import os, pandas as pd
from sqlalchemy import text, Text, Integer, BigInteger, Boolean, Date, DateTime

csv_path = "GBvideos.csv"  # <-- set correctly
print("CSV exists?", os.path.exists(csv_path), csv_path)

# Read the raw file, then trim surrounding whitespace from every header name.
df = pd.read_csv(csv_path)
df.columns = list(map(str.strip, df.columns))
print("Raw rows:", len(df), "| columns:", list(df.columns))

# Robust date parsing (handles multiple common formats)
def parse_trending_series(s):
    """Parse trending-date strings into datetimes.

    Formats are attempted in order: ``%y.%d.%m``, then ``%y.%m.%d``, then
    pandas' default parsing. The result of the first attempt that parses at
    least one value is returned as a whole; values that attempt could not
    parse are NaT. Results from different attempts are never mixed.
    """
    for fmt in ("%y.%d.%m", "%y.%m.%d"):
        parsed = pd.to_datetime(s, format=fmt, errors="coerce")
        if parsed.notna().any():
            return parsed
    # Neither explicit format matched anything: let pandas infer the format.
    return pd.to_datetime(s, errors="coerce")

# Trending date: parse, then keep only the calendar-date part.
df["trending_date"] = parse_trending_series(df["trending_date"]).dt.date

# Publish time: parse as UTC-aware timestamps; unparseable values become NaT.
if "publish_time" in df.columns:
    df["publish_time"] = pd.to_datetime(df["publish_time"], utc=True, errors="coerce")

# Flag columns: map the text "true"/"false" (case-insensitive, whitespace
# trimmed) to booleans; any other value ends up missing.
flag_lookup = {"true": True, "false": False}
for flag_col in ("comments_disabled", "ratings_disabled", "video_error_or_removed"):
    if flag_col not in df.columns:
        continue
    as_text = df[flag_col].astype(str).str.strip().str.lower()
    df[flag_col] = as_text.map(flag_lookup)

# Counter / id columns: coerce to numeric, turning unparseable values into NaN.
for num_col in ("views", "likes", "dislikes", "comment_count", "category_id"):
    if num_col in df.columns:
        df[num_col] = pd.to_numeric(df[num_col], errors="coerce")

print("Parsed trending_date non-null:", df["trending_date"].notna().sum(), "of", len(df))

# Create the target table when it is missing. The unique index is deliberately
# not created here, so it cannot cause conflicts during the first load.
create_table_sql = """
CREATE TABLE IF NOT EXISTS gb_videos (
  id BIGSERIAL PRIMARY KEY,
  video_id TEXT NOT NULL,
  trending_date DATE NOT NULL,
  title TEXT,
  channel_title TEXT,
  category_id INT,
  publish_time TIMESTAMPTZ,
  tags TEXT,
  views BIGINT,
  likes BIGINT,
  dislikes BIGINT,
  comment_count BIGINT,
  thumbnail_link TEXT,
  comments_disabled BOOLEAN,
  ratings_disabled BOOLEAN,
  video_error_or_removed BOOLEAN,
  description TEXT
);
"""
with engine.begin() as connection:
    connection.execute(text(create_table_sql))

# Column name -> SQLAlchemy type passed to to_sql; one entry per data column
# of the gb_videos table (the serial id is filled in by the database).
dtype_map = dict(
    video_id=Text(),
    trending_date=Date(),
    title=Text(),
    channel_title=Text(),
    category_id=Integer(),
    publish_time=DateTime(timezone=True),
    tags=Text(),
    views=BigInteger(),
    likes=BigInteger(),
    dislikes=BigInteger(),
    comment_count=BigInteger(),
    thumbnail_link=Text(),
    comments_disabled=Boolean(),
    ratings_disabled=Boolean(),
    video_error_or_removed=Boolean(),
    description=Text(),
)

# Load only the columns that are both known to the table and present in the CSV.
cols = [name for name in dtype_map if name in df.columns]

# Rows missing either key field cannot be stored (both are NOT NULL).
key_cols = ["video_id", "trending_date"]
rows_before = len(df)
df = df.dropna(subset=key_cols)
print("After dropna on keys:", len(df), f"(dropped {rows_before - len(df)})")

# Keep a single row per (video_id, trending_date) pair.
df = df.drop_duplicates(subset=key_cols)
print("After de-dup keys:", len(df))

# Append to the existing table in chunks, using multi-row INSERT statements.
to_sql_options = dict(
    if_exists="append",
    index=False,
    dtype=dtype_map,
    method="multi",
    chunksize=10000,
)
df[cols].to_sql("gb_videos", engine, **to_sql_options)
print("Inserted rows:", len(df))


CSV exists? True GBvideos.csv
Raw rows: 38916 | columns: ['video_id', 'trending_date', 'title', 'channel_title', 'category_id', 'publish_time', 'tags', 'views', 'likes', 'dislikes', 'comment_count', 'thumbnail_link', 'comments_disabled', 'ratings_disabled', 'video_error_or_removed', 'description']
Parsed trending_date non-null: 38916 of 38916
After dropna on keys: 38916 (dropped 0)
After de-dup keys: 38742
Inserted rows: 38742


In [5]:
from sqlalchemy import text
with engine.begin() as connection:
    # Report how many rows the table currently holds.
    row_total = connection.execute(text("SELECT COUNT(*) FROM gb_videos")).scalar()
    print("Row count now:", row_total)

    # Add the indexes after the data is in place: a unique index on the
    # (video_id, trending_date) pair plus two single-column indexes.
    index_sql = text("""
        CREATE UNIQUE INDEX IF NOT EXISTS ux_gb_videos_vid_trend
          ON gb_videos (video_id, trending_date);
        CREATE INDEX IF NOT EXISTS ix_gb_videos_category_id ON gb_videos (category_id);
        CREATE INDEX IF NOT EXISTS ix_gb_videos_publish_time ON gb_videos (publish_time);
    """)
    connection.execute(index_sql)

# Quick peek
import pandas as pd
# Fetch the five rows with the latest publish_time (NULLs sorted last).
with engine.connect() as connection:
    sample = pd.read_sql(sql="""
        SELECT video_id, title, trending_date, views, publish_time
        FROM gb_videos
        ORDER BY publish_time DESC NULLS LAST
        LIMIT 5
    """, con=connection)

# The frame is already in memory, so it can be rendered after the connection closes.
display(sample)


Row count now: 38742


Unnamed: 0,video_id,title,trending_date,views,publish_time
0,r63VBOagGAo,Shawn Mendes x Portugal (FPF Official World Cu...,2018-06-14,653114,2018-06-13 13:11:56+00:00
1,YQJmvXamKYg,Conway: People are bending to the will of Pres...,2018-06-14,99048,2018-06-13 12:56:49+00:00
2,-QPdRfqTnt4,Dumbo Official Teaser Trailer,2018-06-14,4427381,2018-06-13 07:00:00+00:00
3,6h8QgZF5Qu4,Drop the Mic w/ Ashton Kutcher & Sean Diddy Combs,2018-06-14,864189,2018-06-13 05:27:27+00:00
4,arY6lepNdzU,"E3 2018 Exclusive Gameplay Demos, Interviews a...",2018-06-13,349122,2018-06-13 04:09:23+00:00
