In [None]:
# Install the notebook's dependencies with the %pip line magic: upgrade pip
# itself first, then install kagglehub (dataset download) and pandas (CSV handling).
# NOTE(review): no versions are pinned, so a re-run may pick up different releases.
%pip install --upgrade pip
%pip install kagglehub
%pip install pandas

In [None]:
import kagglehub

# Download the "justsahil/movielens-32m" dataset through kagglehub and keep the
# value returned by dataset_download in `path`.
# NOTE(review): the loading cells read from a hard-coded /home/bkalejaiye/ml-32m
# directory rather than from `path` — confirm the two locations are the same.
path = kagglehub.dataset_download("justsahil/movielens-32m")

# Report where the downloaded files ended up.
print("Path to dataset files:", path)

In [5]:
import pandas as pd

# Read the four raw MovieLens tables into DataFrames (the reads are independent).
links = pd.read_csv("/home/bkalejaiye/ml-32m/links.csv")
tags = pd.read_csv("/home/bkalejaiye/ml-32m/tags.csv")
ratings = pd.read_csv("/home/bkalejaiye/ml-32m/ratings.csv")
movies = pd.read_csv("/home/bkalejaiye/ml-32m/movies.csv")

# Report (rows, columns) and the column index of each table, one line per table.
for label, frame in (
    ("Movies:", movies),
    ("Ratings:", ratings),
    ("Tags:", tags),
    ("Links:", links),
):
    print(label, frame.shape, frame.columns)

Movies: (87585, 3) Index(['movieId', 'title', 'genres'], dtype='object')
Ratings: (32000204, 4) Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')
Tags: (2000072, 4) Index(['userId', 'movieId', 'tag', 'timestamp'], dtype='object')
Links: (87585, 3) Index(['movieId', 'imdbId', 'tmdbId'], dtype='object')


In [None]:
# Print the leading rows of every loaded table: movies, ratings, tags, links.
for frame in (movies, ratings, tags, links):
    print(frame.head())

In [None]:
import pandas as pd
import os

# Paths for input and output datasets
INPUT_PATH = "/home/bkalejaiye/ml-32m/"
OUTPUT_PATH = "/home/bkalejaiye/csv/"

# Validate the inputs BEFORE loading anything, so a missing file produces this
# explicit message instead of whatever error pd.read_csv would raise first.
for file in ["movies.csv", "ratings.csv", "tags.csv", "links.csv"]:
    if not os.path.exists(os.path.join(INPUT_PATH, file)):
        raise FileNotFoundError(f"{file} not found in {INPUT_PATH}")

# Load the datasets
movies = pd.read_csv(os.path.join(INPUT_PATH, "movies.csv"))
ratings = pd.read_csv(os.path.join(INPUT_PATH, "ratings.csv"))
tags = pd.read_csv(os.path.join(INPUT_PATH, "tags.csv"))
links = pd.read_csv(os.path.join(INPUT_PATH, "links.csv"))

# Rename camelCase columns to snake_case. Columns that are not listed keep their
# names; rename() returns a new frame, so re-running this cell is harmless.
movies = movies.rename(columns={"movieId": "movie_id"})
ratings = ratings.rename(columns={"userId": "user_id", "movieId": "movie_id"})
tags = tags.rename(columns={"userId": "user_id", "movieId": "movie_id"})
links = links.rename(columns={"movieId": "movie_id", "imdbId": "imdb_id", "tmdbId": "tmdb_id"})

# If tmdb_id has gaps, pandas loads it as float64 and to_csv would write ids as
# "862.0", which an INTEGER column cannot ingest. The nullable Int64 dtype
# writes whole numbers and leaves missing ids as empty fields.
links["tmdb_id"] = links["tmdb_id"].astype("Int64")

# Convert the timestamp columns from UNIX seconds to datetimes
ratings["timestamp"] = pd.to_datetime(ratings["timestamp"], unit="s")
tags["timestamp"] = pd.to_datetime(tags["timestamp"], unit="s")

# Preview the datasets after renaming and conversion
print("Movies Preview:\n", movies.head())
print("Ratings Preview:\n", ratings.head())
print("Tags Preview:\n", tags.head())
print("Links Preview:\n", links.head())

# Save the cleaned datasets
os.makedirs(OUTPUT_PATH, exist_ok=True)  # Ensure the output directory exists
movies.to_csv(os.path.join(OUTPUT_PATH, "movies_clean.csv"), index=False)
ratings.to_csv(os.path.join(OUTPUT_PATH, "ratings_clean.csv"), index=False)
tags.to_csv(os.path.join(OUTPUT_PATH, "tags_clean.csv"), index=False)
links.to_csv(os.path.join(OUTPUT_PATH, "links_clean.csv"), index=False)

print("Datasets have been cleaned and saved to:", OUTPUT_PATH)


In [None]:
import pandas as pd

# Re-load the cleaned CSV exports from disk (the reads are independent).
links = pd.read_csv("/home/bkalejaiye/csv/links_clean.csv")
tags = pd.read_csv("/home/bkalejaiye/csv/tags_clean.csv")
ratings = pd.read_csv("/home/bkalejaiye/csv/ratings_clean.csv")
movies = pd.read_csv("/home/bkalejaiye/csv/movies_clean.csv")

# Report dimensions and column index per table; each label ends in a newline so
# the details start on the following line.
for label, frame in (
    ("Movies:\n", movies),
    ("Ratings:\n", ratings),
    ("Tags:\n", tags),
    ("Links:\n", links),
):
    print(label, frame.shape, frame.columns)

In [None]:
# Print the first rows of the `movies` frame (DataFrame.head) under a label.
print("Movies Preview:\n", movies.head())

In [None]:
import pandas as pd
import os

# Paths for input and output.
# INPUT_DIR is the directory the cleaning step writes its *_clean.csv files to
# (/home/bkalejaiye/csv/), so the notebook can run top to bottom.
INPUT_DIR = "/home/bkalejaiye/csv"  # Directory containing CSV files
OUTPUT_DIR = "/home/bkalejaiye/sql-files"  # Directory to save SQL files

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Define table names and corresponding CSV files
csv_files = {
    "movies": "movies_clean.csv",
    "ratings": "ratings_clean.csv",
    "tags": "tags_clean.csv",
    "links": "links_clean.csv"
}

# Write one "<table>.sql" file per CSV containing one INSERT statement per row.
for table_name, csv_file in csv_files.items():
    csv_path = os.path.join(INPUT_DIR, csv_file)
    # convert_dtypes() switches to nullable dtypes, so a whole-number id column
    # with gaps becomes Int64 (values print as 862, not 862.0).
    data = pd.read_csv(csv_path).convert_dtypes()

    # The quoted column list is identical for every row: build it once.
    columns = ", ".join(f'"{col}"' for col in data.columns)

    sql_file_path = os.path.join(OUTPUT_DIR, f"{table_name}.sql")
    # Explicit UTF-8 so non-ASCII text does not depend on the locale encoding.
    with open(sql_file_path, "w", encoding="utf-8") as sql_file:
        # itertuples() keeps each column's own dtype; iterrows() would upcast an
        # all-numeric row to float and turn integer ids into '1.0'.
        for row in data.itertuples(index=False, name=None):
            # Missing values become SQL NULL; everything else is a quoted
            # literal with embedded single quotes doubled.
            values = ", ".join(
                "NULL" if pd.isna(value) else "'" + str(value).replace("'", "''") + "'"
                for value in row
            )
            sql_file.write(f"INSERT INTO {table_name} ({columns}) VALUES ({values});\n")

    print(f"SQL file for {table_name} saved at {sql_file_path}")


In [None]:
import pandas as pd
import os

# Paths for input and output.
# INPUT_DIR is the directory the cleaning step writes its *_clean.csv files to
# (/home/bkalejaiye/csv/), so the notebook can run top to bottom.
INPUT_DIR = "/home/bkalejaiye/csv"  # Directory containing CSV files
OUTPUT_DIR = "/home/bkalejaiye/sql-files"  # Directory to save SQL files

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Define table names and corresponding CSV files
csv_files = {
    "movies": "movies_clean.csv",
    "ratings": "ratings_clean.csv",
    "tags": "tags_clean.csv",
    "links": "links_clean.csv"
}

# Define SQL data types mapping
def infer_sql_type(dtype):
    """Return the SQL column type name to use for a pandas/numpy dtype.

    Integer dtypes map to "INTEGER", float dtypes to "NUMERIC" and
    datetime64 dtypes to "TIMESTAMP". Object columns, and any dtype not
    recognised by those checks, map to "TEXT".
    """
    # Checked in order; the first predicate that accepts the dtype wins.
    type_checks = (
        (pd.api.types.is_integer_dtype, "INTEGER"),
        (pd.api.types.is_float_dtype, "NUMERIC"),
        (pd.api.types.is_datetime64_any_dtype, "TIMESTAMP"),
    )
    for accepts, sql_type in type_checks:
        if accepts(dtype):
            return sql_type
    return "TEXT"

# Write one "<table>_create.sql" file per CSV, with column types inferred from
# the dtypes pandas assigns to the loaded data.
for table_name, csv_file in csv_files.items():
    csv_path = os.path.join(INPUT_DIR, csv_file)

    # The cleaned CSVs store timestamps as text, so without parse_dates the
    # column loads as object and is typed TEXT instead of TIMESTAMP. Peek at
    # the header first because parse_dates rejects column names that are absent.
    header = pd.read_csv(csv_path, nrows=0).columns
    date_columns = [col for col in header if col == "timestamp"]
    # convert_dtypes() switches to nullable dtypes, so a whole-number id column
    # with gaps is seen as integer (INTEGER) rather than float (NUMERIC).
    data = pd.read_csv(csv_path, parse_dates=date_columns or False).convert_dtypes()

    # One line per CSV column, each followed by a comma because the audit
    # columns below always come after them.
    column_lines = [
        f'  "{col}" {infer_sql_type(data[col].dtype)},\n' for col in data.columns
    ]

    sql_file_path = os.path.join(OUTPUT_DIR, f"{table_name}_create.sql")
    with open(sql_file_path, "w", encoding="utf-8") as sql_file:
        sql_file.write(f"CREATE TABLE {table_name} (\n")
        sql_file.writelines(column_lines)
        sql_file.write("  created_at TIMESTAMP DEFAULT NOW(),\n")
        sql_file.write("  updated_at TIMESTAMP DEFAULT NOW()\n")
        sql_file.write(");\n")

    # Report only after the file has been closed.
    print(f"CREATE TABLE SQL for {table_name} saved at {sql_file_path}")


In [None]:
import pandas as pd
import os

# Paths for input and output.
# INPUT_DIR is the directory the cleaning step writes its *_clean.csv files to
# (/home/bkalejaiye/csv/), so the notebook can run top to bottom.
INPUT_DIR = "/home/bkalejaiye/csv"  # Directory containing CSV files
OUTPUT_DIR = "/home/bkalejaiye/sql-files"  # Directory to save SQL files

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Define table names, corresponding CSV files, and key relationships.
# "primary_key" is optional. movies needs one because a FOREIGN KEY can only
# reference columns that carry a PRIMARY KEY or UNIQUE constraint.
csv_files = {
    "movies": {
        "file": "movies_clean.csv",
        "columns": ["movie_id", "title", "genres"],
        "primary_key": ["movie_id"],
        "foreign_keys": []
    },
    "ratings": {
        "file": "ratings_clean.csv",
        "columns": ["user_id", "movie_id", "rating", "timestamp"],
        "foreign_keys": ["FOREIGN KEY (movie_id) REFERENCES movies(movie_id)"]
    },
    "tags": {
        "file": "tags_clean.csv",
        "columns": ["user_id", "movie_id", "tag", "timestamp"],
        "foreign_keys": ["FOREIGN KEY (movie_id) REFERENCES movies(movie_id)"]
    },
    "links": {
        "file": "links_clean.csv",
        "columns": ["movie_id", "imdb_id", "tmdb_id"],
        "foreign_keys": ["FOREIGN KEY (movie_id) REFERENCES movies(movie_id)"]
    }
}

# For every table write "<table>_create.sql" (DDL) and "<table>_insert.sql"
# (one INSERT statement per CSV row).
for table_name, table_config in csv_files.items():
    csv_path = os.path.join(INPUT_DIR, table_config["file"])
    # Select exactly the configured columns, in configured order, so each value
    # lines up with the column list of its INSERT statement. convert_dtypes()
    # makes id columns with gaps nullable Int64 (values print as 862, not 862.0).
    data = pd.read_csv(csv_path)[table_config["columns"]].convert_dtypes()

    # --- CREATE TABLE -------------------------------------------------------
    column_definitions = []
    for column in table_config["columns"]:
        if column.endswith("_id"):
            # Suffix test rather than substring: any name that merely contains
            # "id" (e.g. "video") should not become INTEGER.
            sql_type = "INTEGER"
        elif "timestamp" in column:
            sql_type = "TIMESTAMP"
        elif column == "rating":
            # Ratings are numbers; NUMERIC keeps them usable in arithmetic.
            sql_type = "NUMERIC"
        else:
            sql_type = "TEXT"
        column_definitions.append(f'"{column}" {sql_type}')

    # Table-level constraints follow the column definitions.
    primary_key = table_config.get("primary_key", [])
    if primary_key:
        column_definitions.append(f"PRIMARY KEY ({', '.join(primary_key)})")
    column_definitions.extend(table_config.get("foreign_keys", []))

    create_table_sql = f"""
        CREATE TABLE {table_name} (
            {', '.join(column_definitions)}
        );
        """

    sql_file_path = os.path.join(OUTPUT_DIR, f"{table_name}_create.sql")
    with open(sql_file_path, "w", encoding="utf-8") as sql_file:
        sql_file.write(create_table_sql)

    print(f"Create Table SQL file for {table_name} saved at {sql_file_path}")

    # --- INSERT statements --------------------------------------------------
    # The quoted column list is identical for every row: build it once.
    columns = ", ".join(f'"{col}"' for col in table_config["columns"])

    insert_sql_file_path = os.path.join(OUTPUT_DIR, f"{table_name}_insert.sql")
    # Explicit UTF-8 so non-ASCII text does not depend on the locale encoding.
    with open(insert_sql_file_path, "w", encoding="utf-8") as sql_file:
        # itertuples() keeps each column's own dtype; iterrows() would upcast an
        # all-numeric row to float and turn integer ids into '1.0'.
        for row in data.itertuples(index=False, name=None):
            # Missing values become SQL NULL; everything else is a quoted
            # literal with embedded single quotes doubled.
            values = ", ".join(
                "NULL" if pd.isna(value) else "'" + str(value).replace("'", "''") + "'"
                for value in row
            )
            sql_file.write(f"INSERT INTO {table_name} ({columns}) VALUES ({values});\n")

    print(f"Insert SQL file for {table_name} saved at {insert_sql_file_path}")


In [None]:
import pandas as pd
import os

# Paths for input and output
INPUT_DIR = "/home/bkalejaiye/csv"  # Directory containing CSV files
OUTPUT_DIR = "/home/bkalejaiye/sql-files"  # Directory to save SQL files

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Define table names, corresponding CSV files, and key relationships.
# "primary_key" is optional. movies needs one because a FOREIGN KEY can only
# reference columns that carry a PRIMARY KEY or UNIQUE constraint.
csv_files = {
    "movies": {
        "file": "movies_clean.csv",
        "columns": ["movie_id", "title", "genres"],
        "primary_key": ["movie_id"],
        "foreign_keys": []
    },
    "ratings": {
        "file": "ratings_clean.csv",
        "columns": ["user_id", "movie_id", "rating", "timestamp"],
        "foreign_keys": ["FOREIGN KEY (movie_id) REFERENCES movies(movie_id)"]
    },
    "tags": {
        "file": "tags_clean.csv",
        "columns": ["user_id", "movie_id", "tag", "timestamp"],
        "foreign_keys": ["FOREIGN KEY (movie_id) REFERENCES movies(movie_id)"]
    },
    "links": {
        "file": "links_clean.csv",
        "columns": ["movie_id", "imdb_id", "tmdb_id"],
        "foreign_keys": ["FOREIGN KEY (movie_id) REFERENCES movies(movie_id)"]
    }
}

# For every table write "<table>_create.sql" (DDL) and "<table>_copy.sql"
# (a COPY statement that bulk-loads the CSV). The CSV itself is not read here.
for table_name, table_config in csv_files.items():
    # Location of the CSV that the COPY statement will point at.
    csv_path = os.path.join(INPUT_DIR, table_config["file"])

    # --- CREATE TABLE -------------------------------------------------------
    column_definitions = []
    for column in table_config["columns"]:
        if column.endswith("_id"):
            # Suffix test rather than substring: any name that merely contains
            # "id" (e.g. "video") should not become INTEGER.
            sql_type = "INTEGER"
        elif "timestamp" in column:
            sql_type = "TIMESTAMP"
        elif column == "rating":
            # Ratings are numbers; NUMERIC keeps them usable in arithmetic.
            sql_type = "NUMERIC"
        else:
            sql_type = "TEXT"
        column_definitions.append(f'"{column}" {sql_type}')

    # Table-level constraints follow the column definitions.
    primary_key = table_config.get("primary_key", [])
    if primary_key:
        column_definitions.append(f"PRIMARY KEY ({', '.join(primary_key)})")
    column_definitions.extend(table_config.get("foreign_keys", []))

    create_table_sql = f"""
        CREATE TABLE {table_name} (
            {', '.join(column_definitions)}
        );
        """

    sql_file_path = os.path.join(OUTPUT_DIR, f"{table_name}_create.sql")
    with open(sql_file_path, "w", encoding="utf-8") as sql_file:
        sql_file.write(create_table_sql)

    print(f"Create Table SQL file for {table_name} saved at {sql_file_path}")

    # --- COPY ---------------------------------------------------------------
    # Quote the identifiers exactly as the CREATE TABLE statement does.
    quoted_columns = ", ".join(f'"{column}"' for column in table_config["columns"])
    # NOTE(review): COPY ... FROM '<path>' presumably runs on the database
    # server, so the file must be readable there — confirm, or use \copy.
    copy_statement = f"""
        COPY {table_name} ({quoted_columns})
        FROM '{csv_path}'
        DELIMITER ','
        CSV HEADER;
        """

    copy_sql_file_path = os.path.join(OUTPUT_DIR, f"{table_name}_copy.sql")
    with open(copy_sql_file_path, "w", encoding="utf-8") as sql_file:
        sql_file.write(copy_statement)

    print(f"COPY SQL file for {table_name} saved at {copy_sql_file_path}")
