# Accessing MySQL Database from Python

This notebook demonstrates how to connect to and interact with a MySQL database using Python.

## Prerequisites
- MySQL server installed and running
- Database user credentials
- Python MySQL connector packages

In [None]:
# Install required packages (run this if packages are not installed)
%pip install mysql-connector-python
%pip install pymysql
%pip install sqlalchemy
%pip install pandas
%pip install tabulate
%pip install scikit-learn


In [None]:
import pandas as pd
from sqlalchemy import create_engine, text
from scipy.stats import norm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import warnings

## Method 1: Using mysql-connector-python

In [None]:
import getpass
import os

import mysql.connector
from mysql.connector import Error

# Database connection parameters.
# The password is never stored in the notebook: it is taken from the
# MYSQL_PASSWORD environment variable, or prompted for when that variable is
# not set. The remaining settings can be overridden through the environment
# and otherwise keep their previous defaults.
db_password = os.environ.get('MYSQL_PASSWORD')
if db_password is None:
    db_password = getpass.getpass('MySQL password: ')

config = {
    'host': os.environ.get('MYSQL_HOST', 'localhost'),    # or your MySQL server IP
    'port': int(os.environ.get('MYSQL_PORT', '3306')),    # default MySQL port
    'user': os.environ.get('MYSQL_USER', 'root'),         # your MySQL username
    'password': db_password,
    'database': os.environ.get('MYSQL_DATABASE', 'imdb'), # your database name
}

# Start from a known state so the cleanup below only ever touches handles
# created by this run (at cell level, names from earlier runs would otherwise
# still be visible).
connection = None
cursor = None

try:
    # Create connection
    connection = mysql.connector.connect(**config)

    if connection.is_connected():
        print("Successfully connected to MySQL database")

        # Create cursor
        cursor = connection.cursor()

        # Execute a simple query to prove the round trip works
        cursor.execute("SELECT VERSION()")
        version = cursor.fetchone()
        print(f"MySQL version: {version[0]}")

except Error as e:
    print(f"Error connecting to MySQL: {e}")

finally:
    # Close whatever was opened above
    if cursor is not None:
        cursor.close()
    if connection is not None and connection.is_connected():
        connection.close()
        print("MySQL connection closed")

In [None]:
# Query the region_dt table
# Reset the handles first: at cell level, names left over from a previous cell
# are still defined, so the cleanup must not infer "opened here" from their
# mere existence.
connection = None
cursor = None

try:
    # Create connection using the same config as before
    connection = mysql.connector.connect(**config)

    if connection.is_connected():
        cursor = connection.cursor()

        # Execute SELECT * FROM region_dt
        cursor.execute("SELECT * FROM region_dt")

        # Fetch all results
        results = cursor.fetchall()

        # Column names are the first field of each cursor.description entry
        column_names = [desc[0] for desc in cursor.description]
        print(f"Columns: {column_names}")
        print("-" * 50)

        # Print all rows
        for row in results:
            print(row)

        print(f"\nTotal rows: {len(results)}")

except Error as e:
    print(f"Error querying database: {e}")

finally:
    # Close only what this cell opened
    if cursor is not None:
        cursor.close()
    if connection is not None and connection.is_connected():
        connection.close()
        print("Connection closed")

## Best Directors

## Movie Search

### Z-scores and percentiles of a movie's performance within a genre

In [None]:
def get_movie_percentiles(movie_tconst, genre_name):
    """
    Rank one title's rating and vote count against every title in a genre.

    The query restricts `title_ft` to rows linked to `genre_name` (with a
    non-NULL `numVotes`), computes the genre-wide mean and standard deviation
    of `averageRating` and `numVotes`, and derives for each title a z-score
    and a CUME_DIST() value for both measures. Only the row for
    `movie_tconst` is returned.

    Args:
        movie_tconst: `tconst` identifier of the title to look up.
        genre_name: value matched against `genre_dt.genreName`.

    Returns:
        A pandas DataFrame with the columns primaryTitle, averageRating,
        rating_zscore, rating_percentile, numVotes, votes_zscore and
        votes_percentile when the title is found in the genre; otherwise a
        string describing why no result is available (title not in genre, or
        a database error).
    """
    print(f"'{movie_tconst}' performance statistics in '{genre_name}' genre...")

    percentile_sql = """
    WITH GenreMovies AS (
        SELECT 
            ft.tconst, ft.averageRating, ft.numVotes
        FROM 
            title_ft AS ft
        JOIN 
            title_genre_bridge AS tgb ON ft.tconst = tgb.tconst
        JOIN 
            genre_dt AS g ON tgb.genreID = g.genreID
        WHERE 
            g.genreName = %s AND ft.numVotes IS NOT NULL
    ),
    GenreStats AS (
        SELECT 
            AVG(averageRating) AS rating_mean, STDDEV(averageRating) AS rating_std,
            AVG(numVotes) AS votes_mean, STDDEV(numVotes) AS votes_std
        FROM GenreMovies
    ),
    MovieRankings AS (
        SELECT
            gm.tconst, gm.averageRating, gm.numVotes,
            (gm.averageRating - gs.rating_mean) / gs.rating_std AS rating_zscore,
            (gm.numVotes - gs.votes_mean) / gs.votes_std AS votes_zscore,
            CUME_DIST() OVER (ORDER BY gm.averageRating) AS rating_percentile,
            CUME_DIST() OVER (ORDER BY gm.numVotes) AS votes_percentile
        FROM GenreMovies AS gm, GenreStats AS gs
    )
    SELECT 
        t.primaryTitle, mr.averageRating, mr.rating_zscore,
        mr.rating_percentile, mr.numVotes, mr.votes_zscore,
        mr.votes_percentile
    FROM MovieRankings AS mr
    JOIN title_ft AS t ON mr.tconst = t.tconst
    WHERE mr.tconst = %s;
    """

    try:
        with mysql.connector.connect(**config) as connection:
            with connection.cursor() as cursor:
                # Placeholders are bound in order: genre first, then the title id.
                cursor.execute(percentile_sql, (genre_name, movie_tconst))
                rows = cursor.fetchall()

                if rows:
                    return pd.DataFrame(rows, columns=cursor.column_names)

                return f"Couldn't find {movie_tconst} in the {genre_name} genre."

    except Error as e:
        # Database failures are reported to the caller as text, not raised.
        return f"something went wrong: {e}"


# Smoke test: one title looked up against one genre.
movie_to_check, genre_to_check = 'tt0111161', 'Drama'

print(f"Running test for: {movie_to_check} ('{genre_to_check}')")
results = get_movie_percentiles(movie_to_check, genre_to_check)

# The helper returns a DataFrame on success and a plain message otherwise.
if not isinstance(results, pd.DataFrame):
    print(results)
else:
    print("\nTest Done:")
    print(results.to_markdown(index=False))

In [None]:
print("\nFetching data...")

# One row per movie with more than 5000 votes. Genres and directors are
# flattened into delimited strings by the correlated GROUP_CONCAT subqueries
# so each movie stays on a single row.
recommender_sql_query = """
SELECT 
    t.tconst, 
    t.primaryTitle,
    t.averageRating, 
    t.numVotes, 
    t.runtimeMinutes,
    
    (SELECT GROUP_CONCAT(g.genreName SEPARATOR ' ')
     FROM title_genre_bridge tgb
     JOIN genre_dt g ON tgb.genreID = g.genreID
     WHERE tgb.tconst = t.tconst) AS genres,
     
    (SELECT GROUP_CONCAT(d.nconst SEPARATOR ' ')
     FROM title_director_bridge tdb
     JOIN directors_dt d ON tdb.nconst = d.nconst
     WHERE tdb.tconst = t.tconst) AS director_ids,
     
    (SELECT GROUP_CONCAT(d.primaryName SEPARATOR ', ')
     FROM title_director_bridge tdb
     JOIN directors_dt d ON tdb.nconst = d.nconst
     WHERE tdb.tconst = t.tconst) AS director_names
FROM 
    title_ft AS t
WHERE 
    t.numVotes > 5000 
    AND t.typeID = (SELECT typeID FROM type_dt WHERE titleType = 'movie');
"""

# Only the database round trip sits inside the try block, so the handler
# below reports database errors and nothing else.
try:
    with mysql.connector.connect(**config) as connection:
        print("Connecting...")
        with connection.cursor() as cursor:
            cursor.execute(recommender_sql_query)
            results = cursor.fetchall()
            column_names = cursor.column_names

except Error as e:
    print(f"Failed to load recommender data. Error: {e}")
    # Bare raise re-raises the active exception with its traceback untouched.
    raise

if not results:
    # Stop here: everything downstream needs at least one movie.
    raise RuntimeError("No data returned from query.")

movies_df = pd.DataFrame(results, columns=column_names)

print(f"Data loaded for {len(movies_df)} movies.")

# Replace missing values: empty string for the text features, 0 for the
# numeric ones, so every feature column is fully populated.
fill_defaults = {
    'genres': '',
    'director_ids': '',
    'director_names': '',
    'averageRating': 0,
    'numVotes': 0,
    'runtimeMinutes': 0,
}
for column_name, fill_value in fill_defaults.items():
    movies_df[column_name] = movies_df[column_name].fillna(fill_value)


print("building the recommendation engine...")

# Which columns of movies_df feed which transformer.
numeric_features = ['averageRating', 'numVotes', 'runtimeMinutes']
text_features_genres, text_features_directors = 'genres', 'director_ids'

# One single-step pipeline per feature family: min-max scaling for the
# numeric columns, TF-IDF for the two space-separated text columns.
numeric_transformer = Pipeline([('scaler', MinMaxScaler())])
text_transformer_genres = Pipeline([('tfidf', TfidfVectorizer(stop_words='english'))])
text_transformer_directors = Pipeline([('tfidf', TfidfVectorizer())])

# Apply each transformer to its columns and concatenate the results.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('genres', text_transformer_genres, text_features_genres),
    ('directors', text_transformer_directors, text_features_directors),
])

pipeline = Pipeline([('preprocessor', preprocessor)])

print("Analyzing movie features...")
feature_matrix = pipeline.fit_transform(movies_df)

# Pairwise similarity of every movie against every other movie; row i is
# what get_recommendations reads for the movie at position i.
print("Calculating similarities...")
cosine_sim_matrix = cosine_similarity(feature_matrix, feature_matrix)

print("Recommender finished.")


def get_recommendations(movie_tconst, n=5):
    """
    Gets the top N similar movies for a given movie tconst.

    Similarities are read from row `i` of the module-level
    `cosine_sim_matrix`, where `i` is the position of the movie in the
    module-level `movies_df`.

    Args:
        movie_tconst: `tconst` of the movie to find neighbours for.
        n: maximum number of recommendations. Values <= 0 give an empty frame.

    Returns:
        A DataFrame with the columns primaryTitle, tconst, genres and
        director_names, ordered from most to least similar and never
        containing the query movie itself. A string message is returned
        instead when the movie is unknown or the recommender data has not
        been built yet.
    """
    try:
        catalog = movies_df
        similarity = cosine_sim_matrix
    except NameError:
        return "the movie data isn't loaded. Please re-run the cell."

    # Locate the movie by position, not by index label: both the similarity
    # matrix and .iloc below are positional.
    positions = np.flatnonzero((catalog['tconst'] == movie_tconst).to_numpy())
    if positions.size == 0:
        return f"Movie '{movie_tconst}' not found in the dataset."
    idx = int(positions[0])

    scores = np.asarray(similarity[idx]).ravel()

    # Stable descending sort keeps the original row order among ties.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query movie explicitly. Assuming it is the first entry is
    # wrong whenever another row ties with (or exceeds) its self-similarity.
    ranked = ranked[ranked != idx]
    top_movie_indices = ranked[:max(int(n), 0)]

    return catalog.iloc[top_movie_indices][['primaryTitle', 'tconst', 'genres', 'director_names']]


movie_to_get_recs_for = 'tt0468569'

print(f"\nRecommendations for {movie_to_get_recs_for}")
try:
    recommendations = get_recommendations(movie_to_get_recs_for, n=5)
    if isinstance(recommendations, pd.DataFrame):
        print(recommendations.to_markdown(index=False))
    else:
        # get_recommendations returns a plain message (not a DataFrame) when
        # the movie is unknown or the data is not loaded; show it as-is
        # instead of failing on .to_markdown.
        print(recommendations)
except Exception as e:
    print(f"something went wrong: {e}")