In [25]:
import getpass
from urllib.parse import quote_plus

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sqlalchemy import create_engine


# Prompt for the MySQL password without echoing it to the notebook output.
password = getpass.getpass()

# The password is embedded in a URL, so it must be percent-encoded first:
# characters such as '@', ':', '/', '#' or '%' would otherwise be parsed as
# URL delimiters and produce a wrong host/credential split.
connection_string = (
    'mysql+pymysql://root:' + quote_plus(password) + '@localhost:3306/sakila'
)
engine = create_engine(connection_string)

# One row per 2005 rental, carrying the rented film's attributes and a 0/1
# flag telling whether that rental happened in May.
# NOTE: the WHERE clause filters on r.rental_date, which discards the
# NULL-extended rows, so the two LEFT JOINs behave like inner joins here.
query = '''
SELECT 
    f.film_id,
    f.title,
    f.language_id,
    f.length,
    f.rating,
    f.special_features,
    IF(MONTH(r.rental_date) = 5, 1, 0) AS rented_in_may
FROM
    film f
LEFT JOIN
    inventory i ON f.film_id = i.film_id
LEFT JOIN
    rental r ON i.inventory_id = r.inventory_id
WHERE
    YEAR(r.rental_date) = 2005;
'''

df = pd.read_sql(query, engine)

# Quick sanity checks on what came back from the database.
print(df.head())

print("Missing values in the DataFrame:\n", df.isnull().sum())

categorical_columns = ['rating', 'special_features']
for column in categorical_columns:
    print(f"\nUnique values for {column}:\n", df[column].unique())


# One-hot encode the categorical columns. drop_first=True removes one level
# per column so the dummies are not perfectly collinear. Each distinct
# special_features string (a comma-separated combination) is treated as its
# own category.
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

print("\nEncoded DataFrame:\n", df_encoded.head())

# film_id and title identify the film rather than describe it, and
# rented_in_may is the target, so none of them belong in the feature matrix.
X = df_encoded.drop(['film_id', 'title', 'rented_in_may'], axis=1)
y = df_encoded['rented_in_may']

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
# zero_division=0 reports 0.0 for a class that is never predicted, which is
# the same number the default produces, but without emitting an
# UndefinedMetricWarning for every affected metric.
classification_rep = classification_report(y_test, y_pred, zero_division=0)

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_rep)


········
   film_id             title  language_id  length rating  \
0        1  ACADEMY DINOSAUR            1      86     PG   
1        1  ACADEMY DINOSAUR            1      86     PG   
2        1  ACADEMY DINOSAUR            1      86     PG   
3        1  ACADEMY DINOSAUR            1      86     PG   
4        1  ACADEMY DINOSAUR            1      86     PG   

                   special_features  rented_in_may  
0  Deleted Scenes,Behind the Scenes              0  
1  Deleted Scenes,Behind the Scenes              0  
2  Deleted Scenes,Behind the Scenes              0  
3  Deleted Scenes,Behind the Scenes              1  
4  Deleted Scenes,Behind the Scenes              0  
Missing values in the DataFrame:
 film_id             0
title               0
language_id         0
length              0
rating              0
special_features    0
rented_in_may       0
dtype: int64

Unique values for rating:
 ['PG' 'G' 'NC-17' 'PG-13' 'R']

Unique values for special_features:
 ['Deleted Scen

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
