## Lab instructions

In order to optimize our inventory, we would like to know which films will be rented next month and we are asked to create a model to predict it.

In [1]:
import getpass  # to get the password without showing the input
from urllib.parse import quote_plus

import pandas as pd
import pymysql
from sqlalchemy import create_engine
from sqlalchemy import text
# Ask for the MySQL password interactively, without echoing what is typed.
password = getpass.getpass(prompt='Password: ')

········


In [2]:
# URL-encode the password before embedding it in the connection URL:
# characters such as '@', ':' or '/' would otherwise be read as URL
# delimiters and corrupt the connection string.
encoded_password = quote_plus(password)

connection_string = 'mysql+pymysql://root:' + encoded_password + '@localhost/sakila'
connection_string_2 = 'mysql+pymysql://root:' + encoded_password + '@localhost/demo_db'

# One engine per database: `engine` targets sakila, `engine_2` targets demo_db.
engine = create_engine(connection_string)
engine_2 = create_engine(connection_string_2)

### 1- Create an SQL query or queries to extract the information you think may be relevant for building the prediction model. It should include some film features and some rental features

### 2- Read the data into a Pandas dataframe

1 - Create a view (`rented_films`) with all the features that could help us make our predictions

In [40]:
# Definition of the `rented_films` view: one row per rental, joined through
# `inventory` to its film, keeping the film features rating, rental_rate and
# rental_duration.
query = '''
       create or replace view rented_films as
  SELECT rental.rental_id, film.film_id, rating, rental_rate, rental_duration, inventory.inventory_id
        FROM film
        JOIN inventory
        ON film.film_id = inventory.film_id
        JOIN rental
        ON inventory.inventory_id = rental.inventory_id
        ORDER BY rental_id;
        '''

# Run the DDL so the view exists before later cells select from it.
# Previously the statement was only stored in `query` and never executed,
# so the notebook depended on the view already being present in the database.
with engine.begin() as connection:
    connection.execute(text(query))



In [50]:
# Load every row of the `rented_films` view into a DataFrame and display it.
data = pd.read_sql_query(
    sql='SELECT * FROM rented_films',
    con=engine,
)
data

Unnamed: 0,rental_id,film_id,rating,rental_rate,rental_duration,inventory_id
0,1,80,G,2.99,7,367
1,2,333,R,2.99,7,1525
2,3,373,G,2.99,7,1711
3,4,535,R,0.99,6,2452
4,5,450,NC-17,2.99,5,2079
...,...,...,...,...,...,...
16039,16045,168,R,0.99,5,772
16040,16046,951,PG-13,0.99,6,4364
16041,16047,452,R,0.99,4,2088
16042,16048,439,PG-13,4.99,4,2019


2 - Counting, grouping and ordering some features 

In [42]:
# Attach to every rental row the number of rentals of its film
# (window count partitioned by film_id), most-rented films first.
query = """SELECT *,
COUNT(*) OVER (PARTITION BY film_id) topfilm
FROM rented_films
ORDER BY topfilm DESC;"""

df = pd.read_sql_query(
    sql=query,
    con=engine,
)
df

Unnamed: 0,rental_id,film_id,rating,rental_rate,rental_duration,inventory_id,topfilm
0,6193,103,PG,4.99,7,465,34
1,8174,103,PG,4.99,7,465,34
2,14198,103,PG,4.99,7,465,34
3,83,103,PG,4.99,7,466,34
4,2087,103,PG,4.99,7,466,34
...,...,...,...,...,...,...,...
16039,14824,904,R,4.99,3,4162,4
16040,4829,584,PG-13,2.99,6,2661,4
16041,7054,584,PG-13,2.99,6,2661,4
16042,14625,584,PG-13,2.99,6,2661,4


In [43]:
# Number of rentals per film rating.
# The statement now ends with a single semicolon; the stray second one
# formed an empty extra statement.
query = """SELECT rating, count(rating)
FROM rented_films
GROUP BY rating;"""

df1 = pd.read_sql_query(query, engine)
df1

Unnamed: 0,rating,count(rating)
0,PG,3212
1,G,2773
2,NC-17,3293
3,PG-13,3585
4,R,3181


In [44]:
# Number of rentals per rental_rate value.
query = """SELECT rental_rate, count(rental_rate)
FROM rented_films
GROUP BY rental_rate;"""

df2 = pd.read_sql_query(
    sql=query,
    con=engine,
)
df2

Unnamed: 0,rental_rate,count(rental_rate)
0,0.99,5652
1,4.99,5272
2,2.99,5120


In [46]:
# Number of rentals per rental_duration value.
query = """SELECT rental_duration, count(rental_duration)
FROM rented_films
GROUP BY rental_duration;"""

df3 = pd.read_sql_query(
    sql=query,
    con=engine,
)
df3

Unnamed: 0,rental_duration,count(rental_duration)
0,6,3392
1,3,3412
2,7,2824
3,5,3165
4,4,3251


### 3- Analyze extracted features and transform them. You may need to encode some categorical variables, or scale numerical variables

In [47]:
# Column dtypes and non-null counts of the extracted rental data.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16044 entries, 0 to 16043
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   rental_id        16044 non-null  int64  
 1   film_id          16044 non-null  int64  
 2   rating           16044 non-null  object 
 3   rental_rate      16044 non-null  float64
 4   rental_duration  16044 non-null  int64  
 5   inventory_id     16044 non-null  int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 752.2+ KB


In [49]:
# Summary statistics of the numeric columns.
# NOTE(review): the saved output lists a `topfilm` column and no `rental_id`,
# which does not match the columns reported for `data` by data.info();
# the output looks stale - re-run this cell on a fresh kernel to confirm.
data.describe()

Unnamed: 0,film_id,rental_rate,rental_duration,inventory_id,topfilm
count,16044.0,16044.0,16044.0,16044.0,16044.0
mean,501.108888,2.94263,4.93549,2291.842558,19.406008
std,288.513529,1.649678,1.40169,1322.210643,6.394232
min,1.0,0.99,3.0,1.0,4.0
25%,255.0,0.99,4.0,1154.0,15.0
50%,496.0,2.99,5.0,2291.0,20.0
75%,753.0,4.99,6.0,3433.0,24.0
max,1000.0,4.99,7.0,4581.0,34.0


In [51]:
# Count of missing values per column.
data.isna().sum()

rental_id          0
film_id            0
rating             0
rental_rate        0
rental_duration    0
inventory_id       0
dtype: int64

In [None]:
# encoding the categorical variables

In [52]:
def categorical_information (df):
    """Print a short summary of every object-dtype column of ``df``.

    For each such column, prints the number of distinct values followed by
    the frequency of each value. Returns None.
    """
    object_columns = df.select_dtypes('object').columns
    for name in object_columns:
        column = df[name]
        print(column.nunique(), '\n')
        print(column.value_counts(), '\n')

In [53]:
# Summarise the categorical (object-dtype) columns of the rental data.
categorical_information(df=data)

5 

PG-13    3585
NC-17    3293
PG       3212
R        3181
G        2773
Name: rating, dtype: int64 



In [54]:
# One-hot encode the categorical columns of `data`; the saved output shows
# `rating` expanded into rating_* indicator columns with the numeric columns kept.
X = pd.get_dummies(data=data)
X

Unnamed: 0,rental_id,film_id,rental_rate,rental_duration,inventory_id,rating_G,rating_NC-17,rating_PG,rating_PG-13,rating_R
0,1,80,2.99,7,367,1,0,0,0,0
1,2,333,2.99,7,1525,0,0,0,0,1
2,3,373,2.99,7,1711,1,0,0,0,0
3,4,535,0.99,6,2452,0,0,0,0,1
4,5,450,2.99,5,2079,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
16039,16045,168,0.99,5,772,0,0,0,0,1
16040,16046,951,0.99,6,4364,0,0,0,1,0
16041,16047,452,0.99,4,2088,0,0,0,0,1
16042,16048,439,4.99,4,2019,0,0,0,1,0


### 4- Create a query to get the list of films and a boolean indicating if it was rented last month. This would be our target variable

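**TODO:** I don't know how to solve this question yet. Until this target query is written, `y` is undefined, so the model cells in sections 5 and 6 cannot run.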

### 5- Create a logistic regression model to predict this variable from the cleaned data

In [None]:
 # Hold out 20% of the rows for evaluation, with a fixed seed for reproducibility.
 # NOTE(review): `y` is not defined in any cell shown (the target query of
 # question 4 is unanswered) and no import of train_test_split is visible -
 # confirm both before running this cell.
 X_train, X_test, y_train, y_test = train_test_split(
     X,
     y,
     test_size=0.2,
     random_state=42,
 )

In [None]:
def logistic_regression_model(X_train, X_test, y_train, y_test):
    """Fit a logistic regression, print its accuracy and plot its confusion matrix.

    Parameters
    ----------
    X_train, y_train
        Features and labels used to fit the classifier.
    X_test, y_test
        Held-out features and labels used to evaluate it.

    Returns
    -------
    None
        The accuracy and the raw confusion matrix are printed, and an
        annotated heatmap of the confusion matrix is drawn.

    Notes
    -----
    Only a binary target is supported: the cell annotations are reshaped
    to 2x2.

    NOTE(review): relies on LogisticRegression, confusion_matrix, np and sns
    being available in the notebook namespace; no import for them is visible
    in this notebook - confirm they are imported before this cell runs.
    """
    classification = LogisticRegression(random_state=42, max_iter=10000)
    classification.fit(X_train, y_train)

    # Evaluate on the held-out split.
    score = classification.score(X_test, y_test)
    print('The accuracy score is: ', score, '\n')

    predictions = classification.predict(X_test)
    cf_matrix = confusion_matrix(y_test, predictions)

    # confusion_matrix puts the true class on rows and the predicted class on
    # columns, so flatten() yields: true negative, false positive,
    # false negative, true positive. A false positive is a wrong "YES" and a
    # false negative is a wrong "NO", hence the label order below.
    # Assumes the "NO" class sorts before the "YES" class (e.g. 0/1 or
    # False/True) - TODO confirm once the target is defined.
    group_names = ['True NO', 'False YES',
                   'False NO', 'True YES']

    group_counts = ["{0:0.0f}".format(value) for value in cf_matrix.flatten()]
    group_percentages = ["{0:.2%}".format(value) for value in cf_matrix.flatten()/np.sum(cf_matrix)]
    labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
    labels = np.asarray(labels).reshape(2,2)
    sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='Blues')
    print (cf_matrix)

### 6- Evaluate the results (calculate the score of the model)

In [None]:
# Train the classifier on the training split and report its score on the test split.
logistic_regression_model(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)