In [144]:
# Mount Google Drive at /content/drive so the files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [145]:
## Carga de librerias
import numpy as np
import pandas as pd
import sqlite3 as sql
import plotly.graph_objs as go ### para gráficos
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

##Forma
from IPython.display import display, Markdown
# Shared colour palette (hex codes) passed to the charts in this notebook.
palette_color=['#d4afb9', '#d1cfe2', '#9cadce', '#7ec4cf', '#52b2cf']

In [149]:
# Connect to the SQLite database file stored on the mounted Drive.
conn = sql.connect('/content/drive/MyDrive/AA/Analitica-en-Marketing/Data/db_movies')
# Cursor used to run raw SQL statements through this connection.
cur=conn.cursor()

In [150]:
# List the names of the tables stored in the database.
# execute() hands back the cursor, so the rows can be fetched in the same expression.
cur.execute("select name from sqlite_master where type='table' ").fetchall()

[('ratings',), ('movies',)]

In [152]:
# Load both database tables into pandas DataFrames.
movies = pd.read_sql(sql="select *  from movies", con=conn)
ratings = pd.read_sql(sql='select * from ratings', con=conn)

In [153]:
def check_df(dataframe):
    """Display a quick quality summary of a DataFrame.

    Shows, in this order and each under a bold Markdown heading: the overall
    shape, the shape after dropping duplicate rows, the column dtypes and the
    number of null values per column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to summarise.

    Returns
    -------
    None
        Everything is rendered through ``display``.
    """
    # Each report is a (heading, producer) pair. Producers are invoked lazily so
    # every value is computed right after its heading has been shown.
    reports = (
        ('**Dimensiones base general**', lambda: dataframe.shape),
        ('**Dimensiones sin duplicados**', lambda: dataframe.drop_duplicates().shape),
        ('**Tipos**', lambda: dataframe.dtypes),
        ('**Nulos**', lambda: dataframe.isnull().sum()),
    )
    for heading, produce in reports:
        display(Markdown(heading))
        display(produce())

# Run the quality summary on both tables (movies first, then ratings).
check_df(movies)
check_df(ratings)

**Dimensiones base general**

(9742, 3)

**Dimensiones sin duplicados**

(9742, 3)

**Tipos**

Unnamed: 0,0
movieId,int64
title,object
genres,object


**Nulos**

Unnamed: 0,0
movieId,0
title,0
genres,0


**Dimensiones base general**

(100836, 4)

**Dimensiones sin duplicados**

(100836, 4)

**Tipos**

Unnamed: 0,0
userId,int64
movieId,int64
rating,float64
timestamp,int64


**Nulos**

Unnamed: 0,0
userId,0
movieId,0
rating,0
timestamp,0


In [154]:
# Show column dtypes and non-null counts for both tables.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called bare: wrapping it in print() appended a stray "None" after each report.
movies.info()
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
None


In [155]:
# Distribution of ratings: number of rows per rating value, most frequent first.
cr = pd.read_sql(
    sql=""" select rating,
                    count(*) as conteo
                    from ratings
                    group by rating
                    order by conteo desc""",
    con=conn,
)
cr

Unnamed: 0,rating,conteo
0,4.0,26818
1,3.0,20047
2,5.0,13211
3,3.5,13136
4,4.5,8551
5,2.0,7551
6,2.5,5550
7,1.0,2811
8,1.5,1791
9,0.5,1370


In [156]:
# Bar chart of how many times each rating value was given, with the count
# printed above every bar.
data = go.Bar(
    x=cr.rating,
    y=cr.conteo,
    text=cr.conteo,
    textposition="outside",
    marker_color=palette_color,
)
Layout = go.Layout(
    title="Count of ratings",
    xaxis={'title': 'Rating'},
    yaxis={'title': 'Count'},
)
go.Figure(data=data, layout=Layout)

In [157]:
# Count how many ratings each user has given, fewest first.
rating_users = pd.read_sql(
    sql=''' select userId,
                                count(*) as cnt_rat
                                from ratings
                                group by userId
                                order by cnt_rat asc
                                ''',
    con=conn,
)

In [158]:
# Histogram of the number of ratings per user.
fig = px.histogram(
    rating_users,
    x='cnt_rat',
    title='Histograma frecuencia de número de calificaciones por usuario',
    color_discrete_sequence=palette_color,
)
fig.show()


In [159]:
# Summary statistics of the per-user rating counts.
rating_users.describe()

Unnamed: 0,userId,cnt_rat
count,610.0,610.0
mean,305.5,165.304918
std,176.236111,269.480584
min,1.0,20.0
25%,153.25,35.0
50%,305.5,70.5
75%,457.75,168.0
max,610.0,2698.0


In [160]:
# Keep only users with at most 500 ratings, i.e. drop the users who rated
# more than 500 movies; result is ordered from fewest to most ratings.
rating_users2 = pd.read_sql(
    sql='''select userId,
                                count(*) as cnt_rat
                                from ratings
                                group by userId
                                having cnt_rat <=500
                                order by cnt_rat asc
                                ''',
    con=conn,
)
rating_users2

Unnamed: 0,userId,cnt_rat
0,53,20
1,147,20
2,189,20
3,194,20
4,207,20
...,...,...
562,509,467
563,368,469
564,381,474
565,57,476


In [161]:
# Summary statistics of the per-user rating counts after the 500-rating cap.
rating_users2.describe()

Unnamed: 0,userId,cnt_rat
count,567.0,567.0
mean,303.340388,107.199295
std,174.739864,105.154761
min,1.0,20.0
25%,152.5,34.0
50%,304.0,63.0
75%,453.5,139.5
max,609.0,500.0


In [162]:
# Histogram of the number of ratings per user, restricted to the filtered users.
fig = px.histogram(
    rating_users2,
    x='cnt_rat',
    title='Histograma frecuencia de número de calificaciones por usuario',
)
fig.show()

In [163]:
# Count how many ratings each movie has received, most rated first.
rating_movie = pd.read_sql(
    sql='''select movieId,
                                count(*) as cnt_rat
                                from ratings
                                group by movieId
                                order by cnt_rat desc
                                ''',
    con=conn,
)

In [164]:
# Histogram of the number of ratings per movie.
fig = px.histogram(
    rating_movie,
    x='cnt_rat',
    title='Histograma frecuencia de número de calificaciones por pelicula',
)
fig.show()

In [165]:
# Summary statistics of the per-movie rating counts.
rating_movie.describe()

Unnamed: 0,movieId,cnt_rat
count,9724.0,9724.0
mean,42245.024373,10.369807
std,52191.13732,22.401005
min,1.0,1.0
25%,3245.5,1.0
50%,7300.0,3.0
75%,76739.25,9.0
max,193609.0,329.0


In [166]:
# Movies that have received at least 10 ratings, most rated first.
rating_movie2 = pd.read_sql(
    sql=''' select movieId,
                                count(*) as cnt_rat
                                from ratings
                                group by movieId
                                having cnt_rat >= 10
                                order by cnt_rat desc
                                ''',
    con=conn,
)

In [167]:
# Histogram of the number of ratings per movie, restricted to the filtered movies.
fig = px.histogram(
    rating_movie2,
    x='cnt_rat',
    title='Histograma frecuencia de número de calificaciones por pelicula',
)
fig.show()

In [168]:
# Summary statistics of the per-movie rating counts after the 10-rating minimum.
rating_movie2.describe()

Unnamed: 0,movieId,cnt_rat
count,2269.0,2269.0
mean,20530.586161,35.749669
std,35185.840333,35.986989
min,1.0,10.0
25%,1345.0,14.0
50%,3256.0,22.0
75%,8958.0,43.0
max,187593.0,329.0


In [174]:
# Open a second SQLite database, db_movies2.db, separate from the original
# db_movies connection. sqlite3 creates the file if it does not exist yet and
# simply connects to it when it does.
# NOTE(review): presumably meant as a working copy for processing, so the
# original data stays untouched — confirm.
conn1 = sql.connect('/content/drive/MyDrive/AA/Analitica-en-Marketing/Data/db_movies2.db')
# Cursor for running SQL against the new database. It must be created from
# conn1: the previous conn.cursor() silently pointed back at the original database.
cur1 = conn1.cursor()

In [175]:
# List the names of the tables visible through `cur`.
# NOTE(review): `cur` belongs to the original db_movies connection; if the goal
# was to inspect db_movies2.db, this should presumably use a cursor created
# from conn1 — confirm.
cur.execute("select name from sqlite_master where type='table' ").fetchall()

[('ratings',), ('movies',)]

In [171]:
# Load the full_ratings table into pandas and display it.
# NOTE(review): the recorded run of this cell failed with
# "no such table: full_ratings". Nothing visible in this notebook creates that
# table, and `conn` points at the original database, which only lists
# `ratings` and `movies`. Presumably a preprocessing step has to create it first
# (possibly in db_movies2.db, read through conn1) — confirm before relying on
# this cell.
df_final = pd.read_sql("select * from full_ratings", conn)
df_final

DatabaseError: Execution failed on sql 'select * from full_ratings': no such table: full_ratings